import numpy as np
# import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import tree
import pydotplus

# Fit an unconstrained decision tree on the complete iris data set
# (every sample is used for training; nothing is held out in this cell).
iris = load_iris()
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)

# Export the fitted tree in Graphviz DOT format to "temptree.dot", labelling
# the nodes with the iris feature names and class names.
dot_data = tree.export_graphviz(
    clf,
    out_file="temptree.dot",
    class_names=iris.target_names,
    feature_names=iris.feature_names,
    special_characters=True,
    rounded=True,
    filled=True,
)


# Load the DOT file that export_graphviz wrote as "temptree.dot".
# The path deliberately has no Windows-only ".\" prefix: on POSIX systems a
# backslash is an ordinary file-name character, so r'.\temptree.dot' would
# name a different (non-existent) file there.
dot_file = "temptree.dot"
graph = pydotplus.graph_from_dot_file(dot_file)


%matplotlib inline
from IPython.display import Image
# Alternative: write the rendered tree to a PDF file instead of showing it.
# graph.write_pdf("iris1.pdf")


# Render the graph to PNG bytes and hand them to IPython's Image so the tree
# is displayed as this cell's result.
Image(data=graph.create_png())


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report


from sklearn.model_selection import train_test_split
# Hold out 33% of the iris samples for testing; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.33,
    random_state=10,
)
# One shape per line: training matrix first, then the test matrix.
print(X_train.shape, X_test.shape, sep="\n")

(100, 4)
(50, 4)


from sklearn import preprocessing
# Standardise each training feature column.
X_train_scaled = preprocessing.scale(X_train)

# Sanity check: per-column means should be ~0 and standard deviations 1.
print(np.mean(X_train_scaled, axis=0))
print(np.std(X_train_scaled, axis=0))

[-1.98285832e-15  1.39013800e-15 -3.99680289e-17  1.02140518e-16]
[1. 1. 1. 1.]


# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Printed in order: train mean, train std, test mean, test std. The test
# columns are only approximately standardised because they are scaled with
# the statistics of the training split.
print(
    X_train_scaled.mean(axis=0),
    X_train_scaled.std(axis=0),
    X_test_scaled.mean(axis=0),
    X_test_scaled.std(axis=0),
    sep="\n",
)

[-1.98285832e-15  1.39013800e-15 -3.99680289e-17  1.02140518e-16]
[1. 1. 1. 1.]
[ 0.04139649 -0.10980216  0.1107187   0.1443679 ]
[1.01388863 0.96108477 0.96033748 0.98774695]


from sklearn.pipeline import Pipeline
# Chain standardisation ("scal") and a Gini-criterion decision tree ("clf").
# The step names become the prefixes of nested parameters such as
# "clf__max_depth".
pipeline = Pipeline(
    steps=[
        ("scal", preprocessing.StandardScaler()),
        ("clf", tree.DecisionTreeClassifier(criterion="gini")),
    ]
)


# List every parameter of the pipeline, including the nested
# "<step>__<parameter>" entries of its steps.
pipeline.get_params(deep=True)

{'memory': None,
 'steps': [('scal', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('clf',
   DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, presort=False, random_state=None,
               splitter='best'))],
 'scal': StandardScaler(copy=True, with_mean=True, with_std=True),
 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'scal__copy': True,
 'scal__with_mean': True,
 'scal__with_std': True,
 'clf__class_weight': None,
 'clf__criterion': 'gini',
 'clf__max_depth': None,
 'clf__max_features': None,
 'clf__max_leaf_nodes': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_impurity_split': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__presort': False,
 'clf__random_state': None,
 'clf__splitter': 'best'}


# IPython "??" lookup. Run at this point it reports that the object is not
# found, because GridSearchCV is only imported in a later cell.
GridSearchCV??

Object `GridSearchCV` not found.


from sklearn.model_selection import GridSearchCV
# Candidate settings for the tree; the "clf__" prefix routes each entry to
# the pipeline step named "clf".
hyperparameters = dict(
    clf__max_depth=(2, 3, 4, 5),
    clf__min_samples_split=(2, 3, 5),
    clf__min_samples_leaf=(1, 2, 3),
)

# Exhaustive search over all 4 * 3 * 3 = 36 combinations with 10-fold
# cross-validation, in a single process and with progress messages.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    cv=10,
    n_jobs=1,
    verbose=1,
)
clf.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    1.2s finished

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scal', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__max_depth': (2, 3, 4, 5), 'clf__min_samples_split': (2, 3, 5), 'clf__min_samples_leaf': (1, 2, 3)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)


# Show the hyperparameter combination selected by the grid search.
clf.best_params_

{'clf__max_depth': 4, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}


# Report the best cross-validation score (label "最佳效果"), then the
# parameters that achieved it (label "最优参数").
print('最佳效果：%0.3f' % clf.best_score_)
print('最优参数', clf.best_params_, sep='\n')

# Predict the held-out test samples with the fitted grid search.
predictions = clf.predict(X_test)

最佳效果：0.970
最优参数
{'clf__max_depth': 4, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}


# Per-class precision / recall / F1 on the test split, followed by the raw
# predicted labels as the cell's displayed value.
print(classification_report(y_true=y_test, y_pred=predictions))
predictions

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.94      0.79      0.86        19
          2       0.79      0.94      0.86        16

avg / total       0.91      0.90      0.90        50

array([1, 2, 0, 1, 0, 1, 2, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2,
       2, 0, 1, 0, 1, 1, 2, 2, 1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0,
       1, 0, 2, 2, 2, 2])


from sklearn.base import clone

# The winning pipeline standardises its inputs before the tree sees them, so
# fit a separate tree with the same hyperparameters on the raw training data
# for export and for predicting unscaled samples directly.
# clone() matters here: get_params()["clf"] is the very object held by
# clf.best_estimator_, and refitting it in place would leave that pipeline
# with a tree trained on unscaled data sitting behind its scaler.
besttree = clone(clf.best_estimator_.get_params()["clf"])
besttree.fit(X_train, y_train)

# Write the tree as Graphviz DOT and read it back through one shared path.
# The path has no Windows-only ".\" prefix, so it names the same file on
# every operating system.
dot_file = "temptree.dot"
dot_data = tree.export_graphviz(besttree, out_file=dot_file,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True,
                                special_characters=True)

graph = pydotplus.graph_from_dot_file(dot_file)


# Render the graph to PNG bytes and display them as this cell's result.
Image(data=graph.create_png())


# Score the stand-alone tree on the held-out test split.
predictions = besttree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.94      0.79      0.86        19
          2       0.79      0.94      0.86        16

avg / total       0.91      0.90      0.90        50


# Load the credit-card default spreadsheet, skipping its first row so that
# the following row is used for the column names.
data = pd.read_excel(io="default of credit card clients.xls", skiprows=[0])
# Preview the first five rows.
data.head(n=5)


from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
import pydotplus
%matplotlib inline
from IPython.display import Image
y = data['default payment next month']
X = data.drop(["ID",'default payment next month'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                  test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)
pipeline = Pipeline([('clf',tree.DecisionTreeClassifier(criterion='gini'))]) 
hyperparameters = { 
    'clf__max_depth': (10, 15, 20),  
    'clf__min_samples_split': ( 2,10,20),  
    'clf__min_samples_leaf': (1,5,10),
    'clf__max_features': (5,10)
}  
clf = GridSearchCV(pipeline, hyperparameters, cv=10,n_jobs=1,verbose=1)
clf.fit(X_train, y_train)
besttree=clf.best_estimator_.get_params()["clf"]
besttree.fit(X_train, y_train)

C:\Users\he\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
C:\Users\he\Anaconda3\lib\site-packages\sklearn\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

Fitting 10 folds for each of 54 candidates, totalling 540 fits

[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:  1.8min finished

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


# Write the tree as Graphviz DOT and read it back through one shared path.
# The path has no Windows-only ".\" prefix: on POSIX systems a backslash is
# an ordinary file-name character, so r'.\temptree.dot' would not name the
# file that was just written.
dot_file = "temptree.dot"
dot_data = tree.export_graphviz(besttree, out_file=dot_file,
                                feature_names=X_train.columns,
                                filled=True, rounded=True,
                                special_characters=True)

graph = pydotplus.graph_from_dot_file(dot_file)

# Save the rendered tree as a PDF next to the notebook.
graph.write_pdf("default.pdf")

True


# Evaluate the tree on the held-out test split, then show how much each
# feature column contributed to its splits.
predictions = besttree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
print(besttree.feature_importances_)

             precision    recall  f1-score   support

          0       0.83      0.94      0.88      4673
          1       0.62      0.32      0.42      1327

avg / total       0.78      0.81      0.78      6000

[0.03514316 0.         0.00652261 0.00628771 0.02154708 0.18420739
 0.39850398 0.01444067 0.03060112 0.01182362 0.03158272 0.04143561
 0.01872796 0.02544174 0.01437987 0.00919085 0.02192613 0.02377842
 0.01419559 0.02263757 0.03484195 0.01318168 0.01960257]


# Best cross-validation score found by the decision-tree grid search.
print(f'Best Score: {clf.best_score_}')

Best Score: 0.810125


from sklearn.metrics import roc_curve

# Column 1 of the predict_proba output is used as the score of the positive
# (default) class for both curves.
predictions_prob = besttree.predict_proba(X_test)

# Precision-recall points for the tree.
precision, recall, thresholds = precision_recall_curve(
    y_test, predictions_prob[:, 1]
)
# ROC points for the tree; "thresholds" is rebound here, so afterwards it
# holds the ROC thresholds rather than the precision-recall ones.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=predictions_prob[:, 1])


# Quick look: a slice of the true test labels next to the predicted
# positive-class scores for the whole test set.
y_test[1:10],predictions_prob[:,1]

(29444    1
 12392    1
 5591     0
 7282     0
 18238    0
 18318    0
 22939    0
 16704    0
 25910    0
 Name: default payment next month, dtype: int64,
 array([0.1875    , 0.07821229, 0.78927911, ..., 0.09736124, 0.61728395,
        0.19664032]))


# Two panels side by side: precision-recall on the left, ROC on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Left: precision ("查准率") on the x axis, recall ("查全率") on the y axis.
ax1.set(ylabel="查全率", xlabel="查准率")
ax1.plot(precision, recall)

# Right: ROC curve of the decision tree.
ax2.set(ylabel="True positive rate", xlabel="False positive rate")
ax2.plot(fpr, tpr)

[<matplotlib.lines.Line2D at 0x1b093909390>]


from sklearn.ensemble import RandomForestClassifier
# Small search grid for the random forest.
parameter_gridsearch = dict(
    max_depth=[5, 7],       # depth of each decision tree
    n_estimators=[50, 40],  # number of decision trees in the forest
)

# Default-configured forest wrapped in a grid search that uses the search's
# default cross-validation settings.
randomforest = RandomForestClassifier()
gridsearch = GridSearchCV(
    estimator=randomforest,
    param_grid=parameter_gridsearch,
)


# Run the random-forest grid search on the training split.
gridsearch.fit(X_train, y_train)

# Report the selected parameters and their cross-validation score, then
# evaluate the search's predictions on the held-out test split.
parameters = gridsearch.best_params_
print(parameters)
print(f'Best Score: {gridsearch.best_score_}')
predictions = gridsearch.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

C:\Users\he\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

{'max_depth': 7, 'n_estimators': 50}
Best Score: 0.8181666666666667
             precision    recall  f1-score   support

          0       0.83      0.96      0.89      4673
          1       0.69      0.32      0.44      1327

avg / total       0.80      0.82      0.79      6000


from sklearn.metrics import roc_curve

# Column 1 of the predict_proba output is used as the positive-class score
# of the random forest for both curves.
predictions_prob1 = gridsearch.predict_proba(X_test)

# Precision-recall points for the forest.
precision1, recall1, thresholds = precision_recall_curve(
    y_test, predictions_prob1[:, 1]
)
# ROC points for the forest; "thresholds" is rebound to the ROC thresholds.
fpr1, tpr1, thresholds = roc_curve(y_true=y_test, y_score=predictions_prob1[:, 1])


# Compare the random forest with the single CART tree (drawn in red).
plt.figure(figsize=(10, 5))

# Left panel: precision ("查准率") on x, recall ("查全率") on y.
plt.subplot(
    1, 2, 1,
    title="precision recall curve",
    xlabel="查准率",
    ylabel="查全率",
)
plt.plot(precision1, recall1, label="random forrest")
plt.plot(precision, recall, label="CART", color="red")
plt.legend(loc="best")

# Right panel: ROC curves of both models.
plt.subplot(
    1, 2, 2,
    title="ROC curve",
    xlabel="False positive rate",
    ylabel="True positive rate",
)
plt.plot(fpr1, tpr1, label="random forrest")
plt.plot(fpr, tpr, label="CART", color="red")
plt.legend(loc="best")

<matplotlib.legend.Legend at 0x1b093ca9dd8>

参数	DecisionTreeClassifier
特征选择标准criterion	可以使用"gini"或者"entropy"，前者代表基尼系数，后者代表信息增益。一般来说使用默认值基尼系数"gini"就可以了，即CART算法。除非你更喜欢类似ID3、C4.5的最优特征选择方法。
特征划分点选择标准splitter	可以使用"best"或者"random"。前者在特征的所有划分点中找出最优的划分点。后者是随机的在部分划分点中找局部最优的划分点。默认值"best"适合样本量不大的时候，而如果样本数据量非常大，此时决策树构建推荐"random"。
划分时考虑的最大特征数max_features	可以使用很多种类型的值，默认是"None"，意味着划分时考虑所有的特征数；如果是"log2"意味着划分时最多考虑 $\log_2 N$个特征；如果是"sqrt"或者"auto"意味着划分时最多考虑$\sqrt{N}$个特征。如果是整数，代表考虑的特征绝对数。如果是浮点数，代表考虑特征百分比，即考虑（百分比$\times$N）取整后的特征数。其中N为样本总特征数。一般来说，如果样本特征数不多，比如小于50，我们用默认值"None"就可以了，如果特征数非常多，我们可以灵活使用刚才描述的其它取值来控制划分时考虑的最大特征数，以控制决策树的生成时间。
决策树最大深度max_depth	决策树的最大深度，默认可以不输入，如果不输入的话，决策树在建立子树的时候不会限制子树的深度。一般来说，数据少或者特征少的时候可以不管这个值。如果模型样本量多，特征也多的情况下，推荐限制这个最大深度，具体的取值取决于数据的分布。常用的可以取值10-100之间。
内部节点再划分所需最小样本数min_samples_split	这个值限制了子树继续划分的条件，如果某节点的样本数少于min_samples_split，则不会继续再尝试选择最优特征来进行划分。默认是2，如果样本量不大，不需要管这个值。如果样本量非常大，则推荐增大这个值。样本量有10万左右，建立决策树时，min_samples_split的参考值为10。
叶子节点最少样本数min_samples_leaf	这个值限制了叶子节点最少的样本数，如果某叶子节点的样本数小于这个值，则会和兄弟节点一起被剪枝。默认值是1，可以输入最少的样本数的整数，或者最少样本数占样本总数的百分比。如果样本量不大，不需要管这个值。如果样本量非常大，则推荐增大这个值。10万样本时min_samples_leaf的参考值为5。
叶子节点最小的样本权重和min_weight_fraction_leaf	这个值限制了叶子节点所有样本权重和的最小值，如果小于这个值，则会和兄弟节点一起被剪枝。默认值是0，就是不考虑权重问题。一般来说，如果较多样本有缺失值，或者分类树样本的分布类别偏差很大，就会引入样本权重。
最大叶子节点数max_leaf_nodes	通过限制最大叶子节点数，可以防止过拟合，默认是"None"，即不限制最大的叶子节点数。如果增加了该限制，算法会建立在最大叶子节点数内最优的决策树。如果特征不多，可以不考虑这个值，但是如果特征非常多的话，可以加以限制，具体的值可以通过交叉验证得到。
类别权重class_weight	指定样本各类别的权重，主要是为了防止训练集某些类别的样本过多，导致训练的决策树过于偏向这些类别。这里可以自己指定各个样本的权重或者用"balanced"，如果使用"balanced"，则算法会自己计算权重，样本量少的类别所对应的样本权重会高。当然，如果你的样本类别分布没有明显的偏倚，则可以不管这个参数，选择默认值"None"即可。
节点划分最小不纯度min_impurity_split	这个值限制了决策树的增长，如果某节点的不纯度（基尼系数，信息增益，均方差，绝对差）小于这个阈值，则该节点不再生成子节点，即为叶子节点。
数据是否预排序presort	这个值是布尔值，默认值是False即不进行预排序。一般来说，如果样本量少或者限制了一个深度很小的决策树，设置为True可以让划分点选择更加快，加快决策树的建立。如果样本量太大的话，反而没有什么好处。问题是样本量少的时候，速度本来就不慢，所以这个值一般不理它就可以了。

	ID	LIMIT_BAL	SEX	EDUCATION	MARRIAGE	AGE	PAY_0	PAY_2	PAY_3	PAY_4	...	BILL_AMT4	BILL_AMT5	BILL_AMT6	PAY_AMT1	PAY_AMT2	PAY_AMT3	PAY_AMT4	PAY_AMT5	PAY_AMT6	default payment next month
0	1	20000	2	2	1	24	2	2	-1	-1	...	0	0	0	0	689	0	0	0	0	1
1	2	120000	2	2	2	26	-1	2	0	0	...	3272	3455	3261	0	1000	1000	1000	0	2000	1
2	3	90000	2	2	2	34	0	0	0	0	...	14331	14948	15549	1518	1500	1000	1000	1000	5000	0
3	4	50000	2	2	1	37	0	0	0	0	...	28314	28959	29547	2000	2019	1200	1100	1069	1000	0
4	5	50000	1	2	1	57	-1	0	-1	0	...	20940	19146	19131	2000	36681	10000	9000	689	679	0

Machine Learning2 [CHI]

Scikit-Learn库--决策树和随机森林¶

分类回归的基本概念¶

划分准则¶

决策树的算法¶

决策树实例¶

DecisionTreeClassifier¶

scikit-learn 决策树算法类库¶

优化和评估模型¶

数据分割为training data 和 test data¶

数据标准化¶

超参数¶

CV¶

机器学习中为什么需要CV方法?¶

建模过程¶

随机森林 random forest¶