Scikit-Learn began as a Google Summer of Code project in 2007. After a major rewrite, it was re-released in 2010, and it now bundles implementations of many classic machine learning algorithms. Beyond its well-designed API, Scikit-Learn is also fast, because much of its lowest-level code is implemented in C. The snippet below builds a StackingClassifier that combines a random forest and a decision tree, using logistic regression as the final estimator.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris  # needed for the dataset used below
X, y = load_iris(return_X_y=True)
estimators = [
('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
('dt', DecisionTreeClassifier(random_state=42))
]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)
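To check whether the meta-learner actually helps, you can score the stacked model and each base learner under the same cross-validation scheme. A minimal sketch (the 5-fold CV and the `max_iter` setting are illustrative choices, not from the original example):

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
]
stack = StackingClassifier(estimators=estimators,
                           final_estimator=LogisticRegression(max_iter=1000))

# Score each base learner and the stacked ensemble with the same 5-fold CV
for name, model in estimators + [('stack', stack)]:
    scores = cross_val_score(model, X, y, cv=5)
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
```

If the stacked score does not beat the best base learner, the extra complexity is probably not worth it for that dataset.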
Permutation-based feature importance
As the name suggests, this technique assigns an importance to each feature by randomly permuting that feature's values and measuring the resulting drop in model performance.
import matplotlib.pyplot as plt  # needed for the plot below
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.datasets import make_classification
# Use Sklearn make classification to create a dummy dataset with 3 important variables out of 7
X, y = make_classification(random_state=0, n_features=7, n_informative=3)
rf = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(rf, X, y,
n_repeats=10, # Number of times for which each feature must be shuffled
random_state=0, # fix the random state for reproducibility
n_jobs=-1) # Parallel processing using all cores
fig, ax = plt.subplots()
sorted_idx = result.importances_mean.argsort()
ax.boxplot(result.importances[sorted_idx].T,
           vert=False, labels=sorted_idx)  # label each box with the index of the permuted feature
ax.set_title("Permutation Importance of each feature")
ax.set_xlabel("Decrease in accuracy score")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()
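If you only need the numbers rather than the plot, the same `result` object exposes `importances_mean` (and `importances_std`) directly. A minimal sketch that prints a ranked list instead:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = make_classification(random_state=0, n_features=7, n_informative=3)
rf = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0)

# Rank features by mean performance drop, most important first
ranking = sorted(enumerate(result.importances_mean),
                 key=lambda t: t[1], reverse=True)
for idx, mean_drop in ranking:
    print(f"feature {idx}: mean accuracy drop {mean_drop:.3f}")
```

With 3 informative features out of 7, the top of this list should be dominated by the informative ones.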
Tree pruning with cost-complexity pruning
Tree-based models in Scikit-Learn support minimal cost-complexity pruning through the ccp_alpha parameter: larger values prune more aggressively and produce smaller trees, as the node counts below show.
import numpy as np  # needed for the node-count averages below
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
X, y = make_classification(random_state=0)
rf = RandomForestClassifier(random_state=0, ccp_alpha=0).fit(X, y)
print("Average number of nodes without pruning {:.1f}".format(
np.mean([e.tree_.node_count for e in rf.estimators_])))
rf = RandomForestClassifier(random_state=0, ccp_alpha=0.1).fit(X, y)
print("Average number of nodes with pruning {:.1f}".format(
np.mean([e.tree_.node_count for e in rf.estimators_])))
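Pruning trades model size against accuracy, so in practice you would sweep a few ccp_alpha values and keep the largest one that does not hurt cross-validated performance. A quick sketch (the alpha grid here is an arbitrary illustration, not a recommendation):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(random_state=0)

# Evaluate a few pruning strengths with 5-fold cross-validation
for alpha in (0.0, 0.01, 0.1):
    rf = RandomForestClassifier(random_state=0, ccp_alpha=alpha)
    scores = cross_val_score(rf, X, y, cv=5)
    print(f"ccp_alpha={alpha}: CV accuracy {scores.mean():.3f}")
```

For a finer search, the same sweep fits naturally into `GridSearchCV` with `ccp_alpha` in the parameter grid.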