Hands-On: Validation of Supervised Classification Pipelines
Imports
# System
import os
import os.path
import tempfile
import time
# Scientific python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Univariate statistics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
# Dataset
from sklearn.datasets import make_classification
# Models
from sklearn.decomposition import PCA
import sklearn.linear_model as lm
import sklearn.svm as svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Metrics
import sklearn.metrics as metrics
# Resampling
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
# from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
Settings
Input/output directories and working directory
WD = os.path.join(tempfile.gettempdir(), "ml_supervised_classif")
os.makedirs(WD, exist_ok=True)
INPUT_DIR = os.path.join(WD, "data")
OUTPUT_DIR = os.path.join(WD, "models")
os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
Validation scheme: cross-validation, with an outer CV (cv_test) to estimate test performance and an inner CV (cv_val) for hyper-parameter selection
n_splits_test = 5
cv_test = StratifiedKFold(n_splits=n_splits_test, shuffle=True, random_state=42)
n_splits_val = 5
cv_val = StratifiedKFold(n_splits=n_splits_val, shuffle=True, random_state=42)
metrics_names = ['accuracy', 'balanced_accuracy', 'roc_auc']
Dataset
X, y = make_classification(n_samples=200, n_features=100,
                           n_informative=10, n_redundant=10,
                           random_state=1)
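As a quick sanity check (a small addition, not part of the original script), we can verify the class balance and confirm that the outer StratifiedKFold preserves it in each test fold:

# Class counts in the full dataset (make_classification is balanced by default)
print("Class counts:", np.bincount(y))

# Class counts within each outer test fold
for fold, (train_idx, test_idx) in enumerate(cv_test.split(X, y)):
    print("Fold %d test class counts: %s" % (fold, np.bincount(y[test_idx])))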
Models
mlp_param_grid = {"hidden_layer_sizes":
                  [(100, ), (50, ), (25, ), (10, ), (5, ),           # 1 hidden layer
                   (100, 50, ), (50, 25, ), (25, 10,), (10, 5, ),    # 2 hidden layers
                   (100, 50, 25, ), (50, 25, 10, ), (25, 10, 5, )],  # 3 hidden layers
                  "activation": ["relu"], "solver": ["sgd"], 'alpha': [0.0001]}

models = dict(
    lrl2_cv=make_pipeline(
        preprocessing.StandardScaler(),
        # preprocessing.MinMaxScaler(),
        GridSearchCV(lm.LogisticRegression(),
                     {'C': 10. ** np.arange(-3, 1)},
                     cv=cv_val, n_jobs=n_splits_val)),

    lrenet_cv=make_pipeline(
        preprocessing.StandardScaler(),
        # preprocessing.MinMaxScaler(),
        GridSearchCV(estimator=lm.SGDClassifier(loss='log_loss',
                                                penalty='elasticnet'),
                     param_grid={'alpha': 10. ** np.arange(-1, 3),
                                 'l1_ratio': [.1, .5, .9]},
                     cv=cv_val, n_jobs=n_splits_val)),

    svmrbf_cv=make_pipeline(
        # preprocessing.StandardScaler(),
        preprocessing.MinMaxScaler(),
        GridSearchCV(svm.SVC(),
                     # {'kernel': ['poly', 'rbf'], 'C': 10. ** np.arange(-3, 3)},
                     {'kernel': ['rbf'], 'C': 10. ** np.arange(-1, 2)},
                     cv=cv_val, n_jobs=n_splits_val)),

    forest_cv=make_pipeline(
        # preprocessing.StandardScaler(),
        preprocessing.MinMaxScaler(),
        GridSearchCV(RandomForestClassifier(random_state=1),
                     {"n_estimators": [10, 100]},
                     cv=cv_val, n_jobs=n_splits_val)),

    gb_cv=make_pipeline(
        preprocessing.MinMaxScaler(),
        GridSearchCV(estimator=GradientBoostingClassifier(random_state=1),
                     param_grid={"n_estimators": [10, 100]},
                     cv=cv_val, n_jobs=n_splits_val)),

    mlp_cv=make_pipeline(
        # preprocessing.StandardScaler(),
        preprocessing.MinMaxScaler(),
        GridSearchCV(estimator=MLPClassifier(random_state=1, max_iter=200, tol=0.0001),
                     param_grid=mlp_param_grid,
                     cv=cv_val, n_jobs=n_splits_val)))
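Each entry is a pipeline made of a scaler followed by a GridSearchCV, so hyper-parameters are selected on the inner CV (cv_val) within whatever training data the pipeline receives. As a minimal sketch (not part of the original script), one can fit a single pipeline on the whole dataset and inspect the hyper-parameters chosen by the inner CV; for unbiased test scores, use the outer CV below instead:

# Illustration only: fit one pipeline and look at the selected hyper-parameters.
# make_pipeline names the steps after their classes, hence "gridsearchcv".
lrl2 = models["lrl2_cv"].fit(X, y)
print("Selected hyper-parameters:", lrl2.named_steps["gridsearchcv"].best_params_)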
Fit/Predict and Compute Test Score (CV)
Fit the models and compute the test scores on each fold using cross_validate.
Here we set:

- return_estimator=True to return the estimator fitted on each training set;
- return_indices=True to return the training and testing indices used to split the dataset into train and test sets for each CV split.
models_scores = dict()

for name, model in models.items():
    # name, model = "lrl2_cv", models["lrl2_cv"]
    start_time = time.time()
    models_scores_ = cross_validate(estimator=model, X=X, y=y, cv=cv_test,
                                    n_jobs=n_splits_test,
                                    scoring=metrics_names,
                                    return_estimator=True,
                                    return_indices=True)
    print(name, 'Elapsed time: \t%.3f sec' % (time.time() - start_time))
    models_scores[name] = models_scores_
lrl2_cv Elapsed time: 1.019 sec
lrenet_cv Elapsed time: 0.156 sec
svmrbf_cv Elapsed time: 0.095 sec
forest_cv Elapsed time: 1.110 sec
gb_cv Elapsed time: 1.737 sec
mlp_cv Elapsed time: 15.332 sec
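To see what cross_validate returned, here is a small sketch (not part of the original output) that inspects the result dictionary for one model:

# Each value of models_scores is a dict with fit/score times, one entry per
# metric ("test_accuracy", ...), plus "estimator" and "indices".
res = models_scores["lrl2_cv"]
print(sorted(res.keys()))
print(res["test_roc_auc"])      # one ROC-AUC value per outer fold
print(len(res["estimator"]))    # one fitted pipeline per outer fold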
Average Test Scores (CV) and Save Them to a File
test_stat = [[name] + [res["test_" + metric].mean() for metric in metrics_names]
             for name, res in models_scores.items()]
test_stat = pd.DataFrame(test_stat, columns=["model"] + metrics_names)
test_stat.to_csv(os.path.join(OUTPUT_DIR, "test_stat.csv"))
print(test_stat)
       model  accuracy  balanced_accuracy  roc_auc
0    lrl2_cv     0.765              0.765  0.81000
1  lrenet_cv     0.730              0.730  0.82200
2  svmrbf_cv     0.740              0.740  0.83500
3  forest_cv     0.800              0.800  0.85675
4      gb_cv     0.795              0.795  0.86100
5     mlp_cv     0.575              0.575  0.62100
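The table above reports only the mean over the outer folds. As an optional addition (not in the original script), the dispersion across folds can be reported the same way, replacing mean() with std():

test_std = [[name] + [res["test_" + metric].std() for metric in metrics_names]
            for name, res in models_scores.items()]
test_std = pd.DataFrame(test_std, columns=["model"] + metrics_names)
print(test_std)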
Retrieve Individual Predictions
1. Retrieve individual predictions and save them to a CSV file
# Iterate over models
predictions = pd.DataFrame()

for name, model in models_scores.items():
    # name, model = "lrl2_cv", models_scores["lrl2_cv"]
    # model_scores = models_scores["lrl2_cv"]
    pred_vals_test = np.full(y.shape, np.nan)   # Predicted values before threshold
    pred_vals_train = np.full(y.shape, np.nan)  # Predicted values before threshold
    pred_labs_test = np.full(y.shape, np.nan)   # Predicted labels
    pred_labs_train = np.full(y.shape, np.nan)  # Predicted labels
    true_labs = np.full(y.shape, np.nan)        # True labels
    fold_nb = np.full(y.shape, np.nan)          # Fold number

    # Iterate over folds
    for fold in range(len(model['estimator'])):
        est = model['estimator'][fold]
        test_idx = model['indices']['test'][fold]
        train_idx = model['indices']['train'][fold]
        X_test = X[test_idx]
        X_train = X[train_idx]

        # Predicted labels
        pred_labs_test[test_idx] = est.predict(X_test)
        pred_labs_train[train_idx] = est.predict(X_train)
        fold_nb[test_idx] = fold

        # Predicted values before threshold: probabilities when available,
        # decision function otherwise (e.g., SVC without probability estimates)
        try:
            pred_vals_test[test_idx] = est.predict_proba(X_test)[:, 1]
            pred_vals_train[train_idx] = est.predict_proba(X_train)[:, 1]
        except AttributeError:
            pred_vals_test[test_idx] = est.decision_function(X_test)
            pred_vals_train[train_idx] = est.decision_function(X_train)

        true_labs[test_idx] = y[test_idx]

    predictions_ = pd.DataFrame(dict(model=name, fold=fold_nb.astype(int),
                                     pred_vals_test=pred_vals_test,
                                     pred_labs_test=pred_labs_test.astype(int),
                                     true_labs=y))
    assert np.all(true_labs == y)
    predictions = pd.concat([predictions, predictions_])
predictions.to_csv(os.path.join(OUTPUT_DIR, "predictions.csv"))
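If only the out-of-fold predicted labels are needed, scikit-learn's cross_val_predict (imported above) is a simpler alternative; note that it does not return the fitted estimators, the fold indices, or the train-set predictions collected by the loop above. A minimal sketch:

# Pooled out-of-fold predictions for a single model
pred_labs = cross_val_predict(models["lrl2_cv"], X, y, cv=cv_test,
                              n_jobs=n_splits_test)
print("Balanced accuracy:", metrics.balanced_accuracy_score(y, pred_labs))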
2. Recompute scores from saved predictions
models_scores_cv = [
    [mod, fold,
     metrics.balanced_accuracy_score(df["true_labs"], df["pred_labs_test"]),
     metrics.roc_auc_score(df["true_labs"], df["pred_vals_test"])]
    for (mod, fold), df in predictions.groupby(["model", "fold"])]

models_scores_cv = pd.DataFrame(models_scores_cv,
                                columns=["model", "fold",
                                         'balanced_accuracy', 'roc_auc'])
models_scores = models_scores_cv.groupby("model").mean()
models_scores = models_scores.drop("fold", axis=1)
print(models_scores)
           balanced_accuracy  roc_auc
model
forest_cv              0.800  0.85675
gb_cv                  0.795  0.86100
lrenet_cv              0.730  0.82200
lrl2_cv                0.765  0.81000
mlp_cv                 0.575  0.62100
svmrbf_cv              0.740  0.83500
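As a sanity check (an addition to the original script), the averages recomputed from the saved predictions can be compared with the ones computed directly by cross_validate; they should agree, since the same fold assignments and the same per-fold metrics are used:

print(test_stat.set_index("model")[["balanced_accuracy", "roc_auc"]].sort_index())
print(models_scores)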
Total running time of the script: (0 minutes 20.499 seconds)