# Cross-validation on diabetes Dataset Exercise in Scikit-learn

A tutorial exercise which uses cross-validation with linear models.

This exercise is used in the "Cross-validated estimators" part of the "Model selection: choosing estimators and their parameters" section of "A tutorial on statistical-learning for scientific data processing".

### Version

import sklearn
sklearn.__version__

'0.18.1'

### Imports

This tutorial imports LassoCV, Lasso, KFold and cross_val_score.

from __future__ import print_function
print(__doc__)

import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np
from sklearn import datasets
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

### Calculations

# Cross-validate a Lasso model over a grid of alpha values and record, for
# each alpha, the mean and the standard deviation of the per-fold scores.

# Only the first 150 samples of the diabetes dataset are used.
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]

lasso = Lasso(random_state=0)
# 30 candidate alphas, log-spaced between 10**-4 and 10**-0.5.
alphas = np.logspace(-4, -0.5, 30)

scores = []
scores_std = []

n_folds = 3

for alpha in alphas:
    # Reuse the same estimator; only its regularisation strength changes.
    lasso.alpha = alpha
    this_scores = cross_val_score(lasso, X, y, cv=n_folds, n_jobs=1)
    scores.append(np.mean(this_scores))
    scores_std.append(np.std(this_scores))

# Arrays are needed for the element-wise arithmetic done when plotting.
scores, scores_std = np.array(scores), np.array(scores_std)


### Plot cross-validation with linear models

# Plot the mean CV score against alpha (log x-axis) together with a band of
# +/- one standard error and a horizontal marker at the best mean score.

# Standard error of the mean score over the folds.
std_error = scores_std / np.sqrt(n_folds)
upper_band = scores + std_error
lower_band = scores - std_error
best_score = np.max(scores)

# Mean score curve (solid).
p1 = go.Scatter(
    x=alphas,
    y=scores,
    mode='lines',
    line={'color': 'blue'},
    fill='tonexty',
)

# Upper edge of the error band (dashed).
p2 = go.Scatter(
    x=alphas,
    y=upper_band,
    mode='lines',
    line={'color': 'blue', 'dash': 'dash'},
)

# Lower edge of the error band (dashed).
p3 = go.Scatter(
    x=alphas,
    y=lower_band,
    mode='lines',
    line={'color': 'blue', 'dash': 'dash'},
    fill='tonexty',
)

# Thin horizontal reference line at the maximum mean score.
line = go.Scatter(
    x=[min(alphas), max(alphas)],
    y=[best_score, best_score],
    mode='lines',
    line={'color': 'black', 'dash': 'dash', 'width': 1},
)

layout = go.Layout(
    xaxis={'title': 'alpha', 'type': 'log'},
    yaxis={'title': 'CV score +/- std error'},
    showlegend=False,
)

# Trace order matters for the 'tonexty' fills: upper edge, mean, lower edge.
fig = go.Figure(data=[p2, p1, p3, line], layout=layout)

py.iplot(fig)

### Bonus Question

Bonus: how much can you trust the selection of alpha?

# To answer this question we use the LassoCV object that sets its alpha
# parameter automatically from the data by internal cross-validation (i.e. it
# performs cross-validation on the training data it receives).
# We use external cross-validation to see how much the automatically obtained
# alphas differ across different cross-validation folds.
lasso_cv = LassoCV(alphas=alphas, random_state=0)
k_fold = KFold(3)

print("Answer to the bonus question:",
      "how much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")
for k, (train, test) in enumerate(k_fold.split(X, y)):
    # Fit on the training part of the outer fold, then report the alpha that
    # LassoCV selected and its score on the held-out part.
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
print()
print("Answer: Not very much since we obtained different alphas for different")
print("subsets of the data and moreover, the scores for these alphas differ")
print("quite substantially.")

Answer to the bonus question: how much can you trust the selection of alpha?

Alpha parameters maximising the generalization score on different
subsets of the data:
[fold 0] alpha: 0.10405, score: 0.53573
[fold 1] alpha: 0.05968, score: 0.16278
[fold 2] alpha: 0.10405, score: 0.44437

Answer: Not very much since we obtained different alphas for different
subsets of the data and moreover, the scores for these alphas differ
quite substantially.

Still need help?