Discrete versus Real AdaBoost in Scikit-learn

This example is based on Figure 10.2 from Hastie et al. 2009 [1] and illustrates the difference in performance between the discrete SAMME [2] boosting algorithm and the real SAMME.R boosting algorithm. Both algorithms are evaluated on a binary classification task where the target Y is a non-linear function of 10 input features.

Discrete SAMME AdaBoost adapts based on errors in predicted class labels whereas real SAMME.R uses the predicted class probabilities.

Version

In [1]:
# Show the scikit-learn version this notebook was executed with.
import sklearn
sklearn.__version__

Out[1]:
'0.18.1'

Imports

In [2]:
# Print the docstring of the current namespace.
# NOTE(review): presumably a carry-over from the stand-alone script form of
# this example — confirm it is still wanted in the notebook.
print(__doc__)

import numpy as np
import plotly.graph_objs as go
import plotly.plotly as py
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import zero_one_loss
from sklearn.tree import DecisionTreeClassifier

Calculations

In [3]:
n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
learning_rate = 1.

# Draw 12000 samples of the Hastie et al. 10-feature binary problem; the
# first 2000 rows are the training set, the remaining 10000 the test set.
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

X_test, y_test = X[2000:], y[2000:]
X_train, y_train = X[:2000], y[:2000]

# Baseline 1: a single decision stump (depth-1 tree).
dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
dt_stump.fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

# Baseline 2: a single depth-9 decision tree.
dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)
dt.fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)

# Discrete AdaBoost (SAMME) boosted from the stump.
# `base_estimator` is the parameter name shown in this notebook's recorded
# AdaBoostClassifier output (scikit-learn 0.18.1).
ada_discrete = AdaBoostClassifier(
    base_estimator=dt_stump,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    algorithm="SAMME")
ada_discrete.fit(X_train, y_train)

# Real AdaBoost (SAMME.R) with the same stump and hyper-parameters.
ada_real = AdaBoostClassifier(
    base_estimator=dt_stump,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    algorithm="SAMME.R")
ada_real.fit(X_train, y_train)


def _staged_errors(model, X_eval, y_eval):
    """Return the zero-one loss of `model` after each boosting stage.

    The result always has length `n_estimators` so it lines up with the
    x axis of the plot; stages that were never reached (boosting may stop
    early) keep their initial value of 0.
    """
    errors = np.zeros((n_estimators,))
    for stage, y_pred in enumerate(model.staged_predict(X_eval)):
        errors[stage] = zero_one_loss(y_eval, y_pred)
    return errors


# Per-stage test/train error curves consumed by the plotting cell.
ada_discrete_err = _staged_errors(ada_discrete, X_test, y_test)
ada_discrete_err_train = _staged_errors(ada_discrete, X_train, y_train)
ada_real_err = _staged_errors(ada_real, X_test, y_test)
ada_real_err_train = _staged_errors(ada_real, X_train, y_train)

# Echo the fitted SAMME.R model so the cell output shows its configuration.
ada_real

Out[3]:
AdaBoostClassifier(algorithm='SAMME.R',
base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'),
learning_rate=1.0, n_estimators=400, random_state=None)

Plot Results

In [4]:
# Horizontal reference lines: the two single-tree baselines do not depend on
# the number of boosting stages, so each is drawn from x=1 to x=n_estimators.
decision_stump = go.Scatter(
    x=[1, n_estimators], y=[dt_stump_err] * 2,
    mode='lines',
    line=dict(color='black'),
    name='Decision Stump Error')

decision_tree = go.Scatter(
    x=[1, n_estimators], y=[dt_err] * 2,
    mode='lines',
    line=dict(color='black', dash='dash'),
    name='Decision Tree Error')

# Shared x axis for the boosting curves: stage numbers 1..n_estimators.
stage_numbers = np.arange(n_estimators) + 1

dtest_error = go.Scatter(
    x=stage_numbers, y=ada_discrete_err,
    mode='lines',
    line=dict(color='red'),
    name='Discrete AdaBoost Test Error')

dtrain_error = go.Scatter(
    x=stage_numbers, y=ada_discrete_err_train,
    mode='lines',
    line=dict(color='blue'),
    name='Discrete AdaBoost Train Error')

rtest_error = go.Scatter(
    x=stage_numbers, y=ada_real_err,
    mode='lines',
    line=dict(color='orange'),
    name='Real AdaBoost Test Error')

rtrain_error = go.Scatter(
    x=stage_numbers, y=ada_real_err_train,
    mode='lines',
    line=dict(color='green'),
    name='Real AdaBoost Train Error')

layout = go.Layout(
    xaxis=dict(title='n_estimators'),
    yaxis=dict(title='error rate'))

data = [decision_stump, decision_tree, dtest_error, dtrain_error, rtest_error, rtrain_error]
fig = go.Figure(data=data, layout=layout)

In [5]:
# Hand the assembled figure to plotly's `iplot` for display in the notebook.
py.iplot(fig)

Out[5]:

