
Faces recognition example using eigenfaces and SVMs in Scikit-learn

The dataset used in this example is a preprocessed excerpt of the “Labeled Faces in the Wild”, aka LFW:

http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB)

New to Plotly?

Plotly's Python library is free and open source! Get started by downloading the client and reading the primer.
You can set up Plotly to work in online or offline mode, or in jupyter notebooks.
We also have a quick-reference cheatsheet (new!) to help you get started!

Version

In [1]:
import sklearn
sklearn.__version__
Out[1]:
'0.18.1'

Imports

In [2]:
from __future__ import print_function

import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import plotly
import numpy as np

from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC


print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
Automatically created module for IPython interactive environment

Calculations

Download the data, if it is not already on disk, and load it as numpy arrays.

In [3]:
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the flattened pixel data directly (relative
# pixel position information is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7
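
The seven classes are quite unevenly represented, which is why the SVM below is fitted with class_weight='balanced'. A quick per-class count (not part of the original example, shown only as a sanity check) could look like this:

In [ ]:
# optional sanity check, not in the original example:
# how many images does each person contribute?
for name, count in zip(target_names, np.bincount(y)):
    print("%-20s %d" % (name, count))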

Split the data into a training set and a test set (75% / 25%) with train_test_split.

In [4]:
# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
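
The call above draws a plain shuffled split. If you want the split to preserve the per-person class proportions instead, train_test_split also accepts a stratify argument; a minimal variant (not used in the original example):

In [ ]:
# stratified variant -- an assumption, not what the example above does:
# stratify=y keeps the class proportions roughly equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)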

Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled dataset): unsupervised feature extraction / dimensionality reduction.

In [5]:
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))
Extracting the top 150 eigenfaces from 966 faces
done in 0.187s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.020s
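
How much of the pixel variance the 150 retained components capture can be read directly off the fitted PCA object; this check is not part of the original example:

In [ ]:
# optional inspection, not in the original example: cumulative variance
# retained by the n_components eigenfaces
print("Variance retained by %d components: %.1f%%"
      % (n_components, 100 * pca.explained_variance_ratio_.sum()))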

Train an SVM classification model

In [6]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
Fitting the classifier to the training set
done in 32.157s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
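
Besides best_estimator_, GridSearchCV also records the winning parameter combination and its mean cross-validated score; printing them (not shown in the original example) is a one-liner:

In [ ]:
# optional, not in the original output: best parameters and their
# mean cross-validated accuracy on the training set
print("Best parameters: %s" % clf.best_params_)
print("Best cross-validation score: %0.3f" % clf.best_score_)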

Quantitative evaluation of the model quality on the test set

In [7]:
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
Predicting people's names on the test set
done in 0.085s
                   precision    recall  f1-score   support

     Ariel Sharon       0.82      0.69      0.75        13
     Colin Powell       0.77      0.95      0.85        60
  Donald Rumsfeld       0.94      0.59      0.73        27
    George W Bush       0.88      0.94      0.91       146
Gerhard Schroeder       0.88      0.84      0.86        25
      Hugo Chavez       1.00      0.80      0.89        15
       Tony Blair       0.97      0.78      0.86        36

      avg / total       0.88      0.87      0.87       322

[[  9   4   0   0   0   0   0]
 [  1  57   0   2   0   0   0]
 [  0   3  16   7   1   0   0]
 [  1   7   0 137   1   0   0]
 [  0   1   0   2  21   0   1]
 [  0   1   0   1   1  12   0]
 [  0   1   1   6   0   0  28]]
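
Since the rest of the notebook renders its figures with Plotly, the confusion matrix can also be shown that way. The figure below is a sketch, not part of the original example; it reuses the py and go imports from above:

In [ ]:
# hypothetical extra figure, not in the original example: the confusion
# matrix as a Plotly heatmap with the person names on both axes
cm = confusion_matrix(y_test, y_pred, labels=range(n_classes))
cm_trace = go.Heatmap(z=cm,
                      x=list(target_names),
                      y=list(target_names),
                      colorscale='Viridis',
                      showscale=True)
cm_layout = go.Layout(title='Confusion matrix',
                      xaxis=dict(title='Predicted label'),
                      yaxis=dict(title='True label', autorange='reversed'))
py.iplot(go.Figure(data=[cm_trace], layout=cm_layout))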

Plots

In [8]:
def matplotlib_to_plotly(cmap, pl_entries):
    """Convert a matplotlib colormap into a Plotly colorscale."""
    h = 1.0 / (pl_entries - 1)
    pl_colorscale = []

    for k in range(pl_entries):
        # wrap map() in list() so the result can be indexed under Python 3
        C = list(map(np.uint8, np.array(cmap(k * h)[:3]) * 255))
        pl_colorscale.append([k * h, 'rgb' + str((C[0], C[1], C[2]))])

    return pl_colorscale

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    fig = tools.make_subplots(rows=n_row, cols=n_col, print_grid=False,
                              subplot_titles=tuple(titles[:n_row * n_col]))
    gray_colorscale = matplotlib_to_plotly(plt.cm.gray, 300)

    for i in range(n_row):
        for j in range(n_col):
            # the (i * n_col + j)-th image goes into row i+1, column j+1
            trace = go.Heatmap(z=images[i * n_col + j].reshape((h, w)),
                               showscale=False,
                               colorscale=gray_colorscale)
            fig.append_trace(trace, i + 1, j + 1)

    # hide the tick labels and flip the y axes so the faces are upright
    for i in map(str, range(1, n_row * n_col + 1)):
        fig['layout']['yaxis' + i].update(autorange='reversed',
                                          showticklabels=False, ticks='')
        fig['layout']['xaxis' + i].update(showticklabels=False, ticks='')

    fig['layout'].update(height=1000)
    return fig

def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s<br>true:      %s' % (pred_name, true_name)
In [9]:
prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

py.iplot(plot_gallery(X_test, prediction_titles, h, w))
Out[9]:
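
If you have set Plotly up for offline mode, the same gallery can be rendered without an account by swapping py.iplot for plotly.offline.iplot (an alternative, not what the original notebook does):

In [ ]:
# offline alternative -- assumption, not used in the original notebook
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
iplot(plot_gallery(X_test, prediction_titles, h, w))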

Plot the gallery of the most significant eigenfaces

In [11]:
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
py.iplot(plot_gallery(eigenfaces, eigenface_titles, h, w))
Out[11]: