Show Sidebar Hide Sidebar

# Comparison of the K-Means and MiniBatchKMeans clustering algorithms in Scikit-learn

Note: this page is part of the documentation for version 3 of Plotly.py, which is not the most recent version.
See our Version 4 Migration Guide for information about how to upgrade.

We want to compare the performance of MiniBatchKMeans and KMeans: MiniBatchKMeans is faster, but gives slightly different results (see Mini Batch K-Means).

We will cluster a set of data, first with KMeans and then with MiniBatchKMeans, and plot the results. We will also plot the points that are labelled differently between the two algorithms.

#### New to Plotly?

Plotly's Python library is free and open source! Get started by downloading the client and reading the primer.
You can set up Plotly to work in online or offline mode, or in jupyter notebooks.
We also have a quick-reference cheatsheet (new!) to help you get started!

### Version

In :
import sklearn
sklearn.__version__

Out:
'0.18'

### Imports

In :
print(__doc__)

import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools

import time
import numpy as np

from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.datasets.samples_generator import make_blobs

Automatically created module for IPython interactive environment


### Calculations

Generate sample data.

In :
# Fix the global NumPy seed so the generated blobs are reproducible.
np.random.seed(0)

# Blob centres in 2-D; n_clusters is reused by the plotting loops below.
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)

# Mini-batch size passed to MiniBatchKMeans further down.
batch_size = 45

# 3000 samples scattered around the centres with a spread of 0.7.
X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7)


Compute clustering with KMeans

In :
# Fit full-batch KMeans and keep the wall-clock training time in t_batch.
# The cluster count comes from n_clusters (derived from `centers`) instead of
# a repeated literal, so the model stays in sync with the generated data.
k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0


Compute clustering with MiniBatchKMeans

In :
# Fit MiniBatchKMeans on the same data and keep the wall-clock training time
# in t_mini_batch. The cluster count comes from n_clusters (derived from
# `centers`) instead of a repeated literal, matching the KMeans model.
mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                      batch_size=batch_size, n_init=10,
                      max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0


### Plot Result

In :
# One colour per cluster, shared by the KMeans and MiniBatchKMeans panels.
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
#
# NOTE: the centers are used exactly as fitted. Passing them through
# np.sort(..., axis=0) would sort the x and y columns independently and so
# combine coordinates that belong to different centers.
k_means_cluster_centers = k_means.cluster_centers_
mbk_means_cluster_centers = mbk.cluster_centers_

# Label every sample with the index of its nearest center, per algorithm.
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
mbk_means_labels = pairwise_distances_argmin(X, mbk_means_cluster_centers)

# order[k] is the index of the MiniBatchKMeans center closest to KMeans
# center k; the plotting code uses it to give matching clusters one colour.
order = pairwise_distances_argmin(k_means_cluster_centers,
                                  mbk_means_cluster_centers)

# One row of three panels: KMeans, MiniBatchKMeans and their disagreement.
# Each title reports the training time and inertia of its own model (the
# KMeans title uses t_batch / k_means.inertia_, not the mini-batch values).
# '<br>' is used for every line break, consistent with the first break in
# each title, since Plotly text is HTML-like.
fig = tools.make_subplots(
    rows=1, cols=3,
    print_grid=False,
    subplot_titles=('KMeans<br>train time: %.2fs<br>inertia: %f' %
                    (t_batch, k_means.inertia_),
                    'MiniBatchKmeans<br>train time: %.2fs<br>inertia: %f' %
                    (t_mini_batch, mbk.inertia_),
                    'Difference'))


### K Means

In :
# Left panel (row 1, col 1): the KMeans result, one colour per cluster.
for k, col in zip(range(n_clusters), colors):
    # Boolean mask selecting the samples assigned to cluster k.
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]

    # Samples of this cluster as small markers.
    kmeans1 = go.Scatter(x=X[my_members, 0], y=X[my_members, 1],
                         showlegend=False,
                         mode='markers', marker=dict(color=col, size=4))

    # The cluster centre as a single larger, outlined marker. x takes the
    # centre's first coordinate and y its second; passing the whole centre
    # vector to both would plot the wrong point(s).
    kmeans2 = go.Scatter(x=[cluster_center[0]], y=[cluster_center[1]],
                         showlegend=False,
                         mode='markers', marker=dict(color=col, size=14,
                                                     line=dict(color='black',
                                                               width=1)))
    fig.append_trace(kmeans1, 1, 1)
    fig.append_trace(kmeans2, 1, 1)

# Strip tick labels, ticks, zero line and grid from the first panel's axes.
fig['layout']['xaxis1'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)
fig['layout']['yaxis1'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)


### MiniBatchKMeans

In :
# Middle panel (row 1, col 2): the MiniBatchKMeans result. order[k] maps
# KMeans cluster k to its closest MiniBatchKMeans cluster so that matching
# clusters share a colour across the two panels.
for k, col in zip(range(n_clusters), colors):
    # Boolean mask selecting the samples assigned to the paired cluster.
    my_members = mbk_means_labels == order[k]
    cluster_center = mbk_means_cluster_centers[order[k]]

    # Samples of this cluster as small markers.
    minibatchkmeans1 = go.Scatter(x=X[my_members, 0], y=X[my_members, 1],
                                  showlegend=False,
                                  mode='markers',
                                  marker=dict(color=col, size=4))

    # The cluster centre as a single larger, outlined marker. x takes the
    # centre's first coordinate and y its second; passing the whole centre
    # vector to both would plot the wrong point(s).
    minibatchkmeans2 = go.Scatter(x=[cluster_center[0]],
                                  y=[cluster_center[1]],
                                  showlegend=False,
                                  mode='markers',
                                  marker=dict(color=col, size=14,
                                              line=dict(color='black',
                                                        width=1)))
    fig.append_trace(minibatchkmeans1, 1, 2)
    fig.append_trace(minibatchkmeans2, 1, 2)

# Strip tick labels, ticks, zero line and grid from the second panel's axes.
fig['layout']['xaxis2'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)
fig['layout']['yaxis2'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)


### Difference

In :
# Right panel (row 1, col 3): samples the two algorithms label differently.

# Start with no sample flagged. The mask is built explicitly rather than by
# comparing the labels against a value that merely happens never to occur,
# which would silently break once that label exists.
different = np.zeros(len(mbk_means_labels), dtype=bool)

# A sample differs if, for any KMeans cluster k, it belongs to exactly one of
# "KMeans cluster k" and "the MiniBatchKMeans cluster paired with k".
for k in range(n_clusters):
    different |= ((k_means_labels == k) != (mbk_means_labels == order[k]))

identic = np.logical_not(different)

# Agreeing samples in grey.
difference1 = go.Scatter(x=X[identic, 0], y=X[identic, 1],
                         showlegend=False,
                         mode='markers', marker=dict(color='#bbbbbb', size=4))

# Disagreeing samples highlighted in magenta.
difference2 = go.Scatter(x=X[different, 0], y=X[different, 1],
                         showlegend=False,
                         mode='markers', marker=dict(color='magenta', size=4))

fig.append_trace(difference1, 1, 3)
fig.append_trace(difference2, 1, 3)

# Strip tick labels, ticks, zero line and grid from the third panel's axes.
fig['layout']['xaxis3'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)
fig['layout']['yaxis3'].update(showticklabels=False, ticks='',
                               zeroline=False, showgrid=False)

In :
# Display the assembled three-panel figure inline in the notebook.
# NOTE(review): `py` is plotly.plotly (the online client), so this presumably
# needs configured Plotly credentials -- confirm for your environment.
py.iplot(fig)

Out: 