# DBSCAN Clustering Algorithm in Scikit-learn

Finds core samples of high density and expands clusters from them.

### Version

In [1]:
import sklearn
sklearn.__version__

Out[1]:
'0.18'

### Imports

This tutorial imports DBSCAN, make_blobs and StandardScaler.

In [2]:
print(__doc__)

import plotly.plotly as py
import plotly.graph_objs as go

import matplotlib.pyplot as plt
import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

Automatically created module for IPython interactive environment


### Calculations

Generate sample data

In [3]:
# Three 2-D blob centres; make_blobs draws 750 points around them and also
# returns the generating blob index of each point as the ground-truth label.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(
    n_samples=750,
    centers=centers,
    cluster_std=0.4,
    random_state=0,  # fixed seed so the cell is reproducible on re-run
)

# Rescale the features with StandardScaler before clustering.
X = StandardScaler().fit_transform(X)


Compute DBSCAN

In [4]:
# Fit DBSCAN on the sample matrix and keep the per-sample cluster labels.
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_

# Boolean mask over all samples: True for the samples DBSCAN reports as core
# samples. The mask has to be filled in from `core_sample_indices_`; a mask
# left at all-False would classify every point as non-core downstream.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present (noise label is -1).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# Report the cluster count followed by the clustering-quality metrics.
# Every metric except the silhouette coefficient compares the predicted
# labels with the ground-truth labels.
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

Estimated number of clusters: 3
Homogeneity: 0.953
Completeness: 0.883
V-measure: 0.917
Adjusted Rand Index: 0.952
Adjusted Mutual Information: 0.883
Silhouette Coefficient: 0.626


### Plot Results

Convert a Matplotlib colormap to a Plotly colorscale

In [5]:
def matplotlib_to_plotly(cmap, pl_entries):
    """Sample a colormap into a Plotly-style colorscale.

    Parameters
    ----------
    cmap : callable
        Called with a float position and expected to return a sequence whose
        first three items are the red, green and blue components as floats
        (scaled here by 255), e.g. a Matplotlib colormap such as
        ``plt.cm.Spectral``.
    pl_entries : int
        Number of evenly spaced samples to take between 0 and 1 inclusive.

    Returns
    -------
    list
        ``pl_entries`` items of the form ``[position, 'rgb(r, g, b)']`` with
        integer colour components.
    """
    # With a single entry there is no interval to divide; sample position 0
    # instead of raising ZeroDivisionError.
    step = 1.0 / (pl_entries - 1) if pl_entries > 1 else 0.0
    pl_colorscale = []

    for k in range(pl_entries):
        position = k * step
        # Scale to 0-255 and truncate to 8-bit, then convert to plain Python
        # ints: `map()` is a lazy iterator on Python 3 (not indexable), and
        # NumPy scalars may not format as bare numbers inside a tuple.
        channels = (np.array(cmap(position)[:3]) * 255).astype(np.uint8)
        red, green, blue = (int(c) for c in channels)
        pl_colorscale.append([position, 'rgb' + str((red, green, blue))])

    return pl_colorscale

In [6]:
unique_labels = set(labels)

# One colorscale entry per distinct label; each entry is
# [position, 'rgb(r, g, b)'] and only the colour string is used below.
colors = matplotlib_to_plotly(plt.cm.Spectral, len(unique_labels))
data = []

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'black'
    else:
        col = col[1]

    class_member_mask = (labels == k)

    # Points of this label that are core samples.
    xy = X[class_member_mask & core_samples_mask]
    trace1 = go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='markers',
                        marker=dict(color=col, size=14,
                                    line=dict(color='black', width=1)))

    # Points of this label that are not core samples.
    # NOTE(review): both traces use the same marker size, so core and
    # non-core points look identical in the figure; consider a smaller
    # marker here if the distinction should be visible.
    xy = X[class_member_mask & ~core_samples_mask]
    trace2 = go.Scatter(x=xy[:, 0], y=xy[:, 1], mode='markers',
                        marker=dict(color=col, size=14,
                                    line=dict(color='black', width=1)))
    data.append(trace1)
    data.append(trace2)

layout = go.Layout(showlegend=False,
                   title='Estimated number of clusters: %d' % n_clusters_,
                   xaxis=dict(showgrid=False, zeroline=False),
                   yaxis=dict(showgrid=False, zeroline=False))
fig = go.Figure(data=data, layout=layout)

py.iplot(fig)

Out[6]:
