Authors: Alexandre Gramfort, Joseph Salmon
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
np.random.seed(42)
digits = load_digits()
X = digits.data
y = digits.target
n_samples, n_features = X.shape
n_digits = len(np.unique(y))
sample_size = 300
print("n_digits: %d, \t n_samples %d, \t n_features %d"
% (n_digits, n_samples, n_features))
digit = X[0]  # first sample: a flattened 8x8 image, i.e. a vector of length 64
plt.imshow(digit.reshape(8, 8), cmap=plt.cm.gray, interpolation="nearest")
digit.shape
# Cluster the 64 pixel intensities of a single image into 2 groups
# (one sample per pixel): k-means acts here as a quantizer of the image
km = KMeans(n_clusters=2)
km.fit(digit.reshape(64, 1))
plt.imshow(km.labels_.reshape(8, 8), cmap=plt.cm.gray, interpolation="nearest")
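With a single 1-D feature per pixel, the two cluster centers are just two intensity levels, so this clustering amounts to binarizing the image at the midpoint between the centers. A minimal sketch of that equivalence (assuming the km model fitted just above; the 0/1 label order may be flipped):

# For 1-D k-means with 2 clusters, assigning each pixel to the nearest
# center is the same as thresholding at the midpoint of the two centers
c0, c1 = np.sort(km.cluster_centers_.ravel())
binarized = (digit > (c0 + c1) / 2).astype(int)
plt.imshow(binarized.reshape(8, 8), cmap=plt.cm.gray, interpolation="nearest")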
# Cluster the full dataset into 10 classes with k-means++ initialization
# (one cluster per expected digit)
km = KMeans(init='k-means++', n_clusters=10)
km.fit(X)
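The cluster indices returned by k-means are arbitrary and do not correspond to digit values. A small illustrative sketch (not part of the original analysis) that maps each cluster to the majority true digit among its members and measures the resulting accuracy:

# Map each cluster index to the most frequent true digit among its members,
# then score the relabeled predictions against y (illustrative only)
mapping = np.array([np.bincount(y[km.labels_ == c]).argmax() for c in range(10)])
print("Majority-vote accuracy: %.3f" % np.mean(mapping[km.labels_] == y))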
# Function to display a list of images
def disp_pics(pic_list, title=''):
    """Display a list of images encoded as vectors."""
    fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(12, 4))
    plt.suptitle(title, fontsize=16)
    for i in range(10):
        opt = dict(cmap='gray', aspect='equal', interpolation='nearest')
        axs.flat[i].imshow(pic_list[i].reshape(8, 8), **opt)
        axs.flat[i].set_title("Image: " + str(i))
    # Work around matplotlib's imperfect default layout
    plt.tight_layout()
    plt.subplots_adjust(top=0.85)
# Display the class centers (centroids) obtained by k-means
disp_pics(km.cluster_centers_, title=u"Visualization of all the centroids")
centers_with_known_labels = [np.mean(X[y == cls], axis=0) for cls in range(10)]
# Display the per-class mean images, computed with the true labels
disp_pics(centers_with_known_labels, title=u"Per-class means over the whole dataset")
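The centroids resemble the per-class means but come in an arbitrary order. A quick sketch, assuming scipy is available, pairing each centroid with its nearest class mean:

from scipy.spatial.distance import cdist

# Euclidean distance from each k-means centroid to each per-class mean;
# the row-wise argmin tells which digit each centroid most resembles
dists = cdist(km.cluster_centers_, np.array(centers_with_known_labels))
print("Nearest class mean for each centroid:", dists.argmin(axis=1))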
from time import time
from sklearn import metrics
from sklearn.decomposition import PCA
sns.set(style="white")
# Project the data to 2D with PCA for visualization
X_2d = PCA(n_components=2).fit_transform(X)
plt.plot(X_2d[:, 0], X_2d[:, 1], 'k.', markersize=8)
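Two principal components capture only part of the variance of the 64 pixel features, so this 2-D view is a rough summary. A small check (a sketch, refitting PCA so as to keep the model object):

# Refit PCA keeping the estimator so we can inspect the variance retained
# by the 2-D projection (fit_transform above discarded the model)
pca = PCA(n_components=2).fit(X)
print("explained variance ratio: %s (total: %.2f)"
      % (pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum()))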
# Same scatter plot, this time colored by the true digit label
sns.set_palette("hls", 10)
for k in range(10):
    Xk_2d = X_2d[y == k]
    plt.plot(Xk_2d[:, 0], Xk_2d[:, 1], '.', markersize=8, label=k)
plt.legend(numpoints=1, loc=1, bbox_to_anchor=(1.2, 0.7), markerscale=3)
# Fit k-means on the full 2D projection (not on Xk_2d, which only holds
# the last class from the loop above)
km = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
km.fit(X_2d)
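The metrics module imported above can quantify how well these 2-D clusters recover the true digit labels. This sketch follows the scikit-learn digits clustering example and uses the sample_size defined earlier to subsample the silhouette computation:

# Supervised scores compare the cluster assignments with the true labels y;
# the silhouette score only needs the data and is subsampled for speed
t0 = time()
print("inertia: %.0f" % km.inertia_)
print("homogeneity: %.3f" % metrics.homogeneity_score(y, km.labels_))
print("completeness: %.3f" % metrics.completeness_score(y, km.labels_))
print("v-measure: %.3f" % metrics.v_measure_score(y, km.labels_))
print("adjusted Rand index: %.3f" % metrics.adjusted_rand_score(y, km.labels_))
print("silhouette: %.3f" % metrics.silhouette_score(X_2d, km.labels_,
                                                    metric='euclidean',
                                                    sample_size=sample_size))
print("computed in %.2fs" % (time() - t0))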
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh.
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Obtain labels for each point in mesh. Use last trained model.
Z = km.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
extent=(xx.min(), xx.max(), yy.min(), yy.max()),
cmap=plt.cm.Paired,
aspect='auto', origin='lower')
plt.plot(X_2d[:, 0], X_2d[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = km.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='x', s=169, linewidths=3,
color='w', zorder=10)
plt.title(u"K-means clustering sur digits en 2D après PCA-reduced data)\n"
u"Centres marqués d'une croix blanche")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
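Here n_clusters = 10 is known from the problem. When it is not, a common heuristic is to plot the inertia as a function of the number of clusters and look for an "elbow"; a minimal sketch:

# Inertia (within-cluster sum of squares) always decreases with k;
# a bend ("elbow") in the curve suggests a reasonable number of clusters
ks = range(2, 16)
inertias = [KMeans(n_clusters=k, n_init=10).fit(X).inertia_ for k in ks]
plt.figure()
plt.plot(list(ks), inertias, 'o-')
plt.xlabel("n_clusters")
plt.ylabel("inertia")
plt.show()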