Getting a reliable K in K-means clustering
Here I share the code for deriving the number of clusters for the K-means clustering algorithm, as shown by Bhavesh Bhatt in his YouTube video.
import math
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import warnings
# Plot styling; all warnings are silenced for cleaner output.
sns.set_color_codes()
sns.set_context("poster")
warnings.filterwarnings("ignore")
# Fixed seed so the synthetic sample below is reproducible.
np.random.seed(8)
# Three 2-D Gaussian blobs of 100 points each; all share one covariance
# matrix and differ only in their means.
a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=100)
b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=100)
c = np.random.multivariate_normal([20, 30], [[3, 1], [1, 4]], size=100)
X = np.concatenate((a, b, c))
# print(...) with a single argument behaves the same on Python 2 and 3.
print(X.shape)
## (300, 2)
# Scatter plot of the raw, unlabelled points.
fig = plt.figure(figsize=(15, 10))
plt.xlim(-5, 35)
plt.ylim(-5, 35)
plt.scatter(X[:, 0], X[:, 1], c='b', s=5)
# Fit one K-means model per candidate cluster count and record its inertia_
# (plotted further down as "Sum of squared distance").
dist_points_from_cluster_center = []
K = range(1, 10)
for no_of_clusters in K:
    k_model = KMeans(n_clusters=no_of_clusters)
    k_model.fit(X)
    dist_points_from_cluster_center.append(k_model.inertia_)
print(dist_points_from_cluster_center)
## [70403.82188589728, 27407.17034387677, 2311.4033586287333, 1976.5540367939961, 1643.8735323124279, 1337.0526214543424, 1166.12005389885, 1036.87136535438, 942.9775249201616]
# Elbow curve: recorded sum of squared distances for each candidate K.
fig = plt.figure(figsize=(15, 10))
elbow_axes = fig.gca()
elbow_axes.grid()
elbow_axes.plot(K, dist_points_from_cluster_center)
elbow_axes.set_xlabel("No. of clusters K")
elbow_axes.set_ylabel("Sum of squared distance")
# Elbow curve again, overlaid with the straight chord (red) that joins its
# first and last points.
fig = plt.figure(figsize=(15, 10))
plt.grid()
plt.plot(K, dist_points_from_cluster_center)
# Index the end point with -1 rather than a literal 8 so the chord still ends
# on the last point if the range of K is changed.
plt.plot(
    [K[0], K[-1]],
    [dist_points_from_cluster_center[0], dist_points_from_cluster_center[-1]],
    'ro-')
plt.xlabel("No. of clusters K")
plt.ylabel("Sum of squared distance")
# Function to find distance
# between a point and a line in 2-d
def calc_distance(x1, y1, a, b, c):
    """Return the perpendicular distance from a point to a line in 2-D.

    The line is given in general form ``a*x + b*y + c = 0``.

    Args:
        x1: x coordinate of the point.
        y1: y coordinate of the point.
        a: coefficient of x in the line equation.
        b: coefficient of y in the line equation.
        c: constant term of the line equation.

    Returns:
        The non-negative distance ``|a*x1 + b*y1 + c| / sqrt(a^2 + b^2)``.

    Raises:
        ZeroDivisionError: if ``a`` and ``b`` are both zero (built-in
            numeric types), since the coefficients then describe no line.
    """
    # math.hypot computes sqrt(a*a + b*b) without the intermediate squares
    # overflowing for large coefficients.
    return abs(a * x1 + b * y1 + c) / math.hypot(a, b)
Solving the linear equation ax + by + c = 0 for the line through the first and last points of the elbow curve:
https://bobobobo.wordpress.com/2008/01/07/solving-linear-equations-ax-by-c-0/
# Coefficients of the chord a*x + b*y + c = 0 through the first and last
# points of the elbow curve, (K[0], d[0]) and (K[-1], d[-1]), where d is
# dist_points_from_cluster_center:
#     a = d[0] - d[-1],  b = K[-1] - K[0],  c = K[0]*d[-1] - K[-1]*d[0]
# NOTE: a, b and c are rebound here; earlier in the file they held the
# sample arrays.
# The last point is indexed with -1 (not a literal 8) so the computation
# follows K if its range is changed.
a = dist_points_from_cluster_center[0] - dist_points_from_cluster_center[-1]
b = K[-1] - K[0]
c1 = K[0] * dist_points_from_cluster_center[-1]
c2 = K[-1] * dist_points_from_cluster_center[0]
c = c1 - c2
# Perpendicular distance of every elbow-curve point from that chord.
distance_of_points_from_line = [
    calc_distance(k, dist, a, b, c)
    for k, dist in zip(K, dist_points_from_cluster_center)
]
# Plot each point's distance from the chord; the K with the largest distance
# is reported as the optimum.
fig = plt.figure(figsize=(15, 10))
plt.grid()
plt.plot(K, distance_of_points_from_line)
# Look the winning K up by position instead of assuming K starts at 1
# (the original used index + 1).
best_index = distance_of_points_from_line.index(max(distance_of_points_from_line))
print("Optimum value of k = " + str(K[best_index]))
## Optimum value of k = 3