Getting reliable K in K means clustering

Last updated on Oct 2, 2019 2 min read Machine Learning, Python

Here I share the code for deriving the number of clusters in the K-means clustering algorithm, as shown by Bhavesh Bhatt in his YouTube video.

import math
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns 
from sklearn.cluster import KMeans
import warnings

# Global seaborn plotting configuration applied to every figure below.
sns.set_color_codes()
sns.set_context("poster")
# NOTE(review): this suppresses every warning category for the whole run, so
# deprecation notices from the libraries used below will not be shown.
warnings.filterwarnings("ignore")

# Build a reproducible toy data set: three 2-D Gaussian blobs of 100 points
# each, sharing one covariance matrix but centred at different means.
np.random.seed(8)  # fixed seed so the samples are the same on every run
a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=100)
b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=100)
c = np.random.multivariate_normal([20, 30], [[3, 1], [1, 4]], size=100)
X = np.concatenate((a, b, c))  # stack the blobs row-wise into one array
# Parenthesised so the line is valid on Python 3 as well as Python 2.
print(X.shape)

## (300, 2)

# Scatter plot of the raw samples; both axes are pinned to the range -5..35.
fig = plt.figure(figsize=(15, 10))
plt.xlim(-5, 35)
plt.ylim(-5, 35)
# X.T yields the two coordinate columns, unpacked as the x and y arguments.
plt.scatter(*X.T, c='b', s=5)

# Fit K-means once per candidate cluster count and record the model's inertia
# (plotted below as the "Sum of squared distance").
dist_points_from_cluster_center = []
K = range(1, 10)
for no_of_clusters in K:
    k_model = KMeans(n_clusters=no_of_clusters)
    k_model.fit(X)
    dist_points_from_cluster_center.append(k_model.inertia_)

# Parenthesised so the line is valid on Python 3 as well as Python 2.
print(dist_points_from_cluster_center)

## [70403.82188589728, 27407.17034387677, 2311.4033586287333, 1976.5540367939961, 1643.8735323124279, 1337.0526214543424, 1166.12005389885, 1036.87136535438, 942.9775249201616]

# Elbow curve: recorded inertia against the number of clusters.
fig = plt.figure(figsize=(15, 10))
plt.plot(K, dist_points_from_cluster_center)
plt.xlabel("No. of clusters K")
plt.ylabel("Sum of squared distance")
plt.grid()

# Elbow curve again, with the straight line joining its first and last points
# overlaid in red.
fig = plt.figure(figsize=(15, 10))
plt.grid()
plt.plot(K, dist_points_from_cluster_center)
# Index the end point with -1 rather than a hard-coded 8 so the line still ends
# on the last point if the range of K is changed.
plt.plot([K[0], K[-1]],
         [dist_points_from_cluster_center[0],
          dist_points_from_cluster_center[-1]], 'ro-')
plt.xlabel("No. of clusters K")
plt.ylabel("Sum of squared distance")

# Function to find distance
# between a point and a line in 2-d

def calc_distance(x1,y1,a,b,c):
    """Return the perpendicular distance from a point to a line in 2-D.

    The point is (x1, y1) and the line is given in the general form
    a*x + b*y + c = 0.  Raises ZeroDivisionError when a and b are both
    zero, since such coefficients do not describe a line.
    """
    numerator = abs(a * x1 + b * y1 + c)
    denominator = math.sqrt(a * a + b * b)
    return numerator / denominator

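Solving the linear equation

The straight line joining the first and last points of the elbow curve is written in the general form ax + by + c = 0; the link below explains how to obtain a, b and c from two points.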

https://bobobobo.wordpress.com/2008/01/07/solving-linear-equations-ax-by-c-0/

# Coefficients of the line a*x + b*y + c = 0 passing through the first and last
# points of the elbow curve, i.e. (K[0], inertia[0]) and (K[-1], inertia[-1]).
# The last point is indexed with -1 rather than a hard-coded 8 so the result
# stays correct if the range of K is changed.
# NOTE: a, b and c rebind the names used for the sample arrays earlier on.
a = dist_points_from_cluster_center[0] - dist_points_from_cluster_center[-1]
b = K[-1] - K[0]
c1 = K[0] * dist_points_from_cluster_center[-1]
c2 = K[-1] * dist_points_from_cluster_center[0]
c = c1 - c2

# Distance of every elbow-curve point (k, inertia) from the line
# a*x + b*y + c = 0.  Pairing K with the inertia list directly replaces the
# hard-coded range(9), so every candidate is covered whatever the length of K.
distance_of_points_from_line = [
    calc_distance(k, inertia, a, b, c)
    for k, inertia in zip(K, dist_points_from_cluster_center)
]

# Distance of each elbow-curve point from the line, plotted against K.
fig = plt.figure(figsize=(15, 10))
plt.plot(K, distance_of_points_from_line)
plt.grid()

# The chosen k is the candidate whose elbow-curve point lies farthest from the
# line.  Look the value up in K instead of adding 1 to the index, so the answer
# stays correct if K does not start at 1.
best_index = distance_of_points_from_line.index(max(distance_of_points_from_line))
# Parenthesised so the line is valid on Python 3 as well as Python 2.
print("Optimum value of k = " + str(K[best_index]))

## Optimum value of k = 3

python Statistics

Puneet Sharma

Research Scholar

My research interests include cloud & aerosol modeling and statistics.

Getting reliable K in K means clustering

Solving linear equation

Puneet Sharma

Research Scholar

Related