| | import random
|
| | import math
|
| | import sys
|
| | import statistics
|
| |
|
| | def mahattan_distance_one_dimension(x1,x2):
|
| | return abs(x1-x2)
|
| |
|
| | def minkowski_distance(x1, x2, power):
|
| | return ((abs(x1[0] - x2[0]))**power + (abs(x1[1] - x2[1])**power)) **(1.0/power)
|
| |
|
| | def chebyshev_distance(x1, x2, power=0):
|
| | return max(abs(x1[0]-x2[0]),abs(x1[1]-x2[1]))
|
| |
|
| | one_dimensional_data_points = [5,13,4,6,15,13,32,14,6,10,12,31,12,41,13]
|
| | data_points = [(1,7),(1,-11),(3,17),(7,18),(-8,4),(8,12),(11,-7),(12,14),(13,71),(-16,11),(13,1),(-9,2),(-5,3),(0,12)]
|
| |
|
| | def k_means_clustering(K,func,power=0):
|
| | print("==============================")
|
| | print("K: {0}".format(K))
|
| |
|
| |
|
| | C = random.sample(data_points,K)
|
| |
|
| |
|
| | iteration = 1000000
|
| |
|
| |
|
| | min_decrease_sse = 1e-5
|
| |
|
| |
|
| | previous_sse = float("inf")
|
| | current_iteration = 0
|
| |
|
| | while(True):
|
| | current_iteration +=1
|
| | current_sse = 0.0
|
| |
|
| |
|
| | cms_dict = {}
|
| | for c in C:
|
| | cms_dict[c] = []
|
| |
|
| | for x in data_points:
|
| | max_dist = float("inf")
|
| | closest = ()
|
| |
|
| | for c in C:
|
| | d= func(x,c,power)
|
| | if(d <= max_dist):
|
| | max_dist = d
|
| | closest = c
|
| |
|
| |
|
| | cms_dict[closest].append(x)
|
| |
|
| |
|
| | C = []
|
| | for cm in cms_dict:
|
| | cm_total_distance = 0.0
|
| | new_m_x = 0.0
|
| | new_m_y = 0.0
|
| |
|
| |
|
| | for x in cms_dict[cm]:
|
| | new_m_x += x[0]
|
| | new_m_y += x[1]
|
| | new_m_x /= len(cms_dict[cm])
|
| | new_m_y /= len(cms_dict[cm])
|
| | C.append((new_m_x,new_m_y))
|
| |
|
| |
|
| | for x in cms_dict[cm]:
|
| | cm_total_distance += func(x,(new_m_x,new_m_y),power)**2
|
| | current_sse += cm_total_distance
|
| |
|
| |
|
| | if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
|
| | print("Final SSE: {0}".format(current_sse))
|
| | print("Final Iteration: {0}".format(current_iteration))
|
| | i = 0
|
| | silhouetee_coefficent = 0.0
|
| |
|
| | for cm in cms_dict:
|
| | i+=1
|
| | abs = []
|
| | if(len(cms_dict[cm]) != 1):
|
| | if(K > 1):
|
| | m = 0
|
| | for xi in cms_dict[cm]:
|
| | total_distance = 0
|
| | for xj in cms_dict[cm]:
|
| | if(xi != xj):
|
| | total_distance += func(xi,xj,power)
|
| | ai = total_distance//(len(cms_dict[cm])-1)
|
| | bi = None
|
| | for cm2 in cms_dict:
|
| | if( cm !=cm2 ):
|
| | total_distance = 0
|
| | for xj in cms_dict[cm2]:
|
| | total_distance += func(xi,xj,power)
|
| | average = total_distance//len(cms_dict[cm2])
|
| | if(bi is None):
|
| | bi = average
|
| | else:
|
| | if(average < bi ):
|
| | bi = average
|
| | si = float(bi - ai) / max(ai,bi)
|
| | silhouetee_coefficent += si
|
| | dict ={}
|
| | dict["a{0}".format(m)] = ai
|
| | dict["b{0}".format(m)] = bi
|
| | dict["s{0}".format(m)] = si
|
| | abs.append(dict)
|
| | m+=1
|
| | else:
|
| | dict ={}
|
| | dict["a0"] = "Undefined"
|
| | dict["b0"] = "Undefined"
|
| | dict["s0"] = "0 by defintion"
|
| | abs.append(dict)
|
| | print("Cluster {0}: {1}".format(i,cms_dict[cm]))
|
| | print(abs)
|
| | print("------------------------------------------------")
|
| | if( K > 1):
|
| | print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent/len(data_points)))
|
| | else:
|
| | print("Silhouetee Coefficent is not defined for K = 1")
|
| | break
|
| | else:
|
| | print("Current SSE: {0}".format(current_sse))
|
| | previous_sse = current_sse
|
| |
|
| |
|
| | def k_median_clustering(K):
|
| | print("==============================")
|
| | print("K: {0}".format(K))
|
| |
|
| |
|
| | C = random.sample(one_dimensional_data_points,K)
|
| |
|
| |
|
| | iteration = 1000000
|
| |
|
| |
|
| | min_decrease_se = 1e-5
|
| |
|
| |
|
| | previous_se = float("inf")
|
| | current_iteration = 0
|
| |
|
| | while(True):
|
| | current_iteration +=1
|
| | current_se = 0.0
|
| |
|
| |
|
| | cms_dict = {}
|
| | for c in C:
|
| | cms_dict[c] = []
|
| |
|
| | for x in one_dimensional_data_points:
|
| | max_dist = float("inf")
|
| | closest = None
|
| |
|
| | for c in C:
|
| | d = mahattan_distance_one_dimension(x,c,)
|
| | if(d <= max_dist):
|
| | max_dist = d
|
| | closest = c
|
| |
|
| |
|
| | cms_dict[closest].append(x)
|
| |
|
| |
|
| | C = []
|
| | for cm in cms_dict:
|
| | cm_total_distance = 0.0
|
| | new_m_x = 0.0
|
| |
|
| |
|
| | new_m_x = statistics.median(cms_dict[cm])
|
| | C.append(new_m_x)
|
| |
|
| |
|
| | for x in cms_dict[cm]:
|
| | cm_total_distance += mahattan_distance_one_dimension(x,(new_m_x))
|
| | current_se += cm_total_distance
|
| |
|
| |
|
| | if(previous_se - current_se <= min_decrease_se or current_iteration > iteration):
|
| | print("Final SSE: {0}".format(current_se))
|
| | print("Final Iteration: {0}".format(current_iteration))
|
| | i = 0
|
| | silhouetee_coefficent = 0.0
|
| |
|
| | for cm in cms_dict:
|
| | i+=1
|
| | abs = []
|
| | if(len(cms_dict[cm]) != 1):
|
| | if(K > 1):
|
| | m = 0
|
| | for xi in cms_dict[cm]:
|
| | total_distance = 0
|
| | for xj in cms_dict[cm]:
|
| | if(xi != xj):
|
| | total_distance += mahattan_distance_one_dimension(xi,xj)
|
| | ai = total_distance//(len(cms_dict[cm])-1)
|
| | bi = None
|
| | for cm2 in cms_dict:
|
| | if( cm !=cm2 ):
|
| | total_distance = 0
|
| | for xj in cms_dict[cm2]:
|
| | total_distance += mahattan_distance_one_dimension(xi,xj)
|
| | average = total_distance//len(cms_dict[cm2])
|
| | if(bi is None):
|
| | bi = average
|
| | else:
|
| | if(average < bi ):
|
| | bi = average
|
| | si = float(bi - ai) / max(ai,bi)
|
| | silhouetee_coefficent += si
|
| | dict ={}
|
| | dict["a{0}".format(m)] = ai
|
| | dict["b{0}".format(m)] = bi
|
| | dict["s{0}".format(m)] = si
|
| | abs.append(dict)
|
| | m+=1
|
| | else:
|
| | dict ={}
|
| | dict["a0"] = "Undefined"
|
| | dict["b0"] = "Undefined"
|
| | dict["s0"] = "0 by defintion"
|
| | abs.append(dict)
|
| | print("Cluster {0}: {1}, Median: {2}".format(i,cms_dict[cm],statistics.median(cms_dict[cm])))
|
| | print(abs)
|
| | print("------------------------------------------------")
|
| | if( K > 1):
|
| | print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent / len(data_points)))
|
| | else:
|
| | print("Silhouetee Coefficent is not defined for K = 1")
|
| | break
|
| | else:
|
| | print("Current SSE: {0}".format(current_se))
|
| | previous_se = current_se
|
| |
|
| | def k_medoids_clustering(K,func,power=0):
|
| | print("==============================")
|
| | print("K: {0}".format(K))
|
| |
|
| |
|
| | C = random.sample(data_points,K)
|
| |
|
| |
|
| | iteration = 1000000
|
| |
|
| |
|
| | min_decrease_sse = 1e-5
|
| |
|
| |
|
| | previous_sse = float("inf")
|
| | current_iteration = 0
|
| |
|
| | while(True):
|
| | current_iteration +=1
|
| | current_sse = 0.0
|
| |
|
| |
|
| | cms_dict = {}
|
| | for c in C:
|
| | cms_dict[c] = []
|
| |
|
| | for x in data_points:
|
| | max_dist = float("inf")
|
| | closest = ()
|
| |
|
| | for c in C:
|
| | d= func(x,c,power)
|
| | if(d <= max_dist):
|
| | max_dist = d
|
| | closest = c
|
| |
|
| |
|
| | cms_dict[closest].append(x)
|
| |
|
| |
|
| | C = []
|
| | for m in cms_dict:
|
| | max_dist = float("inf")
|
| | new_c = None
|
| |
|
| | for x1 in cms_dict[m]:
|
| | cm_total_distance = 0.0
|
| | for x2 in cms_dict[m]:
|
| | cm_total_distance += func(x1, x2, power)
|
| | if(cm_total_distance <= max_dist):
|
| | max_dist = cm_total_distance
|
| | new_c = x1
|
| | C.append(new_c)
|
| |
|
| |
|
| | cm_total_distance = 0.0
|
| | for x in cms_dict[m]:
|
| | cm_total_distance += func(x,new_c,power)**2
|
| | current_sse += cm_total_distance
|
| |
|
| |
|
| | if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
|
| | print("Final SSE: {0}".format(current_sse))
|
| | print("Final Iteration: {0}".format(current_iteration))
|
| | silhouetee_coefficent = 0.0
|
| | i = 0
|
| |
|
| | for cm in cms_dict:
|
| | i+=1
|
| | abs = []
|
| | if(len(cms_dict[cm]) != 1):
|
| | if(K > 1):
|
| | m = 0
|
| | for xi in cms_dict[cm]:
|
| | total_distance = 0
|
| | for xj in cms_dict[cm]:
|
| | if(xi != xj):
|
| | total_distance += func(xi,xj,power)
|
| | ai = total_distance//(len(cms_dict[cm])-1)
|
| | bi = None
|
| | for cm2 in cms_dict:
|
| | if( cm !=cm2 ):
|
| | total_distance = 0
|
| | for xj in cms_dict[cm2]:
|
| | total_distance += func(xi,xj,power)
|
| | average = total_distance//len(cms_dict[cm2])
|
| | if(bi is None):
|
| | bi = average
|
| | else:
|
| | if(average < bi ):
|
| | bi = average
|
| | si = float(bi - ai) / max(ai,bi)
|
| | silhouetee_coefficent += si
|
| | dict ={}
|
| | dict["a{0}".format(m)] = ai
|
| | dict["b{0}".format(m)] = bi
|
| | dict["s{0}".format(m)] = si
|
| | abs.append(dict)
|
| | m+=1
|
| | else:
|
| | dict ={}
|
| | dict["a0"] = "Undefined"
|
| | dict["b0"] = "Undefined"
|
| | dict["s0"] = "0 by defintion"
|
| | abs.append(dict)
|
| | print("Cluster {0}: {1}".format(i,cms_dict[cm]))
|
| | print(abs)
|
| | print("------------------------------------------------")
|
| | if( K > 1):
|
| | print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent/len(data_points)))
|
| | else:
|
| | print("Silhouetee Coefficent is not defined for K = 1")
|
| | break
|
| |
|
| | else:
|
| | print("Current SSE: {0}".format(current_sse))
|
| | previous_sse = current_sse
|
| |
|
| |
|
| | def main(argv):
|
| | algo = {
|
| | "euclidean": lambda k,pow: k_means_clustering(k,minkowski_distance,2),
|
| | "minkowski": lambda k,pow: k_means_clustering(k,minkowski_distance,pow),
|
| | "chebyshev": lambda k,pow,: k_means_clustering(k,chebyshev_distance),
|
| | "median": lambda k,pow: k_median_clustering(k),
|
| | "medoids": lambda k,pow: k_medoids_clustering(k,minkowski_distance,pow)
|
| | }
|
| | for k in range(1,int(argv[1])+1):
|
| | algo[str(argv[0])](k,float(argv[1]) if (len(argv) > 2 and argv[2].replace('.','',1).isdigit()) else 2 )
|
| | print("===========================================================================")
|
| |
|
| | if __name__ == "__main__":
|
| | if(len(sys.argv) == 1):
|
| | print("Missing Arugments")
|
| | else:
|
| | main(sys.argv[1:]) |