clustering.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import math
3
+ import sys
4
+ import statistics
5
+
6
def mahattan_distance_one_dimension(x1, x2):
    """Return the 1-D Manhattan distance (absolute difference) between two scalars."""
    diff = x1 - x2
    return diff if diff >= 0 else -diff
8
+
9
def minkowski_distance(x1, x2, power):
    """Return the Minkowski distance of order ``power`` between two 2-D points."""
    dx = abs(x1[0] - x2[0])
    dy = abs(x1[1] - x2[1])
    return (dx ** power + dy ** power) ** (1.0 / power)
11
+
12
def chebyshev_distance(x1, x2, power=0):
    """Return the Chebyshev (L-infinity) distance between two 2-D points.

    ``power`` is ignored; it exists so this function shares the signature of
    the other distance functions and can be passed interchangeably.
    """
    dx = abs(x1[0] - x2[0])
    dy = abs(x1[1] - x2[1])
    return dx if dx > dy else dy
14
+
15
# 1-D sample used by k_median_clustering (15 values, duplicates included).
one_dimensional_data_points = [5,13,4,6,15,13,32,14,6,10,12,31,12,41,13]
# 2-D sample used by k_means_clustering / k_medoids_clustering (14 points).
data_points = [(1,7),(1,-11),(3,17),(7,18),(-8,4),(8,12),(11,-7),(12,14),(13,71),(-16,11),(13,1),(-9,2),(-5,3),(0,12)]
17
+
18
def k_means_clustering(K, func, power=0):
    """Run K-means over the module-level ``data_points`` and print the result.

    K     -- number of clusters.
    func  -- distance function with signature ``func(p1, p2, power)``.
    power -- forwarded to ``func`` (e.g. the Minkowski order).

    Iterates until the sum of squared errors (SSE) stops decreasing by more
    than a small threshold, then prints each cluster, per-point silhouette
    statistics (a_i, b_i, s_i) and the average silhouette coefficient.
    """
    print("==============================")
    print("K: {0}".format(K))

    # pick K random data points as the initial centroids
    C = random.sample(data_points, K)

    # hard cap on iterations to prevent an infinite loop
    max_iterations = 1000000

    # convergence criterion: minimal decrease in the sum of squared errors
    min_decrease_sse = 1e-5

    # initial conditions for the convergence comparison
    previous_sse = float("inf")
    current_iteration = 0

    while True:
        current_iteration += 1
        current_sse = 0.0

        # cluster-membership dictionary keyed by centroid
        cms_dict = {c: [] for c in C}

        # assign every point to its nearest centroid
        for x in data_points:
            best_dist = float("inf")
            closest = ()
            for c in C:
                d = func(x, c, power)
                if d <= best_dist:
                    best_dist = d
                    closest = c
            cms_dict[closest].append(x)

        # recompute the centroids from the current memberships
        C = []
        for cm in cms_dict:
            members = cms_dict[cm]
            if not members:
                # an empty cluster keeps its previous centroid
                # (the original divided by zero here)
                C.append(cm)
                continue
            new_m_x = sum(p[0] for p in members) / len(members)
            new_m_y = sum(p[1] for p in members) / len(members)
            C.append((new_m_x, new_m_y))

            # accumulate this cluster's contribution to the SSE
            cm_total_distance = 0.0
            for x in members:
                cm_total_distance += func(x, (new_m_x, new_m_y), power) ** 2
            current_sse += cm_total_distance

        if previous_sse - current_sse <= min_decrease_sse or current_iteration > max_iterations:
            print("Final SSE: {0}".format(current_sse))
            print("Final Iteration: {0}".format(current_iteration))
            i = 0
            silhouetee_coefficent = 0.0
            # compute and report silhouette statistics per cluster
            for cm in cms_dict:
                i += 1
                cluster_stats = []
                if len(cms_dict[cm]) != 1:
                    if K > 1:
                        m = 0
                        for xi in cms_dict[cm]:
                            total_distance = 0
                            for xj in cms_dict[cm]:
                                if xi != xj:
                                    total_distance += func(xi, xj, power)
                            # mean intra-cluster distance; true division
                            # (the original's // floored the average)
                            ai = total_distance / (len(cms_dict[cm]) - 1)
                            bi = None
                            # b_i = smallest mean distance to another cluster
                            for cm2 in cms_dict:
                                if cm != cm2 and cms_dict[cm2]:
                                    total_distance = 0
                                    for xj in cms_dict[cm2]:
                                        total_distance += func(xi, xj, power)
                                    average = total_distance / len(cms_dict[cm2])
                                    if bi is None or average < bi:
                                        bi = average
                            si = float(bi - ai) / max(ai, bi)
                            silhouetee_coefficent += si
                            entry = {}
                            entry["a{0}".format(m)] = ai
                            entry["b{0}".format(m)] = bi
                            entry["s{0}".format(m)] = si
                            cluster_stats.append(entry)
                            m += 1
                else:
                    # a singleton cluster has silhouette 0 by definition
                    entry = {}
                    entry["a0"] = "Undefined"
                    entry["b0"] = "Undefined"
                    entry["s0"] = "0 by defintion"
                    cluster_stats.append(entry)
                print("Cluster {0}: {1}".format(i, cms_dict[cm]))
                print(cluster_stats)
                print("------------------------------------------------")
            if K > 1:
                print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent / len(data_points)))
            else:
                print("Silhouetee Coefficent is not defined for K = 1")
            break
        else:
            print("Current SSE: {0}".format(current_sse))
            previous_sse = current_sse
133
+
134
+
135
def k_median_clustering(K):
    """Run K-medians over the module-level ``one_dimensional_data_points``.

    K -- number of clusters.

    Uses the 1-D Manhattan distance, recomputes each cluster's center as the
    median of its members, and stops when the sum of errors (SE) stops
    decreasing; then prints clusters, silhouette statistics and the average
    silhouette coefficient.
    """
    print("==============================")
    print("K: {0}".format(K))

    # pick K random values as the initial medians
    # NOTE(review): the data contains duplicate values, so the K sampled
    # seeds may collide and yield fewer than K distinct clusters.
    C = random.sample(one_dimensional_data_points, K)

    # hard cap on iterations to prevent an infinite loop
    max_iterations = 1000000

    # convergence criterion: minimal decrease in the sum of errors
    min_decrease_se = 1e-5

    # initial conditions for the convergence comparison
    previous_se = float("inf")
    current_iteration = 0

    while True:
        current_iteration += 1
        current_se = 0.0

        # cluster-membership dictionary keyed by median
        cms_dict = {c: [] for c in C}

        # assign every value to its nearest median
        for x in one_dimensional_data_points:
            best_dist = float("inf")
            closest = None
            for c in C:
                d = mahattan_distance_one_dimension(x, c)
                if d <= best_dist:
                    best_dist = d
                    closest = c
            cms_dict[closest].append(x)

        # recompute the medians from the current memberships
        C = []
        for cm in cms_dict:
            members = cms_dict[cm]
            if not members:
                # an empty cluster keeps its previous median
                C.append(cm)
                continue
            new_m_x = statistics.median(members)
            C.append(new_m_x)

            # accumulate this cluster's contribution to the SE
            cm_total_distance = 0.0
            for x in members:
                cm_total_distance += mahattan_distance_one_dimension(x, new_m_x)
            current_se += cm_total_distance

        if previous_se - current_se <= min_decrease_se or current_iteration > max_iterations:
            print("Final SSE: {0}".format(current_se))
            print("Final Iteration: {0}".format(current_iteration))
            i = 0
            silhouetee_coefficent = 0.0
            # compute and report silhouette statistics per cluster
            for cm in cms_dict:
                i += 1
                if not cms_dict[cm]:
                    # skip empty clusters (statistics.median would raise)
                    continue
                cluster_stats = []
                if len(cms_dict[cm]) != 1:
                    if K > 1:
                        m = 0
                        for xi in cms_dict[cm]:
                            total_distance = 0
                            for xj in cms_dict[cm]:
                                if xi != xj:
                                    total_distance += mahattan_distance_one_dimension(xi, xj)
                            # mean intra-cluster distance; true division
                            # (the original's // floored the average)
                            ai = total_distance / (len(cms_dict[cm]) - 1)
                            bi = None
                            for cm2 in cms_dict:
                                if cm != cm2 and cms_dict[cm2]:
                                    total_distance = 0
                                    for xj in cms_dict[cm2]:
                                        total_distance += mahattan_distance_one_dimension(xi, xj)
                                    average = total_distance / len(cms_dict[cm2])
                                    if bi is None or average < bi:
                                        bi = average
                            si = float(bi - ai) / max(ai, bi)
                            silhouetee_coefficent += si
                            entry = {}
                            entry["a{0}".format(m)] = ai
                            entry["b{0}".format(m)] = bi
                            entry["s{0}".format(m)] = si
                            cluster_stats.append(entry)
                            m += 1
                else:
                    # a singleton cluster has silhouette 0 by definition
                    entry = {}
                    entry["a0"] = "Undefined"
                    entry["b0"] = "Undefined"
                    entry["s0"] = "0 by defintion"
                    cluster_stats.append(entry)
                print("Cluster {0}: {1}, Median: {2}".format(i, cms_dict[cm], statistics.median(cms_dict[cm])))
                print(cluster_stats)
                print("------------------------------------------------")
            if K > 1:
                # average over the data actually clustered (the original
                # divided by len(data_points), the 2-D set of 14 points,
                # instead of the 15-value 1-D set)
                print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent / len(one_dimensional_data_points)))
            else:
                print("Silhouetee Coefficent is not defined for K = 1")
            break
        else:
            print("Current SSE: {0}".format(current_se))
            previous_se = current_se
245
+
246
def k_medoids_clustering(K, func, power=0):
    """Run K-medoids over the module-level ``data_points`` and print the result.

    K     -- number of clusters.
    func  -- distance function with signature ``func(p1, p2, power)``.
    power -- forwarded to ``func`` (e.g. the Minkowski order).

    Each cluster center is the member point minimising the total distance to
    the rest of its cluster. Stops when the SSE stops decreasing, then prints
    clusters, silhouette statistics and the average silhouette coefficient.
    """
    print("==============================")
    print("K: {0}".format(K))

    # pick K random data points as the initial medoids
    C = random.sample(data_points, K)

    # hard cap on iterations to prevent an infinite loop
    max_iterations = 1000000

    # convergence criterion: minimal decrease in the sum of squared errors
    min_decrease_sse = 1e-5

    # initial conditions for the convergence comparison
    previous_sse = float("inf")
    current_iteration = 0

    while True:
        current_iteration += 1
        current_sse = 0.0

        # cluster-membership dictionary keyed by medoid
        cms_dict = {c: [] for c in C}

        # assign every point to its nearest medoid
        for x in data_points:
            best_dist = float("inf")
            closest = ()
            for c in C:
                d = func(x, c, power)
                if d <= best_dist:
                    best_dist = d
                    closest = c
            cms_dict[closest].append(x)

        # recompute the medoids: the member minimising total distance to
        # the rest of its cluster
        C = []
        for medoid in cms_dict:
            members = cms_dict[medoid]
            if not members:
                # an empty cluster keeps its previous medoid (the original
                # appended None, which crashed on the next assignment pass)
                C.append(medoid)
                continue
            best_total = float("inf")
            new_c = None
            for x1 in members:
                cm_total_distance = 0.0
                for x2 in members:
                    cm_total_distance += func(x1, x2, power)
                if cm_total_distance <= best_total:
                    best_total = cm_total_distance
                    new_c = x1
            C.append(new_c)

            # accumulate this cluster's contribution to the SSE
            cm_total_distance = 0.0
            for x in members:
                cm_total_distance += func(x, new_c, power) ** 2
            current_sse += cm_total_distance

        if previous_sse - current_sse <= min_decrease_sse or current_iteration > max_iterations:
            print("Final SSE: {0}".format(current_sse))
            print("Final Iteration: {0}".format(current_iteration))
            silhouetee_coefficent = 0.0
            i = 0
            # compute and report silhouette statistics per cluster
            for cm in cms_dict:
                i += 1
                cluster_stats = []
                if len(cms_dict[cm]) != 1:
                    if K > 1:
                        m = 0
                        for xi in cms_dict[cm]:
                            total_distance = 0
                            for xj in cms_dict[cm]:
                                if xi != xj:
                                    total_distance += func(xi, xj, power)
                            # mean intra-cluster distance; true division
                            # (the original's // floored the average)
                            ai = total_distance / (len(cms_dict[cm]) - 1)
                            bi = None
                            for cm2 in cms_dict:
                                if cm != cm2 and cms_dict[cm2]:
                                    total_distance = 0
                                    for xj in cms_dict[cm2]:
                                        total_distance += func(xi, xj, power)
                                    average = total_distance / len(cms_dict[cm2])
                                    if bi is None or average < bi:
                                        bi = average
                            si = float(bi - ai) / max(ai, bi)
                            silhouetee_coefficent += si
                            entry = {}
                            entry["a{0}".format(m)] = ai
                            entry["b{0}".format(m)] = bi
                            entry["s{0}".format(m)] = si
                            cluster_stats.append(entry)
                            m += 1
                else:
                    # a singleton cluster has silhouette 0 by definition
                    entry = {}
                    entry["a0"] = "Undefined"
                    entry["b0"] = "Undefined"
                    entry["s0"] = "0 by defintion"
                    cluster_stats.append(entry)
                print("Cluster {0}: {1}".format(i, cms_dict[cm]))
                print(cluster_stats)
                print("------------------------------------------------")
            if K > 1:
                print("Average Silhouetee Coefficent:{0}".format(silhouetee_coefficent / len(data_points)))
            else:
                print("Silhouetee Coefficent is not defined for K = 1")
            break

        else:
            print("Current SSE: {0}".format(current_sse))
            previous_sse = current_sse
363
+
364
+
365
def main(argv):
    """Dispatch a clustering run from command-line arguments.

    argv[0] -- algorithm name: euclidean / minkowski / chebyshev / median / medoids.
    argv[1] -- maximum K; the algorithm is run for every K in 1..argv[1].
    argv[2] -- optional numeric Minkowski power (defaults to 2).
    """
    algo = {
        "euclidean": lambda k, p: k_means_clustering(k, minkowski_distance, 2),
        "minkowski": lambda k, p: k_means_clustering(k, minkowski_distance, p),
        "chebyshev": lambda k, p: k_means_clustering(k, chebyshev_distance),
        "median": lambda k, p: k_median_clustering(k),
        "medoids": lambda k, p: k_medoids_clustering(k, minkowski_distance, p),
    }
    # use argv[2] as the power when a numeric one is supplied; the original
    # validated argv[2] but then mistakenly passed float(argv[1]) (the K count)
    power = float(argv[2]) if (len(argv) > 2 and argv[2].replace('.', '', 1).isdigit()) else 2
    for k in range(1, int(argv[1]) + 1):
        algo[str(argv[0])](k, power)
        print("===========================================================================")
376
+
377
if __name__ == "__main__":
    # expects: <algorithm> <max_K> [power]
    if len(sys.argv) == 1:
        # fixed user-facing typo ("Arugments")
        print("Missing Arguments")
    else:
        main(sys.argv[1:])
decision_tree.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from math import log2
2
+
3
def entropy(p, n):
    """Return the binary entropy (in bits) of a split with ``p`` positive
    and ``n`` negative examples; a pure split has entropy 0."""
    if p == 0 or n == 0:
        return 0
    total = p + n
    pp = p / total
    pn = n / total
    return -pp * log2(pp) - pn * log2(pn)
8
+
9
def info_gain(hy, list_postive, list_negative):
    """Return the information gain of a split.

    hy            -- entropy before the split.
    list_postive  -- 0/1 class labels of the examples in the first branch.
    list_negative -- 0/1 class labels of the examples in the second branch.

    Fixes two bugs in the original: it compared the loop *index* to 0/1
    instead of the list element, and accumulated p2/n2 from p1/n1.
    """
    p1 = n1 = p2 = n2 = 0
    for label in list_postive:
        if label == 1:
            p1 += 1
        elif label == 0:
            n1 += 1
    for label in list_negative:
        if label == 1:
            p2 += 1
        elif label == 0:
            n2 += 1
    total = len(list_postive) + len(list_negative)
    # weighted average of the branch entropies, subtracted from hy
    return hy - (len(list_postive) / total * entropy(p1, n1)
                 + len(list_negative) / total * entropy(p2, n2))
gradient_descent.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Fit a straight line y = w0 + w1*x by stochastic gradient descent.
# Each row of X is an augmented sample [1, x]; Y holds the targets.
# NOTE: the original script did not parse (missing comma / unbalanced
# brackets in X), accumulated into an undefined ``p``, never incremented
# its loop counter (infinite loop) and left the weight update as ``pass``.
X = [[1, 3.04], [1, 3.64], [1, 4.61], [1, 5.57], [1, 6.74], [1, 7.77]]
Y = [0.94, 1.01, 1.09, 1.11, 1.20, 1.30]
w = [0.0, 0.0]      # weights: [intercept, slope]
rate = 0.01         # learning rate
max_epochs = 10000  # bounded pass count (the original loop never terminated)

for epoch in range(max_epochs):
    for i in range(len(X)):
        # prediction for sample i (reset per sample)
        p = 0.0
        for j in range(len(X[i])):
            p += w[j] * X[i][j]
        delta = Y[i] - p
        # gradient step on every weight
        for n in range(len(w)):
            w[n] += rate * delta * X[i][n]
k-mean-clustering.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import math
3
+ import sys
4
+
5
def distance(x1, x2):
    """Return the Euclidean distance between two 2-D points."""
    dx = x1[0] - x2[0]
    dy = x1[1] - x2[1]
    return (dx * dx + dy * dy) ** 0.5
7
+
8
# 10 sample points to cluster
data_points = [(1,7),(1,11),(3,17),(7,18),(8,4),(8,12),(11,7),(12,14),(13,17),(16,11)]

#run all 1,2,3..,10-means clustering
for K in range(1, len(data_points) + 1):
    print("==============================")
    print("K: {0}".format(K))

    # pick K random data points as the initial centroids
    C = random.sample(data_points, K)

    # hard cap on iterations to prevent an infinite loop
    max_iterations = 1000000

    # convergence criterion: minimal decrease in the sum of squared errors
    min_decrease_sse = 1e-5

    # initial conditions for the convergence comparison
    previous_sse = float("inf")
    current_iteration = 0

    while True:
        current_iteration += 1
        current_sse = 0.0

        # cluster-membership dictionary keyed by centroid
        cms_dict = {c: [] for c in C}

        # assign every point to its nearest centroid
        for x in data_points:
            # ``best_dist`` replaces the original ``max``, which shadowed
            # the builtin (and actually tracked a minimum)
            best_dist = float("inf")
            closest = ()
            for c in C:
                d = distance(x, c)
                if d <= best_dist:
                    best_dist = d
                    closest = c
            cms_dict[closest].append(x)

        # recompute the centroids from the current memberships
        C = []
        for cm in cms_dict:
            members = cms_dict[cm]
            if not members:
                # an empty cluster keeps its previous centroid
                # (the original divided by zero here)
                C.append(cm)
                continue
            new_c_x = sum(p[0] for p in members) / len(members)
            new_c_y = sum(p[1] for p in members) / len(members)
            C.append((new_c_x, new_c_y))

            # accumulate this cluster's contribution to the SSE
            cm_total_distance = 0.0
            for x in members:
                cm_total_distance += distance(x, (new_c_x, new_c_y)) ** 2
            current_sse += cm_total_distance

        if previous_sse - current_sse <= min_decrease_sse or current_iteration > max_iterations:
            print("Final SSE: {0}".format(current_sse))
            print("Final Iteration: {0}".format(current_iteration))
            i = 0
            for cm in cms_dict:
                i += 1
                print("Cluster {0}: {1}".format(i, cms_dict[cm]))
            break
        else:
            print("Current SSE: {0}".format(current_sse))
            previous_sse = current_sse
linear-classifier.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def linear_classifier(c1, c2, step, threshold):
    """Train a perceptron separating ``c1`` (positive) from ``c2`` (negative).

    c1, c2    -- lists of augmented sample vectors whose first component is
                 the bias term 1; only the first ``len(w)`` components take
                 part in the dot product.
    step      -- learning rate per weight update. The original accepted this
                 argument (named ``iteration``) but immediately overwrote it
                 and hard-coded step = 0.6, ignoring the 0.6/0.3 the callers
                 pass; it is now honoured.
    threshold -- maximum number of passes before declaring the data
                 non-separable and giving up.

    Prints the iteration count, the final weights and each sample's final
    dot product. Returns None.
    """

    def _dot(weights, x):
        # dot product over the first len(weights) components of x
        return sum(wi * xi for wi, xi in zip(weights, x))

    w = [1, 0, 0]
    total = len(c1) + len(c2)
    correct = 0
    iteration = 0
    while correct < total:
        iteration += 1
        if iteration >= threshold:
            print("The dataset might be not linear separable")
            break
        for x in c1:
            if _dot(w, x) <= 0:
                # misclassified positive: move the boundary toward it
                for j in range(len(w)):
                    w[j] += step * x[j]
                # restart the run of consecutive correct classifications
                # (the original reset to 1, off by one)
                correct = 0
            else:
                correct += 1
        for x in c2:
            if _dot(w, x) > 0:
                # misclassified negative: move the boundary away from it
                for j in range(len(w)):
                    w[j] -= step * x[j]
                correct = 0
            else:
                correct += 1
    print("Iterations: {0}".format(iteration))
    print("=======Final Weights========================")
    for i, wi in enumerate(w):
        print("w{0}: {1}".format(i, str(wi)))
    print("=======Final Dot Products=============")
    for x in c1:
        print(_dot(w, x))
    print("======================================")
    for x in c2:
        print(_dot(w, x))
    print("=======End=================================")
72
+ print("=======End=================================")
73
+
74
# Demo: train the perceptron on a separable and a non-separable dataset.
print("=======Dataset 1==============================================")
c1 = [[1, 1, 3, 5], [1, 2, 3, 10], [1, 3, 5, 9]]
c2 = [[1, -2, -1, -7], [1, -3, -3, -5], [1, -4, 4, -10]]
linear_classifier(c1, c2, 0.6, 1000)

print("=======Dataset 2(not linear separable)==============================================")
c1 = [[1, 1, 3, 5], [1, -3, -3, -5], [1, 3, 5, 9]]
c2 = [[1, -2, -1, -7], [1, 2, 3, 10], [1, -4, 4, -10]]
linear_classifier(c1, c2, 0.3, 1000000)
naïve-bayes-classifier.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #p(Pass|Bad, A, High) = p(Pass) * p(Bad|Pass) * p(A|Pass) * p(High|Pass)
2
+ #p(Fail|Bad, A, High) = p(Fail) * p(Bad|Fail) * p(A|Fail) * p(High|Fail)
3
+
4
# Toy training set: one record per student with three categorical
# features (Assignment, Project, Exam) and the Pass/Fail class label.
dataset = [
    {"Assignment": "Good", "Project": "A", "Exam": "High", "Label": "Pass"},
    {"Assignment": "Good", "Project": "B", "Exam": "High", "Label": "Pass"},
    {"Assignment": "Bad", "Project": "B", "Exam": "Low", "Label": "Fail"},
    {"Assignment": "Bad", "Project": "C", "Exam": "High", "Label": "Fail"},
    {"Assignment": "Good", "Project": "C", "Exam": "Low", "Label": "Fail"},
    {"Assignment": "Good", "Project": "C", "Exam": "High", "Label": "Pass"},
    {"Assignment": "Bad", "Project": "B", "Exam": "High", "Label": "Pass"},
    {"Assignment": "Good", "Project": "A", "Exam": "Low", "Label": "Pass"},
    {"Assignment": "Bad", "Project": "A", "Exam": "Low", "Label": "Fail"},
    {"Assignment": "Good", "Project": "B", "Exam": "Low", "Label": "Pass"}
]
16
+
17
+ #P(c=ci)
18
def prior(c, ci, records=None):
    """Estimate the prior probability P(c == ci) from ``records``.

    c       -- the attribute name to inspect (e.g. "Label").
    ci      -- the attribute value whose frequency is returned.
    records -- optional list of record dicts; defaults to the module-level
               ``dataset`` so existing callers are unchanged.
    """
    if records is None:
        records = dataset
    total = len(records)
    count = 0.0
    for student in records:
        if student[c] is not None and student[c] == ci:
            count += 1
    return count / total
25
+
26
+ #P(f=fi|c=ci)
27
def likelihood(f, fi, c, ci, records=None):
    """Estimate the conditional probability P(f == fi | c == ci).

    f, fi   -- feature name and value.
    c, ci   -- class attribute name and value to condition on.
    records -- optional list of record dicts; defaults to the module-level
               ``dataset`` so existing callers are unchanged.

    Returns None when no record has c == ci (probability undefined).
    """
    if records is None:
        records = dataset
    c_count = 0.0
    f_count = 0.0
    for student in records:
        if student[c] is not None and student[c] == ci:
            if student[f] is not None and student[f] == fi:
                f_count += 1
            c_count += 1
    if c_count > 0.0:
        return f_count / c_count
    return None
38
+
39
+ #P(C=ci|f1,f2,f3,...fn) = p(ci) * p(f1|ci) * p(f2|ci) * ... * p(fn|ci)
40
def posterior(c, ci, feature_dictonary):
    """Unnormalised naive-Bayes posterior for class ``ci``: the prior
    multiplied by the likelihood of every observed feature value."""
    score = prior(c, ci)
    for feature, value in feature_dictonary.items():
        score *= likelihood(feature, value, c, ci)
    return score
45
+
46
# Score one unseen student under each class label.
query = {"Assignment": "Bad", "Project": "A", "Exam": "High"}
print("Probability for passing:{0}".format(posterior("Label", "Pass", query)))
print("Probability for failing:{0}".format(posterior("Label", "Fail", query)))