Upload 6 files

by ewdlop - opened Oct 11, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+627

-0

Files changed (6) hide show

clustering.py +381 -0
decision_tree.py +24 -0
gradient_descent.py +13 -0
k-mean-clustering.py +81 -0
linear-classifier.py +81 -0
naïve-bayes-classifier.py +47 -0

clustering.py ADDED Viewed

	@@ -0,0 +1,381 @@

+import random
+import math
+import sys
+import statistics
def mahattan_distance_one_dimension(x1,x2):
    """Return the Manhattan distance between two scalars, i.e. |x1 - x2|."""
    difference = x1 - x2
    return abs(difference)
def minkowski_distance(x1, x2, power):
    """Return the Minkowski distance of order ``power`` between two points.

    Works for points of any dimension (the original only looked at the first
    two coordinates); coordinates are paired with ``zip``, so for 2-D points
    the result is identical to the previous implementation.

    Args:
        x1, x2: Sequences of numeric coordinates.
        power: Order of the norm (1 = Manhattan, 2 = Euclidean). Must be
            non-zero; ``power == 0`` raises ``ZeroDivisionError`` as before.

    Returns:
        The distance as a float.
    """
    powered_sum = sum(abs(a - b) ** power for a, b in zip(x1, x2))
    return powered_sum ** (1.0 / power)
def chebyshev_distance(x1, x2, power=0):
    """Return the Chebyshev (maximum-coordinate) distance between two points.

    Works for points of any dimension; for 2-D points the result is identical
    to the previous two-coordinate implementation.

    Args:
        x1, x2: Sequences of numeric coordinates.
        power: Ignored. Accepted only so this function has the same call
            signature as ``minkowski_distance``.

    Returns:
        The largest absolute coordinate difference.
    """
    return max(abs(a - b) for a, b in zip(x1, x2))
# Scalar samples clustered by k_median_clustering.
one_dimensional_data_points = [
    5, 13, 4, 6, 15,
    13, 32, 14, 6, 10,
    12, 31, 12, 41, 13,
]
# 2-D samples clustered by k_means_clustering and k_medoids_clustering.
data_points = [
    (1, 7), (1, -11), (3, 17), (7, 18), (-8, 4),
    (8, 12), (11, -7), (12, 14), (13, 71), (-16, 11),
    (13, 1), (-9, 2), (-5, 3), (0, 12),
]
def _assign_to_nearest(points, centers, distance):
    """Group ``points`` by the index of their nearest center.

    Clusters are kept in a list parallel to ``centers`` (not a dict keyed by
    the center value), so two centers with the same value cannot collapse
    into a single cluster.
    """
    clusters = [[] for _ in centers]
    for point in points:
        nearest_index = 0
        nearest_distance = float("inf")
        for index, center in enumerate(centers):
            d = distance(point, center)
            # "<=" keeps the original tie-break: the later center wins.
            if d <= nearest_distance:
                nearest_distance = d
                nearest_index = index
        clusters[nearest_index].append(point)
    return clusters


def _silhouette_rows(clusters, own_index, distance):
    """Return one ``{a, b, s}`` dict per member of ``clusters[own_index]``.

    ``a`` is the mean distance to the other members of the same cluster and
    ``b`` the smallest mean distance to any other non-empty cluster. True
    division is used so the averages are not truncated.
    """
    members = clusters[own_index]
    rows = []
    for m, xi in enumerate(members):
        intra_total = sum(distance(xi, xj) for j, xj in enumerate(members) if j != m)
        ai = intra_total / (len(members) - 1)
        bi = None
        for other_index, others in enumerate(clusters):
            if other_index == own_index or not others:
                continue
            average = sum(distance(xi, xj) for xj in others) / len(others)
            if bi is None or average < bi:
                bi = average
        # No comparable cluster, or all distances zero: score the point as 0
        # instead of dividing by zero / subtracting from None.
        if bi is None or max(ai, bi) == 0:
            si = 0.0
        else:
            si = float(bi - ai) / max(ai, bi)
        rows.append({
            "a{0}".format(m): ai,
            "b{0}".format(m): "Undefined" if bi is None else bi,
            "s{0}".format(m): si,
        })
    return rows


def _print_final_report(clusters, K, distance, headers, point_count):
    """Print every cluster with its silhouette rows and the average score."""
    silhouette_total = 0.0
    for index, members in enumerate(clusters):
        rows = []
        if len(members) == 1:
            rows.append({"a0": "Undefined", "b0": "Undefined", "s0": "0 by defintion"})
        elif K > 1:
            rows = _silhouette_rows(clusters, index, distance)
            for m, row in enumerate(rows):
                silhouette_total += row["s{0}".format(m)]
        print(headers[index])
        print(rows)
        print("------------------------------------------------")
    if K > 1:
        # Average over the points that were actually clustered.
        print("Average Silhouetee Coefficent:{0}".format(silhouette_total / point_count))
    else:
        print("Silhouetee Coefficent is not defined for K = 1")


def _run_clustering(K, points, distance, recenter, cost, header):
    """Shared assign/recenter loop behind the three public clustering functions.

    Args:
        K: Number of clusters (>= 1, at most ``len(points)``).
        points: Sequence of samples.
        distance: ``distance(a, b)`` between two samples/centers.
        recenter: Maps a non-empty member list to its new center.
        cost: ``cost(point, center)`` contribution to the printed error.
        header: ``header(number, members)`` -> first report line of a cluster.
    """
    if K < 1:
        raise ValueError("K must be at least 1")
    print("==============================")
    print("K: {0}".format(K))
    # pick K random points as the initial centers
    centers = random.sample(points, K)
    # guard against an infinite loop
    max_iterations = 1000000
    # convergence criterion: minimal decrease of the error between iterations
    min_decrease = 1e-5
    previous_error = float("inf")
    current_iteration = 0
    while True:
        current_iteration += 1
        clusters = _assign_to_nearest(points, centers, distance)
        # An empty cluster keeps its previous center instead of crashing.
        centers = [
            recenter(members) if members else center
            for members, center in zip(clusters, centers)
        ]
        current_error = 0.0
        for members, center in zip(clusters, centers):
            cluster_error = 0.0
            for point in members:
                cluster_error += cost(point, center)
            current_error += cluster_error
        if previous_error - current_error <= min_decrease or current_iteration > max_iterations:
            print("Final SSE: {0}".format(current_error))
            print("Final Iteration: {0}".format(current_iteration))
            headers = [
                header(number, members)
                for number, members in enumerate(clusters, start=1)
            ]
            _print_final_report(clusters, K, distance, headers, len(points))
            return
        print("Current SSE: {0}".format(current_error))
        previous_error = current_error


def k_means_clustering(K, func, power=0, points=None):
    """Run k-means and print the clusters, SSE and silhouette coefficients.

    Args:
        K: Number of clusters.
        func: Distance function called as ``func(a, b, power)``.
        power: Extra argument forwarded to ``func``.
        points: Sequence of coordinate tuples; defaults to the module-level
            ``data_points``.
    """
    if points is None:
        points = data_points

    def distance(a, b):
        return func(a, b, power)

    def centroid(members):
        count = len(members)
        return tuple(sum(axis, 0.0) / count for axis in zip(*members))

    def squared_error(point, center):
        return distance(point, center) ** 2

    def header(number, members):
        return "Cluster {0}: {1}".format(number, members)

    _run_clustering(K, points, distance, centroid, squared_error, header)


def k_median_clustering(K, points=None):
    """Run 1-D k-medians (absolute-difference distance) and print the result.

    Args:
        K: Number of clusters.
        points: Sequence of scalars; defaults to the module-level
            ``one_dimensional_data_points``.
    """
    if points is None:
        points = one_dimensional_data_points

    def distance(a, b):
        # one-dimensional Manhattan distance
        return abs(a - b)

    def header(number, members):
        median = statistics.median(members) if members else "Undefined"
        return "Cluster {0}: {1}, Median: {2}".format(number, members, median)

    _run_clustering(K, points, distance, statistics.median, distance, header)


def k_medoids_clustering(K, func, power=0, points=None):
    """Run k-medoids and print the clusters, SSE and silhouette coefficients.

    Args:
        K: Number of clusters.
        func: Distance function called as ``func(a, b, power)``.
        power: Extra argument forwarded to ``func``.
        points: Sequence of coordinate tuples; defaults to the module-level
            ``data_points``.
    """
    if points is None:
        points = data_points

    def distance(a, b):
        return func(a, b, power)

    def medoid(members):
        best = None
        best_total = float("inf")
        for candidate in members:
            total = sum((distance(candidate, other) for other in members), 0.0)
            # "<=" keeps the original tie-break: the later candidate wins.
            if total <= best_total:
                best_total = total
                best = candidate
        return best

    def squared_error(point, center):
        return distance(point, center) ** 2

    def header(number, members):
        return "Cluster {0}: {1}".format(number, members)

    _run_clustering(K, points, distance, medoid, squared_error, header)
def main(argv):
    """Run the selected clustering algorithm for K = 1 .. max K.

    Args:
        argv: Command-line arguments without the program name:
            ``argv[0]`` algorithm name (euclidean, minkowski, chebyshev,
            median, medoids), ``argv[1]`` largest K to try, and optionally
            ``argv[2]`` the Minkowski power (defaults to 2 when absent or not
            a plain non-negative number).
    """
    algo = {
        "euclidean": lambda k, power: k_means_clustering(k, minkowski_distance, 2),
        "minkowski": lambda k, power: k_means_clustering(k, minkowski_distance, power),
        "chebyshev": lambda k, power: k_means_clustering(k, chebyshev_distance),
        "median": lambda k, power: k_median_clustering(k),
        "medoids": lambda k, power: k_medoids_clustering(k, minkowski_distance, power),
    }
    if len(argv) < 2:
        print("Usage: <algorithm> <max K> [power]")
        return
    name = str(argv[0])
    if name not in algo:
        print("Unknown algorithm '{0}'; choose one of: {1}".format(name, ", ".join(sorted(algo))))
        return
    max_k = int(argv[1])
    # The power comes from the third argument; the original read argv[1]
    # (the K argument) here by mistake.
    has_power = len(argv) > 2 and argv[2].replace('.', '', 1).isdigit()
    power = float(argv[2]) if has_power else 2
    for k in range(1, max_k + 1):
        algo[name](k, power)
        print("===========================================================================")
# Script entry point: forward the command-line arguments (without the program
# name) to main(), or complain when none were given.
if __name__ == "__main__":
    if len(sys.argv) != 1:
        main(sys.argv[1:])
    else:
       print("Missing Arugments")

decision_tree.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from math import log2
def entropy(p,n):
    """Return the binary entropy (in bits) of ``p`` positives and ``n`` negatives.

    A pure set (either count is zero) has entropy 0.
    """
    if 0 in (p, n):
        return 0
    total = p + n
    positive_share = p / total
    negative_share = n / total
    return -positive_share * log2(positive_share) - negative_share * log2(negative_share)
def info_gain(hy,list_postive, list_negative):
    """Return the information gain of a two-way split.

    Assumes each list holds the 0/1 class labels of the samples that fell
    into that branch of the split (TODO confirm against callers).

    The original looped over ``range(len(...))`` and compared the *index* to
    0/1 instead of the label, and derived the second branch's counts from the
    first branch's counters; both are fixed here.

    Args:
        hy: Entropy of the parent set before the split.
        list_postive: Labels (0 or 1) in the first branch.
        list_negative: Labels (0 or 1) in the second branch.

    Returns:
        ``hy`` minus the size-weighted entropy of the two branches. With two
        empty branches there is nothing to weigh and ``hy`` is returned.
    """
    total = len(list_postive) + len(list_negative)
    if total == 0:
        return hy
    remainder = 0.0
    for branch in (list_postive, list_negative):
        positives = sum(1 for label in branch if label == 1)
        negatives = sum(1 for label in branch if label == 0)
        remainder += len(branch) / total * entropy(positives, negatives)
    return hy - remainder

gradient_descent.py ADDED Viewed

	@@ -0,0 +1,13 @@

def gradient_descent(X, Y, w, rate, iterations):
    """Fit linear weights with per-sample (stochastic) gradient descent.

    For every sample the prediction ``p = sum(w[j] * x[j])`` is computed and
    each weight is moved by ``rate * (y - p) * x[j]``.

    Args:
        X: List of feature rows (include a leading 1 for the bias term).
        Y: Target value for each row of ``X``.
        w: Initial weights, one per feature; not modified.
        rate: Learning rate.
        iterations: Number of full passes over the data.

    Returns:
        A new list with the fitted weights.
    """
    weights = list(w)
    for _ in range(iterations):
        for features, target in zip(X, Y):
            p = sum(wj * xj for wj, xj in zip(weights, features))
            delta = target - p
            for n in range(len(weights)):
                weights[n] += rate * delta * features[n]
    return weights


# Each row is [bias input, feature value].
X = [[1,3.04], [1,3.64], [1,4.61], [1,5.57], [1,6.74], [1,7.77]]
Y = [0.94,1.01,1.09,1.11,1.20,1.30]
w = [0,0]
iterations = 1000000
rate = 0.01
if __name__ == "__main__":
    w = gradient_descent(X, Y, w, rate, iterations)
    print("Final weights: {0}".format(w))

k-mean-clustering.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import random
+import math
+import sys
def distance(x1, x2):
    """Return the Euclidean distance between two 2-D points."""
    dx = abs(x1[0] - x2[0])
    dy = abs(x1[1] - x2[1])
    return (dx**2 + dy**2) ** 0.5
data_points = [(1,7),(1,11),(3,17),(7,18),(8,4),(8,12),(11,7),(12,14),(13,17),(16,11)]
# run 1-, 2-, ..., len(data_points)-means clustering
for K in range(1,len(data_points)+1):
    print("==============================")
    print("K: {0}".format(K))
    # pick K random points as the initial centroids
    C = random.sample(data_points,K)
    # guard against an infinite loop
    iteration = 1000000
    # convergence criterion: minimal decrease in the sum of squared errors
    min_decrease_sse = 1e-5
    # initial conditions for the comparison
    previous_sse = float("inf")
    current_iteration = 0
    while(True):
        current_iteration +=1
        current_sse = 0.0
        # One member list per centroid, parallel to C. A list (rather than a
        # dict keyed by centroid) cannot lose a cluster when two centroids
        # happen to coincide.
        clusters = [[] for _ in C]
        for x in data_points:
            # named best_distance so the builtin max() is no longer shadowed
            best_distance = float("inf")
            closest = 0
            # compute the distance from x to each centroid
            for index, c in enumerate(C):
                d = distance(x,c)
                if(d <= best_distance):
                    best_distance = d
                    closest = index
            # assign x to the closest centroid's cluster
            clusters[closest].append(x)
        # recompute the centroids from the current memberships
        new_C = []
        for old_c, members in zip(C, clusters):
            if not members:
                # an empty cluster keeps its centroid instead of dividing by zero
                new_C.append(old_c)
                continue
            new_c_x = sum(member[0] for member in members) / len(members)
            new_c_y = sum(member[1] for member in members) / len(members)
            new_C.append((new_c_x,new_c_y))
            # accumulate the sum of squared errors
            cm_total_distance = 0.0
            for x in members:
                cm_total_distance += distance(x,(new_c_x,new_c_y))**2
            current_sse += cm_total_distance
        C = new_C
        # stop once the SSE no longer decreases noticeably
        if(previous_sse - current_sse <= min_decrease_sse or current_iteration > iteration):
            print("Final SSE: {0}".format(current_sse))
            print("Final Iteration: {0}".format(current_iteration))
            for i, members in enumerate(clusters, start=1):
                print("Cluster {0}: {1}".format(i,members))
            break
        else:
            print("Current SSE: {0}".format(current_sse))
            previous_sse = current_sse

linear-classifier.py ADDED Viewed

	@@ -0,0 +1,81 @@

def linear_classifier(c1,c2, iteration, threshold):
    """Train a perceptron separating ``c1`` (w.x > 0) from ``c2`` (w.x <= 0).

    Args:
        c1: Samples of the positive class; each sample is a list of numbers.
        c2: Samples of the negative class.
        iteration: Used as the learning step. The original overwrote this
            argument with 0 and hard-coded the step to 0.6, while the calls
            in this file pass the step size (0.6 / 0.3) in this position.
        threshold: Maximum number of passes over the data before giving up.

    Returns:
        The final weight list (the original returned nothing).

    NOTE(review): only three weights are trained, so a fourth component of a
    sample is ignored, exactly as in the original. Confirm this is intended.
    """
    def dot(weights, sample):
        # zip stops at the shorter sequence, like the original index guard
        return sum(wi * xi for wi, xi in zip(weights, sample))

    step = iteration
    w = [1, 0, 0]
    total = len(c1) + len(c2)
    # number of consecutive correctly classified samples
    correct = 0
    passes = 0
    while(correct < total):
        passes += 1
        if(passes >= threshold):
            print("The dataset might be not linear separable")
            break
        for sign, samples in ((1, c1), (-1, c2)):
            for x in samples:
                wx = dot(w, x)
                misclassified = wx <= 0 if sign > 0 else wx > 0
                if misclassified:
                    for j in range(min(len(w), len(x))):
                        w[j] += sign * step * x[j]
                    # The weights changed, so nothing is verified any more.
                    # The original set this to 1 and could stop while the
                    # sample just updated was still misclassified.
                    correct = 0
                else:
                    correct += 1
    print("Iterations: {0}".format(passes))
    print("=======Final Weights========================")
    for i, wi in enumerate(w):
        print("w{0}: {1}".format(i,str(wi)))
    print("=======Final Dot Products=============")
    for x in c1:
        print(dot(w, x))
    print("======================================")
    for x in c2:
        print(dot(w, x))
    print("=======End=================================")
    return w
# Dataset 1: the two classes can be separated by a hyperplane.
print("=======Dataset 1==============================================")
c1 = [
    [1, 1, 3, 5],
    [1, 2, 3, 10],
    [1, 3, 5, 9],
]
c2 = [
    [1, -2, -1, -7],
    [1, -3, -3, -5],
    [1, -4, 4, -10],
]
linear_classifier(c1, c2, 0.6, 1000)

# Dataset 2: two samples swapped between the classes.
print("=======Dataset 2(not linear separable)==============================================")
c1 = [
    [1, 1, 3, 5],
    [1, -3, -3, -5],
    [1, 3, 5, 9],
]
c2 = [
    [1, -2, -1, -7],
    [1, 2, 3, 10],
    [1, -4, 4, -10],
]
linear_classifier(c1, c2, 0.3, 1000000)

naïve-bayes-classifier.py ADDED Viewed

	@@ -0,0 +1,47 @@

# Naive Bayes scores compared for the query (Bad, A, High):
#   p(Pass|Bad, A, High) = p(Pass) * p(Bad|Pass) * p(A|Pass) * p(High|Pass)
#   p(Fail|Bad, A, High) = p(Fail) * p(Bad|Fail) * p(A|Fail) * p(High|Fail)
# One training record per student.
dataset = [
    {"Assignment": assignment, "Project": project, "Exam": exam, "Label": label}
    for assignment, project, exam, label in (
        ("Good", "A", "High", "Pass"),
        ("Good", "B", "High", "Pass"),
        ("Bad", "B", "Low", "Fail"),
        ("Bad", "C", "High", "Fail"),
        ("Good", "C", "Low", "Fail"),
        ("Good", "C", "High", "Pass"),
        ("Bad", "B", "High", "Pass"),
        ("Good", "A", "Low", "Pass"),
        ("Bad", "A", "Low", "Fail"),
        ("Good", "B", "Low", "Pass"),
    )
]
#P(c=ci)
def prior(c, ci, data=None):
    """Return the fraction of records whose column ``c`` equals ``ci``.

    Args:
        c: Name of the class column.
        ci: Class value to count.
        data: List of record dicts; defaults to the module-level ``dataset``.

    Returns:
        The relative frequency as a float; 0.0 for an empty data set. A
        record that lacks the column is counted as a non-match.
    """
    rows = dataset if data is None else data
    if not rows:
        return 0.0
    count = 0.0
    for student in rows:
        value = student.get(c)
        if value is not None and value == ci:
            count += 1
    return count / len(rows)
#P(f=fi|c=ci)
def likelihood(f, fi, c, ci, data=None):
    """Return the share of class-``ci`` records whose feature ``f`` equals ``fi``.

    Args:
        f: Name of the feature column.
        fi: Feature value to count.
        c: Name of the class column.
        ci: Class value that selects the records.
        data: List of record dicts; defaults to the module-level ``dataset``.

    Returns:
        ``matching / records_in_class`` as a float, or ``None`` when no
        record belongs to class ``ci``.
    """
    rows = dataset if data is None else data
    c_count = 0.0
    f_count = 0.0
    for student in rows:
        label = student.get(c)
        if label is None or label != ci:
            continue
        c_count += 1
        value = student.get(f)
        if value is not None and value == fi:
            f_count += 1
    if c_count > 0.0:
        return f_count / c_count
    return None
#P(C=ci|f1,f2,f3,...fn) = p(ci) * p(f1|ci) * p(f2|ci) * ... * p(fn|ci)
def posterior(c,ci, feature_dictonary):
    """Return the unnormalised naive Bayes score of class ``ci``.

    The score is ``prior(c, ci)`` multiplied by the likelihood of every
    ``feature: value`` pair in ``feature_dictonary``. It is not divided by
    the evidence, so scores of different classes need not sum to 1.

    Args:
        c: Name of the class column.
        ci: Class value to score.
        feature_dictonary: Mapping of feature name to observed value.

    Returns:
        The score as a float; 0.0 when the class never occurs (``likelihood``
        returns ``None`` then, which used to raise ``TypeError``).
    """
    p_c = prior(c, ci)
    for feature, value in feature_dictonary.items():
        conditional = likelihood(feature, value, c, ci)
        if conditional is None:
            return 0.0
        p_c *= conditional
    return p_c
# Score both labels for the same observation and print them.
pass_score = posterior("Label", "Pass", {"Assignment": "Bad", "Project": "A", "Exam": "High"})
print("Probability for passing:{0}".format(pass_score))
fail_score = posterior("Label", "Fail", {"Assignment": "Bad", "Project": "A", "Exam": "High"})
print("Probability for failing:{0}".format(fail_score))