luismidv committed on
Commit
48e3a09
·
1 Parent(s): 44a2327

Db update

Browse files
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image keeps the final image small.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first so this layer stays cached when only the
# application source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY . .

EXPOSE 7860

# Start the server through the uvicorn CLI so that --host/--port are parsed by
# uvicorn itself and the app listens on all interfaces inside the container
# (binding to 127.0.0.1 would make the exposed port unreachable from outside).
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/datasetgenerator.cpython-311.pyc ADDED
Binary file (3.45 kB). View file
 
__pycache__/datasetgenerator.cpython-312.pyc ADDED
Binary file (2.86 kB). View file
 
__pycache__/resultview.cpython-311.pyc ADDED
Binary file (3.57 kB). View file
 
__pycache__/resultview.cpython-312.pyc ADDED
Binary file (2.18 kB). View file
 
__pycache__/similarity.cpython-311.pyc ADDED
Binary file (2.18 kB). View file
 
__pycache__/similarity.cpython-312.pyc ADDED
Binary file (2.15 kB). View file
 
app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ import resultview as rv
3
+ import uvicorn
4
+
5
+
6
app = FastAPI()


@app.post("/predict/")
async def predict(id: int):
    """Return the tenants most compatible with the tenant identified by ``id``.

    Args:
        id: Identifier of the tenant to find matches for. Declared as ``int``
            so FastAPI validates and converts the incoming query parameter.

    Returns:
        A JSON-serialisable list with one entry per recommended tenant, each
        holding the tenant's record and its similarity score.

    Raises:
        HTTPException: 404 when ``id`` is not present in the similarity data.
    """
    # Imported locally so this handler does not rely on the module import line.
    from fastapi import HTTPException

    try:
        tenant_list = rv.algo_start(id)
    except KeyError as err:
        raise HTTPException(
            status_code=404, detail=f"Unknown tenant id: {id}"
        ) from err

    # algo_start yields (pandas Series, float) pairs; turn them into plain
    # dicts so the response can be encoded as JSON.
    return [
        {"tenant": tenant.to_dict(), "similarity": similarity}
        for tenant, similarity in tenant_list
    ]
12
+
13
if __name__ == "__main__":
    # Accept --host/--port on the command line so launch scripts can choose
    # the bind address; the defaults keep the previous hard-coded values.
    import argparse

    parser = argparse.ArgumentParser(description="Run the tenant compatibility API.")
    parser.add_argument("--host", default="127.0.0.1", help="Interface to bind to.")
    parser.add_argument("--port", type=int, default=7860, help="TCP port to listen on.")
    args = parser.parse_args()

    uvicorn.run(app, host=args.host, port=args.port)
data/users_dataframe.csv ADDED
The diff for this file is too large to render. See raw diff
 
datasetgenerator.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from sklearn.preprocessing import OneHotEncoder
5
+
6
+
7
def list_creator(list, size=10000):
    """Draw ``size`` random elements, with replacement, from ``list``.

    Args:
        list: Sequence of options to sample from. The name shadows the builtin
            but is kept so existing keyword callers keep working.
        size: Number of elements to draw. Defaults to 10000, the value that
            was previously hard-coded.

    Returns:
        A new Python list of length ``size`` whose items all come from
        ``list``.

    Raises:
        ValueError: If ``list`` is empty (raised by ``np.random.randint``).
    """
    options = list
    # One vectorised draw of indices, then a plain lookup per index.
    picks = np.random.randint(0, len(options), size)
    return [options[i] for i in picks]
11
+
12
def data_generator():
    """Build a synthetic users dataset, save it as CSV and return it.

    Every column is sampled independently at random via ``list_creator``
    (ages via ``np.random.randint``). The frame is indexed by a 1-based
    ``id`` and written to ``data/users_dataframe.csv`` next to this module,
    so the result no longer depends on the current working directory.

    Returns:
        The generated ``pandas.DataFrame``.
    """
    from pathlib import Path

    # NOTE: a missing comma used to merge 'Louisa' and 'Loren' into a single
    # name through implicit string concatenation.
    name = ['John', 'Michael', 'Derek', 'Nick', 'Lucas', 'Jorge', 'George',
            'Miguel', 'Anthony', 'Antonio', 'Mario', 'Marie', 'Luna', 'Maria',
            'Albert', 'Louisa', 'Loren', 'Josephine']
    # Trailing spaces were removed: they leaked into the e-mail addresses.
    surname = ['Bush', 'Smith', 'Jones', 'Williams', 'Brown', 'Taylor',
               'Davies', 'Evans', 'Williams', 'Thomas', 'Johnson', 'Roberts',
               'Lee', 'Walker', 'Wright', 'Robinson', 'Thompson', 'White',
               'Hughes', 'Edwards']
    work_options = ['morning', 'night']
    morning_night = ['morning', 'night']
    studies_level = ['secondary', 'university']
    yes_no_questions = ['Yes', 'No']

    # The sampling order below is kept as it was so that a seeded NumPy RNG
    # consumes random numbers in the same sequence.
    name_list = list_creator(name)
    surname_list = list_creator(surname)
    n_rows = len(name_list)
    email_list = [
        f"{first}{last}@gmail.com"
        for first, last in zip(name_list, surname_list)
    ]
    age_list = np.random.randint(18, 35, n_rows)
    work_list = list_creator(work_options)
    morn_night_list = list_creator(morning_night)
    studies_list = list_creator(studies_level)
    pets_list = list_creator(yes_no_questions)
    cooking_list = list_creator(yes_no_questions)
    sport_list = list_creator(yes_no_questions)
    smoking_list = list_creator(yes_no_questions)
    organized_list = list_creator(yes_no_questions)

    users_dataframe = pd.DataFrame({
        'Names': name_list,
        'Surnames': surname_list,
        'Age': age_list,
        'Email': email_list,
        'Worktimes': work_list,
        'Schedules': morn_night_list,
        'Studies level': studies_list,
        'Pets': pets_list,
        'Cooking': cooking_list,
        'Sport': sport_list,
        'Smoking': smoking_list,
        'Organized': organized_list,
    })
    users_dataframe.index = pd.Index(np.arange(1, n_rows + 1), name="id")

    output_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    users_dataframe.to_csv(output_path)
    print(users_dataframe)
    return users_dataframe
44
+
45
if __name__ == "__main__":
    # Regenerate (and overwrite) the dataset only when this file is run as a
    # script, not as a side effect of importing the module.
    data_generator()
46
+
47
+
48
+
49
+
50
+
51
+
kmeans.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.preprocessing import OneHotEncoder
4
+ from sklearn.cluster import KMeans
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.pipeline import Pipeline
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.preprocessing import LabelEncoder
10
+ from scipy.spatial.distance import euclidean
11
+ import joblib
12
+
13
class k_means_algo():
    """Experimental K-Means helpers that operate on the users dataset."""

    def __init__(self):
        # Filled in by data_preparing(); stays None until that has run.
        self.result_data = None

    @staticmethod
    def _project_file(*parts):
        """Return a path located next to this module, independent of the cwd."""
        from pathlib import Path

        return Path(__file__).resolve().parent.joinpath(*parts)

    def data_preparing(self, person):
        """Load the users CSV, append one sample user and store the result.

        Args:
            person: Column names for the appended row; the sample values below
                are positional and follow this order.

        The combined frame, indexed by ``id``, is stored in
        ``self.result_data``.
        """
        # DATAFRAME WITH DATA
        print(pd.DataFrame(columns=person).columns)

        original_dataframe = pd.read_csv(
            self._project_file("data", "users_dataframe.csv")
        )
        print(original_dataframe.columns)

        # The new user gets the id following the last stored one.
        last_id = original_dataframe["id"].iloc[-1] + 1

        new_row_data = [last_id, "John", "Doe", 30, "john.doe@example.com", "Morning", "Night",
                        "Bachelor", "Yes", "Yes", "Yes", "No", "Yes"]
        new_row_dataframe = pd.DataFrame([new_row_data], columns=person)
        self.result_data = pd.concat(
            [original_dataframe, new_row_dataframe]
        ).set_index("id")

    def data_checking(self, dataframe):
        """Print, for each column of ``dataframe``, whether it has missing values."""
        for col in dataframe.columns:
            if dataframe[col].isnull().sum() > 0:
                print(f"Missing values in {col} column")
            else:
                print(f"No missing values in column {col}")

    def reshape_playground(self, data):
        """Scratch helper that prints the shape of ``data`` around a reshape."""
        print(f"Data shape {data.shape}")
        # NOTE(review): the reshaped value is not kept, so `data` is unchanged
        # and the second print shows the same shape again.
        data[50].reshape(17, 1)
        print(data.shape)

    def forward_algorithm(self, dataframe, cluster_spec):
        """Cluster ``dataframe`` into four groups and return the labels.

        Args:
            dataframe: Numeric feature matrix accepted by ``KMeans``.
            cluster_spec: Unused; kept so existing callers keep working.

        Returns:
            The cluster label assigned to every row.
        """
        kmeans = KMeans(n_clusters=4, random_state=42)
        result = kmeans.fit_predict(dataframe)
        print(result[1])
        print(result)
        return result

    def set_specific_cluster(self, dataframe, cluster_spec):
        """Print and return the distance from every point to a reference point.

        Args:
            dataframe: Indexable collection of numeric vectors; presumably a
                2-D NumPy array, iterated row by row.
            cluster_spec: Index of the reference point inside ``dataframe``.

        Returns:
            List of Euclidean distances, one per point.
        """
        reference = dataframe[cluster_spec]
        distances = [euclidean(reference, point) for point in dataframe]
        print(distances)
        return distances

    def _encoded_features(self):
        """One-hot encode ``self.result_data`` into a float feature matrix."""
        if self.result_data is None:
            raise RuntimeError("data_preparing() must be called before clustering")
        # Identifying text columns carry no clustering signal and would explode
        # into thousands of dummy columns.
        features = self.result_data.drop(
            columns=["Names", "Surnames", "Email"], errors="ignore"
        )
        return pd.get_dummies(features).to_numpy(dtype=float)

    def specific_cluster_kmeans(self, dataframe=None):
        """Fit a single-cluster K-Means initialised on the last row and save it.

        Args:
            dataframe: Optional numeric feature matrix. When omitted, the
                matrix is derived from ``self.result_data`` by one-hot
                encoding its categorical columns.

        Returns:
            The fitted ``KMeans`` model, also dumped to ``kmeans_model.pkl``
            next to this module.

        Raises:
            RuntimeError: If no ``dataframe`` is given and ``data_preparing``
                has not been called yet.
        """
        if dataframe is None:
            data = self._encoded_features()
        else:
            data = np.asarray(dataframe, dtype=float)

        # The most recently appended record is the initial centre; -1 lets the
        # width follow however many encoded features there are.
        cluster_spec = data[-1].reshape(1, -1)
        print(cluster_spec.shape)
        kmeans = KMeans(n_clusters=1, init=cluster_spec, n_init=1, random_state=42)
        kmeans.fit(data)
        joblib.dump(kmeans, self._project_file("kmeans_model.pkl"))
        return kmeans
67
+
68
+
69
+
70
# Column layout of one user record; data_preparing fills its sample row
# positionally in this order.
person = [
    "id",
    "Names",
    "Surnames",
    "Age",
    "Email",
    "Worktimes",
    "Schedules",
    "Studies level",
    "Pets",
    "Cooking",
    "Sport",
    "Smoking",
    "Organized",
]

# Build the helper and load the dataset with the sample record appended.
my_kmeans = k_means_algo()
my_kmeans.data_preparing(person)
74
+
75
+ #forward_algorithm(dataframe,50)
76
+ #set_specific_cluster(dataframe,50)
77
+ #my_kmeans.specific_cluster_kmeans()
kmeans_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe0628e084eabce82fe327dfaff97e43e7d91613a26669b53ea405734a696901
3
+ size 41063
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ scikit-learn
4
+ pandas
5
+ numpy
6
+ joblib
resultview.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ def tenant_visualization(similarity_matrix, requested_tenants):
4
+ #TODO VIEW COMPATIBILITY BETWEEN REQUESTED TENANTS
5
+ print(f"The options for visualizatio are:\n"+
6
+ "1.Compatibility for each tenants with requested ones\n"+
7
+ "2.Mean compatibility for each tenant\n"+
8
+ "3.Most compatible tenant for the requested ones\n"+
9
+ "4.Obtain registers from most compatible tenants\n")
10
+ option = int(input("What option are u choosing: "))
11
+ tenant_lines = similarity_matrix[requested_tenants].mean()
12
+ match(option):
13
+
14
+ case 1:
15
+ tenant_lines.head(5)
16
+ print(f"Compatibility for each tenants with requested ones \n{tenant_lines}")
17
+
18
+ case 2:
19
+ mean_compatibility = tenant_lines.mean(axis = 0)
20
+ print(f"Mean compatibility for each tenant:\n {mean_compatibility} ")
21
+
22
+ case 3:
23
+ most_compatible = tenant_lines.sort_values(ascending = False)
24
+ print(f"Max compatibility for each tenant:\n {most_compatible} ")
25
+
26
+ case 4:
27
+ most_compatible = tenant_lines.sort_values(ascending = False)
28
+ most_compatible = dataframe.loc[requested_tenants]
29
+ print(f"Most compatible tenants registers\n {most_compatible}")
30
+
31
def view_kmeans_results(results, cluster_center):
    """Show a scatter plot of the K-Means results.

    Args:
        results: Array-like with a ``shape`` attribute holding the values to
            plot.
        cluster_center: Currently unused.
    """
    # TODO FUNCTION TO VIEW KMEANS RESULTS
    # `plt` was never imported at module level; import it lazily so that
    # matplotlib is only required when a plot is actually requested.
    import matplotlib.pyplot as plt

    print(f"Starting kmeans viewing \n Cluster length: {results.shape}")
    # NOTE(review): the values are plotted against themselves — placeholder.
    plt.scatter(results, results)
    plt.show()
36
+
37
def tenant_inference(similarity_matrix, requested_tenants, dataframe, top_n=4):
    """Return the ``top_n`` tenants most compatible with the requested one.

    This is the function used at inference time.

    Args:
        similarity_matrix: Square DataFrame of tenant-to-tenant similarity.
        requested_tenants: Label of the tenant to find matches for.
        dataframe: Tenant records indexed like ``similarity_matrix``; must
            contain the 'Names', 'Age', 'Smoking' and 'Email' columns.
        top_n: How many matches to return. Defaults to 4.

    Returns:
        List of ``(record, similarity)`` tuples ordered from most to least
        compatible, where ``record`` is a Series with the tenant's 'Names',
        'Age', 'Smoking' and 'Email' and ``similarity`` is a float. Fields are
        read as ``tenant_list[i][0]['Names']``.
    """
    scores = similarity_matrix[requested_tenants]
    # A tenant is always maximally similar to itself, so leave it out of its
    # own recommendations.
    candidates = scores.drop(labels=requested_tenants, errors="ignore")
    # Sort first, then truncate: truncating before sorting would only ever
    # consider the first rows of the matrix.
    best_matches = candidates.sort_values(ascending=False).head(top_n)

    return [
        (dataframe.loc[tenant, ['Names', 'Age', 'Smoking', 'Email']], float(similarity))
        for tenant, similarity in best_matches.items()
    ]
52
+
53
def algo_start(id):
    """Run the whole similarity pipeline for one tenant.

    Loads the dataset, builds the similarity matrix rescaled to 0-100 and
    returns the best matches for ``id``.

    Args:
        id: Label of the tenant to find matches for.

    Returns:
        The list of ``(record, similarity)`` tuples produced by
        ``tenant_inference``.
    """
    # `sm` was never imported at module level, which made every call fail
    # with a NameError.
    import similarity as sm

    dataframe, original_dataframe = sm.data_preparing()
    # sm.data_checking(dataframe)
    similarity_matrix = sm.encoder_matrix(dataframe, min_range=0, max_range=100)
    return tenant_inference(similarity_matrix, id, original_dataframe)
60
+
61
+ #tenant_visualization(similarity_matrix, [20,40,50,18,15])
similarity.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.preprocessing import OneHotEncoder
3
+ import pandas as pd
4
+
5
def data_preparing(csv_path=None):
    """Load the users dataset and select the columns used for similarity.

    Args:
        csv_path: Optional path of the CSV to read. Defaults to
            ``data/users_dataframe.csv`` next to this module, so loading no
            longer depends on the current working directory.

    Returns:
        Tuple ``(dataframe, original_dataframe)``: the feature columns only,
        and the full dataset. Both are indexed by the tenant ``id`` so that
        lookups by id address the intended tenant rather than a row position.
    """
    from pathlib import Path

    if csv_path is None:
        csv_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"

    original_dataframe = pd.read_csv(csv_path)
    # Keep the "id" column and additionally use its values as row labels.
    original_dataframe.index = original_dataframe["id"].to_numpy()

    columns = ['Age', 'Worktimes', 'Schedules', 'Studies level', 'Pets', 'Cooking', 'Sport', 'Smoking', 'Organized']
    dataframe = original_dataframe[columns]
    return dataframe, original_dataframe
10
+
11
def data_checking(dataframe):
    """Print, for every column of ``dataframe``, whether it has missing values."""
    for column_name in dataframe.columns:
        has_missing = dataframe[column_name].isnull().sum() > 0
        message = (
            f"Missing values in {column_name} column"
            if has_missing
            else f"No missing values in column {column_name}"
        )
        print(message)
17
+
18
def encoder_matrix(dataframe, min_range, max_range):
    """Build a tenant-to-tenant similarity matrix rescaled to a given range.

    Every column of ``dataframe`` is one-hot encoded, so the dot product of
    two encoded rows counts the columns on which both tenants hold the same
    value. Those counts are then min-max rescaled to
    ``[min_range, max_range]``.

    Args:
        dataframe: Feature columns, one row per tenant.
        min_range: Value assigned to the lowest similarity found.
        max_range: Value assigned to the highest similarity found.

    Returns:
        Square DataFrame whose rows and columns both carry
        ``dataframe.index``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    data_encoded = encoder.fit_transform(dataframe)

    matriz_s = np.dot(data_encoded, data_encoded.T)

    min_original = np.min(matriz_s)
    max_original = np.max(matriz_s)
    span = max_original - min_original
    if span == 0:
        # Every pair is equally similar (e.g. one row, or identical rows);
        # avoid dividing by zero and report them all as the top of the range.
        matriz_reescalada = np.full_like(matriz_s, max_range, dtype=float)
    else:
        matriz_reescalada = ((matriz_s - min_original) / span) * (max_range - min_range) + min_range

    return pd.DataFrame(matriz_reescalada, index=dataframe.index, columns=dataframe.index)
31
+