Spaces:

luismidv
/

MLSystemTFG

Sleeping

App Files Files Community

luismidv committed on Mar 7, 2025

Commit

48e3a09

1 Parent(s): 44a2327

Db update

Browse files

Files changed (15) hide show

Dockerfile +13 -0
__pycache__/datasetgenerator.cpython-311.pyc +0 -0
__pycache__/datasetgenerator.cpython-312.pyc +0 -0
__pycache__/resultview.cpython-311.pyc +0 -0
__pycache__/resultview.cpython-312.pyc +0 -0
__pycache__/similarity.cpython-311.pyc +0 -0
__pycache__/similarity.cpython-312.pyc +0 -0
app.py +14 -0
data/users_dataframe.csv +0 -0
datasetgenerator.py +51 -0
kmeans.py +77 -0
kmeans_model.pkl +3 -0
requirements.txt +6 -0
resultview.py +61 -0
similarity.py +31 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

# Container image for the FastAPI tenant-matching service.
FROM python:3.11-slim
WORKDIR /app

# Install dependencies first so this layer stays cached until
# requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r /app/requirements.txt

# Copy the application code and data.
COPY . .

# Port the service listens on.
EXPOSE 7860

# Start the server through the uvicorn CLI so --host/--port are honoured.
# app.py does not parse command-line arguments, so passing them to
# `python app.py` had no effect and the server stayed bound to 127.0.0.1,
# which is unreachable from outside the container.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

__pycache__/datasetgenerator.cpython-311.pyc ADDED Viewed

Binary file (3.45 kB). View file

__pycache__/datasetgenerator.cpython-312.pyc ADDED Viewed

Binary file (2.86 kB). View file

__pycache__/resultview.cpython-311.pyc ADDED Viewed

Binary file (3.57 kB). View file

__pycache__/resultview.cpython-312.pyc ADDED Viewed

Binary file (2.18 kB). View file

__pycache__/similarity.cpython-311.pyc ADDED Viewed

Binary file (2.18 kB). View file

__pycache__/similarity.cpython-312.pyc ADDED Viewed

Binary file (2.15 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from fastapi import FastAPI
+import resultview as rv
+import uvicorn
app = FastAPI()


# The decorator form is required: calling app.post(...) as a bare expression
# builds the decorator and discards it, so the route is never registered.
@app.post("/predict/")
async def predict(id: int):
    """Return the tenants most compatible with tenant ``id``.

    Args:
        id: Identifier of the requested tenant. Annotated as ``int`` so the
            query parameter is parsed into a number before it is used as a
            label in the similarity matrix.

    Returns:
        A list of ``{"tenant": {...}, "similarity": float}`` dictionaries,
        one per recommended tenant.
    """
    tenant_list = rv.algo_start(id)
    # Assumes each entry is a (pandas.Series, float) tuple as built by
    # resultview.tenant_inference; the Series is converted to a plain dict so
    # the response can be serialised as JSON.
    return [
        {"tenant": tenant.to_dict(), "similarity": similarity}
        for tenant, similarity in tenant_list
    ]


if __name__ == "__main__":
    # Bind to all interfaces so the server is reachable when run in a container.
    uvicorn.run(app, host="0.0.0.0", port=7860)

data/users_dataframe.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

datasetgenerator.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import pandas as pd
+import numpy as np
+from  sklearn.preprocessing import OneHotEncoder
def list_creator(list, size=10000):
    """Draw ``size`` random elements, with replacement, from ``list``.

    Args:
        list: Non-empty sequence of options to sample from. The parameter
            name shadows the builtin; it is kept for backward compatibility.
        size: Number of elements to draw. Defaults to 10000, the value that
            was previously hard-coded.

    Returns:
        A new Python list holding ``size`` elements picked uniformly at random.

    Raises:
        ValueError: Raised by ``np.random.randint`` when ``list`` is empty.
    """
    options = list  # alias so the body does not read like the builtin
    picks = np.random.randint(0, len(options), size)
    return [options[i] for i in picks]
def data_generator(output_path=None):
    """Generate a synthetic tenant dataset, save it as CSV and return it.

    Every column is sampled independently and uniformly from a small set of
    options; ages are drawn from ``np.random.randint(18, 35, ...)``.

    Args:
        output_path: Where to write the CSV. Defaults to
            ``data/users_dataframe.csv`` next to this module, so the result
            does not depend on the current working directory.

    Returns:
        The generated ``pandas.DataFrame``, indexed by ``id`` starting at 1.
    """
    from pathlib import Path

    # Fixed: a missing comma used to fuse 'Louisa' and 'Loren' into the single
    # name 'LouisaLoren'.
    name = [
        'John', 'Michael', 'Derek', 'Nick', 'Lucas', 'Jorge', 'George',
        'Miguel', 'Anthony', 'Antonio', 'Mario', 'Marie', 'Luna', 'Maria',
        'Albert', 'Louisa', 'Loren', 'Josephine',
    ]
    # Fixed: trailing spaces in several surnames leaked into the e-mail
    # addresses (e.g. "JohnDavies @gmail.com").
    surname = [
        'Bush', 'Smith', 'Jones', 'Williams', 'Brown', 'Taylor', 'Davies',
        'Evans', 'Williams', 'Thomas', 'Johnson', 'Roberts', 'Lee', 'Walker',
        'Wright', 'Robinson', 'Thompson', 'White', 'Hughes', 'Edwards',
    ]
    work_options = ['morning', 'night']
    morning_night = ['morning', 'night']
    studies_level = ['secondary', 'university']
    yes_no_questions = ['Yes', 'No']

    name_list = list_creator(name)
    surname_list = list_creator(surname)
    # Derive the row count from the sampled data instead of repeating 10000.
    n_users = len(name_list)
    email_list = [
        f"{first}{last}@gmail.com"
        for first, last in zip(name_list, surname_list)
    ]
    age_list = np.random.randint(18, 35, n_users)

    users_dataframe = pd.DataFrame(
        {
            'Names': name_list,
            'Surnames': surname_list,
            'Age': age_list,
            'Email': email_list,
            'Worktimes': list_creator(work_options),
            'Schedules': list_creator(morning_night),
            'Studies level': list_creator(studies_level),
            'Pets': list_creator(yes_no_questions),
            'Cooking': list_creator(yes_no_questions),
            'Sport': list_creator(yes_no_questions),
            'Smoking': list_creator(yes_no_questions),
            'Organized': list_creator(yes_no_questions),
        },
        index=pd.Index(np.arange(1, n_users + 1), name="id"),
    )

    if output_path is None:
        output_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    users_dataframe.to_csv(output_path)

    print(users_dataframe)
    return users_dataframe
if __name__ == "__main__":
    # Regenerate the dataset only when run as a script. Calling this at
    # import time would overwrite the CSV with fresh random data every time
    # the module is imported.
    data_generator()

kmeans.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import pandas as pd
+import numpy as np
+from  sklearn.preprocessing import OneHotEncoder
+from sklearn.cluster import KMeans
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import LabelEncoder
+from scipy.spatial.distance import euclidean
+import joblib
class k_means_algo():
    """Experimental K-Means helpers for the tenant dataset.

    ``data_preparing`` loads the CSV and appends a placeholder person; the
    remaining methods are exploratory utilities that operate either on a
    NumPy feature matrix or on the frame stored in ``self.result_data``.
    """

    def __init__(self):
        # Filled in by data_preparing(); stays None until that has run.
        self.result_data = None

    def data_preparing(self, person, csv_path=None):
        """Load the users CSV and append one placeholder row.

        Args:
            person: Column names for the appended row; must include ``"id"``.
            csv_path: CSV to read. Defaults to ``data/users_dataframe.csv``
                next to this module so the lookup does not depend on the
                current working directory.

        Returns:
            The combined frame indexed by ``id`` (also stored in
            ``self.result_data``).
        """
        from pathlib import Path

        if csv_path is None:
            csv_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"
        original_dataframe = pd.read_csv(csv_path)
        print(original_dataframe.columns)

        # The new person gets the id following the last one in the file.
        last_id = original_dataframe["id"].iloc[-1] + 1
        # NOTE(review): placeholder values; "Morning"/"Night"/"Bachelor" do not
        # match the lower-case categories produced by the dataset generator.
        new_row_data = [last_id, "John", "Doe", 30, "john.doe@example.com", "Morning", "Night",
                        "Bachelor", "Yes", "Yes", "Yes", "No", "Yes"]
        new_row_dataframe = pd.DataFrame([new_row_data], columns=person)
        self.result_data = pd.concat([original_dataframe, new_row_dataframe]).set_index("id")
        return self.result_data

    def data_checking(self, dataframe):
        """Print, for every column, whether it contains missing values."""
        for col in dataframe.columns:
            if dataframe[col].isnull().sum() > 0:
                print(f"Missing values in {col} column")
            else:
                print(f"No missing values in column {col}")

    def reshape_playground(self, data):
        """Reshape row 50 of ``data`` into a column vector and return it.

        The previous version discarded the reshaped array and printed the
        unchanged input shape twice.
        """
        print(f"Data shape {data.shape}")
        column = data[50].reshape(-1, 1)
        print(column.shape)
        return column

    def forward_algorithm(self, dataframe, cluster_spec):
        """Cluster ``dataframe`` into four groups and return the labels.

        ``cluster_spec`` is accepted for backward compatibility but is not
        used; the original overwrote it and never read it.
        """
        kmeans = KMeans(n_clusters=4, random_state=42)
        result = kmeans.fit_predict(dataframe)
        print(result[1])
        print(result)
        return result

    def set_specific_cluster(self, dataframe, cluster_spec):
        """Return the Euclidean distance from row ``cluster_spec`` to every row.

        Args:
            dataframe: Sequence of numeric vectors; iterating it must yield
                rows, as a 2-D NumPy array does.
            cluster_spec: Index of the reference row.
        """
        reference = dataframe[cluster_spec]
        distances = [euclidean(reference, point) for point in dataframe]
        print(distances)
        return distances

    def specific_cluster_kmeans(self, model_path=None):
        """Fit a one-cluster K-Means seeded with the last person and save it.

        Fixes over the previous version: ``"id"`` is the index rather than a
        column after ``data_preparing``, the fitted data was an undefined
        name, and ``.reshape`` was called on a pandas object.

        Args:
            model_path: Destination of the pickled model. Defaults to
                ``kmeans_model.pkl`` next to this module.

        Returns:
            The fitted ``KMeans`` estimator.

        Raises:
            RuntimeError: If ``data_preparing`` has not been called yet.
        """
        from pathlib import Path

        if self.result_data is None:
            raise RuntimeError("call data_preparing() before specific_cluster_kmeans()")
        print(self.result_data)

        # Presumably identity columns are not meant to be clustered on;
        # everything else is one-hot encoded (numeric columns pass through).
        features = self.result_data.drop(columns=["Names", "Surnames", "Email"], errors="ignore")
        encoded = pd.get_dummies(features).to_numpy(dtype=float)

        # The last row is the person appended by data_preparing().
        cluster_spec = encoded[-1].reshape(1, -1)
        print(cluster_spec.shape)

        kmeans = KMeans(n_clusters=1, init=cluster_spec, n_init=1, random_state=42)
        kmeans.fit(encoded)

        if model_path is None:
            model_path = Path(__file__).resolve().parent / "kmeans_model.pkl"
        joblib.dump(kmeans, model_path)
        return kmeans
# Column layout of one user record, in CSV order.
person = ["id","Names","Surnames","Age","Email","Worktimes","Schedules","Studies level","Pets","Cooking","Sport","Smoking","Organized"]

if __name__ == "__main__":
    # Run the experiment only when executed as a script, so importing this
    # module does not read the dataset as a side effect.
    my_kmeans = k_means_algo()
    my_kmeans.data_preparing(person)
    #forward_algorithm(dataframe,50)
    #set_specific_cluster(dataframe,50)
    #my_kmeans.specific_cluster_kmeans()

kmeans_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe0628e084eabce82fe327dfaff97e43e7d91613a26669b53ea405734a696901
+size 41063

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+fastapi
+uvicorn[standard]
+scikit-learn
+pandas
+numpy
+joblib

resultview.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import pandas as pd
def tenant_visualization(similarity_matrix, requested_tenants, dataframe=None, option=None):
    """Print one of several compatibility views and return what was shown.

    Args:
        similarity_matrix: Square DataFrame of pairwise compatibilities.
        requested_tenants: List of column labels of the tenants of interest.
        dataframe: Tenant records, indexed like ``similarity_matrix``. Only
            needed for option 4.
        option: View to show (1-4). When ``None`` the user is prompted on
            stdin, as before.

    Returns:
        The Series, scalar or DataFrame that was printed, or ``None`` for an
        unrecognised option.

    Raises:
        ValueError: If option 4 is chosen without ``dataframe``. Previously
            this path failed with a ``NameError`` on an undefined variable.
    """
    #TODO VIEW COMPATIBILITY BETWEEN REQUESTED TENANTS
    if option is None:
        print(f"The options for visualizatio are:\n"+
              "1.Compatibility for each tenants with requested ones\n"+
              "2.Mean compatibility for each tenant\n"+
              "3.Most compatible tenant for the requested ones\n"+
              "4.Obtain registers from most compatible tenants\n")
        option = int(input("What option are u choosing: "))

    # Column-wise mean: one value per requested tenant.
    tenant_lines = similarity_matrix[requested_tenants].mean()

    if option == 1:
        # The original computed head(5) but discarded it.
        first_lines = tenant_lines.head(5)
        print(f"Compatibility for each tenants with requested ones \n{first_lines}")
        return first_lines
    if option == 2:
        mean_compatibility = tenant_lines.mean(axis = 0)
        print(f"Mean compatibility for each tenant:\n {mean_compatibility} ")
        return mean_compatibility
    if option == 3:
        most_compatible = tenant_lines.sort_values(ascending = False)
        print(f"Max compatibility for each tenant:\n {most_compatible} ")
        return most_compatible
    if option == 4:
        if dataframe is None:
            raise ValueError("option 4 requires the tenant dataframe")
        ranking = tenant_lines.sort_values(ascending = False)
        # Look the records up in ranked order instead of discarding the sort.
        most_compatible = dataframe.loc[ranking.index]
        print(f"Most compatible tenants registers\n {most_compatible}")
        return most_compatible
    return None
def view_kmeans_results(results,cluster_center):
    """Show a scatter plot of ``results`` against itself.

    Args:
        results: Array-like of clustering results; must expose ``.shape``.
        cluster_center: Currently unused.
    """
    # TODO FUNCTION TO VIEW KMEANS RESULTS
    # Imported here because the module never imported pyplot, which made
    # every call fail with NameError; a local import also keeps matplotlib
    # optional for callers that never plot.
    import matplotlib.pyplot as plt

    print(f"Starting kmeans viewing \n Cluster length: {results.shape}")
    plt.scatter(results,results)
    plt.show()
def tenant_inference(similarity_matrix, requested_tenants, dataframe, top_n=4):
    """Return the ``top_n`` tenants most compatible with the requested one.

    The previous version took the first four rows and only then sorted them,
    so it ranked an arbitrary slice instead of the best matches. The requested
    tenant is also excluded now, since its self-similarity would otherwise
    always rank first.

    Args:
        similarity_matrix: Square DataFrame of pairwise compatibilities.
        requested_tenants: Column label of the tenant to match against.
        dataframe: Tenant records indexed like ``similarity_matrix`` and
            containing the 'Names', 'Age', 'Smoking' and 'Email' columns.
        top_n: Number of matches to return (default 4).

    Returns:
        A list of ``(record, similarity)`` tuples in descending similarity
        order, where ``record`` is a Series with the four columns above and
        ``similarity`` is a float.
    """
    scores = similarity_matrix[requested_tenants]
    candidates = scores.drop(labels=requested_tenants, errors="ignore")
    best_matches = candidates.nlargest(top_n)

    wanted_columns = ['Names', 'Age', 'Smoking', 'Email']
    #TODO WE CAN ACCESS THE INFO BY DOING tenant_list[index]['Column_Name]'
    return [
        (dataframe.loc[tenant, wanted_columns], float(similarity))
        for tenant, similarity in best_matches.items()
    ]
def algo_start(id):
    """Run the full pipeline and return the best matches for tenant ``id``.

    Loads the dataset, builds the 0-100 similarity matrix and delegates to
    ``tenant_inference``.

    Args:
        id: Label of the requested tenant in the similarity matrix.

    Returns:
        The list produced by ``tenant_inference``.
    """
    # The module used ``sm`` without ever importing it, so every call raised
    # NameError; import it here to bring it into scope.
    import similarity as sm

    dataframe, original_dataframe = sm.data_preparing()
    #sm.data_checking(dataframe)
    similarity_matrix = sm.encoder_matrix(dataframe, min_range = 0, max_range=100)
    return tenant_inference(similarity_matrix, id, original_dataframe)

similarity.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+import pandas as pd
def data_preparing(csv_path=None):
    """Load the users CSV and split off the feature columns.

    Args:
        csv_path: Path or file-like object to read. Defaults to
            ``data/users_dataframe.csv`` next to this module; the previous
            working-directory-relative ``./MLSystem/...`` path only resolved
            when the process was started from the parent folder.

    Returns:
        A ``(features, full)`` tuple: the feature columns used for the
        similarity computation, and the complete frame. Both are indexed by
        tenant ``id`` when the CSV has that column, so labels in the
        similarity matrix are real tenant ids rather than row positions.
    """
    from pathlib import Path

    if csv_path is None:
        csv_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"
    original_dataframe = pd.read_csv(csv_path)
    if "id" in original_dataframe.columns:
        original_dataframe = original_dataframe.set_index("id")

    columns = ['Age', 'Worktimes', 'Schedules', 'Studies level', 'Pets', 'Cooking', 'Sport', 'Smoking', 'Organized']
    dataframe = original_dataframe[columns]
    return dataframe, original_dataframe
def data_checking(dataframe):
    """Report on stdout, column by column, whether any value is missing."""
    for column_name in dataframe.columns:
        has_missing = dataframe[column_name].isnull().sum() > 0
        message = (
            f"Missing values in {column_name} column"
            if has_missing
            else f"No missing values in column {column_name}"
        )
        print(message)
def encoder_matrix(dataframe, min_range, max_range):
    """Build a pairwise similarity matrix rescaled to ``[min_range, max_range]``.

    Every column of ``dataframe`` is one-hot encoded; the raw similarity of
    two rows is the dot product of their encodings, i.e. the number of
    columns in which they share the same value. The raw matrix is then
    min-max rescaled.

    Args:
        dataframe: Feature columns, one row per tenant.
        min_range: Lower bound of the rescaled similarities.
        max_range: Upper bound of the rescaled similarities.

    Returns:
        A square DataFrame whose rows and columns are labelled with
        ``dataframe.index``.
    """
    encoder = OneHotEncoder(sparse_output = False)
    data_encoded = encoder.fit_transform(dataframe)
    matriz_s = np.dot(data_encoded, data_encoded.T)

    min_original = np.min(matriz_s)
    max_original = np.max(matriz_s)
    span = max_original - min_original
    if span == 0:
        # All rows are equally similar (e.g. a single row or identical rows);
        # avoid dividing by zero and report the maximum similarity instead.
        matriz_reescalada = np.full(matriz_s.shape, float(max_range))
    else:
        matriz_reescalada = ((matriz_s - min_original) / span) * (max_range - min_range) + min_range

    return pd.DataFrame(matriz_reescalada, index = dataframe.index, columns = dataframe.index)