Spaces:
Sleeping
Sleeping
Db update
Browse files- Dockerfile +13 -0
- __pycache__/datasetgenerator.cpython-311.pyc +0 -0
- __pycache__/datasetgenerator.cpython-312.pyc +0 -0
- __pycache__/resultview.cpython-311.pyc +0 -0
- __pycache__/resultview.cpython-312.pyc +0 -0
- __pycache__/similarity.cpython-311.pyc +0 -0
- __pycache__/similarity.cpython-312.pyc +0 -0
- app.py +14 -0
- data/users_dataframe.csv +0 -0
- datasetgenerator.py +51 -0
- kmeans.py +77 -0
- kmeans_model.pkl +3 -0
- requirements.txt +6 -0
- resultview.py +61 -0
- similarity.py +31 -0
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
COPY requirements.txt .
|
| 7 |
+
RUN pip install --no-cache-dir -r /app/requirements.txt
|
| 8 |
+
|
| 9 |
+
COPY . .
|
| 10 |
+
|
| 11 |
+
EXPOSE 7860
|
| 12 |
+
|
| 13 |
+
# app.py does not parse command-line arguments, so "--host"/"--port" passed to
# `python app.py` were silently ignored. Start the ASGI app through the uvicorn
# CLI instead, which honours both flags and binds to the exposed port.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
__pycache__/datasetgenerator.cpython-311.pyc
ADDED
|
Binary file (3.45 kB). View file
|
|
|
__pycache__/datasetgenerator.cpython-312.pyc
ADDED
|
Binary file (2.86 kB). View file
|
|
|
__pycache__/resultview.cpython-311.pyc
ADDED
|
Binary file (3.57 kB). View file
|
|
|
__pycache__/resultview.cpython-312.pyc
ADDED
|
Binary file (2.18 kB). View file
|
|
|
__pycache__/similarity.cpython-311.pyc
ADDED
|
Binary file (2.18 kB). View file
|
|
|
__pycache__/similarity.cpython-312.pyc
ADDED
|
Binary file (2.15 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
import resultview as rv
|
| 3 |
+
import uvicorn
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# ASGI application object served by uvicorn.
app = FastAPI()


# The decorator form is required: a bare `app.post(...)` call builds a
# decorator and throws it away, so the route was never registered.
@app.post("/predict/")
async def predict(id: int):
    """Return the tenants most compatible with the tenant identified by ``id``.

    Args:
        id: Tenant identifier, read from the query string. The ``int``
            annotation makes FastAPI parse and validate it; without it the
            value arrives as ``str``.

    Returns:
        A list of ``{"tenant": {...}, "similarity": float}`` entries, in the
        order produced by ``resultview.algo_start``.
    """
    tenant_list = rv.algo_start(id)
    # resultview.tenant_inference builds (pandas.Series, float) pairs; the
    # Series is turned into a plain dict so the response can be serialised.
    return [
        {"tenant": tenant.to_dict(), "similarity": similarity}
        for tenant, similarity in tenant_list
    ]


if __name__ == "__main__":
    # Listen on all interfaces: the Dockerfile exposes port 7860, which is
    # unreachable from outside the container when bound to 127.0.0.1.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
data/users_dataframe.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datasetgenerator.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def list_creator(list, size=10000):
    """Draw ``size`` random picks, with replacement, from ``list``.

    Args:
        list: Non-empty sequence of options to sample from. The name shadows
            the builtin but is kept for backward compatibility.
        size: Number of picks to draw. Defaults to 10000, the value that was
            previously hard-coded.

    Returns:
        A new Python list holding ``size`` elements of ``list``.

    Raises:
        ValueError: If ``list`` is empty.
    """
    if len(list) == 0:
        # numpy would raise a less helpful "low >= high" ValueError here.
        raise ValueError("list_creator needs at least one option to sample from")

    positions = np.random.randint(0, len(list), size)
    return [list[position] for position in positions]
|
| 11 |
+
|
| 12 |
+
def data_generator():
    """Build a synthetic tenant dataset, save it as CSV and return it.

    Every column is sampled independently with ``list_creator`` (ages with
    ``np.random.randint(18, 35, ...)``), so the rows are random profiles.
    The frame is indexed by a 1-based ``id`` and written to
    ``data/users_dataframe.csv`` next to this module.

    Returns:
        The generated ``pandas.DataFrame``.
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    # A missing comma after 'Louisa' used to merge it with 'Loren' into the
    # single name 'LouisaLoren' (implicit string-literal concatenation).
    name = ['John', 'Michael', 'Derek', 'Nick', 'Lucas', 'Jorge', 'George',
            'Miguel', 'Anthony', 'Antonio', 'Mario', 'Marie', 'Luna', 'Maria',
            'Albert', 'Louisa', 'Loren', 'Josephine']
    # Trailing spaces were removed from the surnames: they leaked into the
    # generated e-mail addresses ("JohnDavies @gmail.com").
    surname = ['Bush', 'Smith', 'Jones', 'Williams', 'Brown', 'Taylor',
               'Davies', 'Evans', 'Williams', 'Thomas', 'Johnson', 'Roberts',
               'Lee', 'Walker', 'Wright', 'Robinson', 'Thompson', 'White',
               'Hughes', 'Edwards']
    work_options = ['morning', 'night']
    morning_night = ['morning', 'night']
    studies_level = ['secondary', 'university']
    yes_no_questions = ['Yes', 'No']

    # The sampling order is unchanged so a seeded run draws columns in the
    # same sequence as before.
    name_list = list_creator(name)
    surname_list = list_creator(surname)
    # Derive the row count from the sampled data instead of repeating 10000.
    n_users = len(name_list)
    email_list = [
        first + last + '@gmail.com'
        for first, last in zip(name_list, surname_list)
    ]
    age_list = np.random.randint(18, 35, n_users)
    work_list = list_creator(work_options)
    morn_night_list = list_creator(morning_night)
    studies_list = list_creator(studies_level)
    pets_list = list_creator(yes_no_questions)
    cooking_list = list_creator(yes_no_questions)
    sport_list = list_creator(yes_no_questions)
    smoking_list = list_creator(yes_no_questions)
    organized_list = list_creator(yes_no_questions)

    users_dataframe = pd.DataFrame({
        'Names': name_list,
        'Surnames': surname_list,
        'Age': age_list,
        'Email': email_list,
        'Worktimes': work_list,
        'Schedules': morn_night_list,
        'Studies level': studies_list,
        'Pets': pets_list,
        'Cooking': cooking_list,
        'Sport': sport_list,
        'Smoking': smoking_list,
        'Organized': organized_list,
    })
    users_dataframe.index = pd.Index(np.arange(1, n_users + 1), name="id")

    # Resolve the output relative to this module rather than the current
    # working directory. NOTE(review): assumes this module sits in the folder
    # the old './MLSystem/...' path pointed at (the uploaded tree has `data/`
    # beside the modules) - confirm.
    output_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    users_dataframe.to_csv(output_path)

    print(users_dataframe)
    return users_dataframe
|
| 44 |
+
|
| 45 |
+
# NOTE(review): this call runs at import time, so merely importing the module
# regenerates the dataset and rewrites the CSV; consider guarding it with
# `if __name__ == "__main__":`.
data_generator()
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
kmeans.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 4 |
+
from sklearn.cluster import KMeans
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
from sklearn.preprocessing import StandardScaler
|
| 7 |
+
from sklearn.pipeline import Pipeline
|
| 8 |
+
from sklearn.compose import ColumnTransformer
|
| 9 |
+
from sklearn.preprocessing import LabelEncoder
|
| 10 |
+
from scipy.spatial.distance import euclidean
|
| 11 |
+
import joblib
|
| 12 |
+
|
| 13 |
+
class k_means_algo():
    """Experimental K-Means helpers for the tenant dataset.

    Typical flow: ``data_preparing`` loads the users CSV and appends one
    profile, storing the result in ``self.result_data``;
    ``specific_cluster_kmeans`` then fits a single-cluster model seeded on
    that appended profile and saves it with joblib.
    """

    def data_preparing(self, person, csv_path=None):
        """Load the users CSV and append one new profile row.

        Args:
            person: Column names of the new row. ``"id"`` must be among them
                and the placeholder row below supplies 13 values, so 13
                names are expected.
            csv_path: CSV to read. Defaults to ``data/users_dataframe.csv``
                next to this module.

        Side effects:
            Sets ``self.result_data`` to the combined frame indexed by ``id``.
        """
        from pathlib import Path

        if csv_path is None:
            # Module-relative instead of the CWD-relative './MLSystem/...'.
            # NOTE(review): assumes this module sits in that folder - confirm.
            csv_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"

        #DATAFRAME WITH DATA
        print(pd.DataFrame(columns=person).columns)

        original_dataframe = pd.read_csv(csv_path)
        print(original_dataframe.columns)

        # The new profile gets the id after the last one on file (1 if empty).
        if len(original_dataframe):
            last_id = original_dataframe["id"].iloc[-1] + 1
        else:
            last_id = 1

        # NOTE(review): hard-coded placeholder profile; its categories
        # ("Morning", "Bachelor") differ from the lowercase values used by the
        # dataset generator - confirm this is intended.
        new_row_data = [last_id, "John", "Doe", 30, "john.doe@example.com", "Morning", "Night",
                        "Bachelor", "Yes", "Yes", "Yes", "No", "Yes"]
        new_row_dataframe = pd.DataFrame([new_row_data], columns=person)
        self.result_data = pd.concat([original_dataframe, new_row_dataframe]).set_index("id")

    def data_checking(self, dataframe):
        """Print, for every column of ``dataframe``, whether it has missing values."""
        for col in dataframe.columns:
            if dataframe[col].isnull().any():
                print(f"Missing values in {col} column")
            else:
                print(f"No missing values in column {col}")

    def reshape_playground(self, data):
        """Reshape row 50 of ``data`` into a column vector and return it.

        The reshape result used to be discarded, and the target shape was
        hard-coded to ``(17, 1)``; ``-1`` now adapts to the row length.

        Args:
            data: 2-D array-like indexable as ``data[50]`` whose rows expose
                ``.reshape`` (e.g. a numpy array with at least 51 rows).

        Returns:
            The reshaped row, of shape ``(n_features, 1)``.
        """
        print(f"Data shape {data.shape}")
        column = data[50].reshape(-1, 1)
        print(column.shape)
        return column

    def forward_algorithm(self, dataframe, cluster_spec):
        """Cluster ``dataframe`` into 4 groups and return the labels.

        Args:
            dataframe: Numeric feature matrix accepted by ``KMeans``.
            cluster_spec: Unused; kept so existing callers keep working.

        Returns:
            Array with the cluster label of every row.
        """
        kmeans = KMeans(n_clusters=4, random_state=42)
        result = kmeans.fit_predict(dataframe)
        print(result[1])
        print(result)
        return result

    def set_specific_cluster(self, dataframe, cluster_spec):
        """Compute the Euclidean distance from one row to every row.

        Args:
            dataframe: 2-D numeric array-like, one sample per row.
            cluster_spec: Positional index of the reference row.

        Returns:
            1-D numpy array with one distance per row.
        """
        points = np.asarray(dataframe)
        reference = points[cluster_spec]
        # One vectorised norm replaces a Python loop of scipy `euclidean` calls.
        distances = np.linalg.norm(points - reference, axis=1)
        print(distances)
        return distances

    def _encoded_features(self):
        """One-hot encode the categorical profile columns, keeping Age numeric."""
        categorical = ['Worktimes', 'Schedules', 'Studies level', 'Pets',
                       'Cooking', 'Sport', 'Smoking', 'Organized']
        profile = self.result_data[['Age'] + categorical]
        return pd.get_dummies(profile, columns=categorical, dtype=float)

    def specific_cluster_kmeans(self):
        """Fit a one-cluster K-Means seeded on the appended profile and save it.

        Requires ``data_preparing`` to have been called first. The previous
        body could not run: it read ``"id"`` as a column although it is the
        index, called ``.reshape`` on a Series, fitted an undefined
        ``dataframe`` and passed raw strings to ``KMeans``.

        Returns:
            The fitted ``KMeans`` instance (also written to
            ``kmeans_model.pkl`` next to this module).
        """
        from pathlib import Path

        print(self.result_data)
        features = self._encoded_features().to_numpy(dtype=float)

        # data_preparing appends the new profile last, so it is the final row.
        # NOTE(review): the old code hard-coded 17 features; the width now
        # follows the encoded data - confirm downstream users of the model.
        cluster_spec = features[-1].reshape(1, -1)
        print(cluster_spec.shape)

        kmeans = KMeans(n_clusters=1, init=cluster_spec, n_init=1, random_state=42)
        kmeans.fit(features)

        # NOTE(review): module-relative replacement for 'MLSystem/kmeans_model.pkl'.
        model_path = Path(__file__).resolve().parent / "kmeans_model.pkl"
        joblib.dump(kmeans, str(model_path))
        return kmeans
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Module-level driver: these statements run whenever the module is executed or
# imported, so the users CSV is read as a side effect.
my_kmeans = k_means_algo()
# Column names for the profile row that data_preparing appends.
person = ["id","Names","Surnames","Age","Email","Worktimes","Schedules","Studies level","Pets","Cooking","Sport","Smoking","Organized"]

my_kmeans.data_preparing(person)

# Disabled experiments. NOTE(review): the first two are written as bare
# function calls although both are methods of k_means_algo, and `dataframe`
# is not defined at module level.
#forward_algorithm(dataframe,50)
#set_specific_cluster(dataframe,50)
#my_kmeans.specific_cluster_kmeans()
|
kmeans_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe0628e084eabce82fe327dfaff97e43e7d91613a26669b53ea405734a696901
|
| 3 |
+
size 41063
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
scikit-learn
|
| 4 |
+
pandas
|
| 5 |
+
numpy
|
| 6 |
+
joblib
|
resultview.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
def tenant_visualization(similarity_matrix, requested_tenants):
|
| 4 |
+
#TODO VIEW COMPATIBILITY BETWEEN REQUESTED TENANTS
|
| 5 |
+
print(f"The options for visualizatio are:\n"+
|
| 6 |
+
"1.Compatibility for each tenants with requested ones\n"+
|
| 7 |
+
"2.Mean compatibility for each tenant\n"+
|
| 8 |
+
"3.Most compatible tenant for the requested ones\n"+
|
| 9 |
+
"4.Obtain registers from most compatible tenants\n")
|
| 10 |
+
option = int(input("What option are u choosing: "))
|
| 11 |
+
tenant_lines = similarity_matrix[requested_tenants].mean()
|
| 12 |
+
match(option):
|
| 13 |
+
|
| 14 |
+
case 1:
|
| 15 |
+
tenant_lines.head(5)
|
| 16 |
+
print(f"Compatibility for each tenants with requested ones \n{tenant_lines}")
|
| 17 |
+
|
| 18 |
+
case 2:
|
| 19 |
+
mean_compatibility = tenant_lines.mean(axis = 0)
|
| 20 |
+
print(f"Mean compatibility for each tenant:\n {mean_compatibility} ")
|
| 21 |
+
|
| 22 |
+
case 3:
|
| 23 |
+
most_compatible = tenant_lines.sort_values(ascending = False)
|
| 24 |
+
print(f"Max compatibility for each tenant:\n {most_compatible} ")
|
| 25 |
+
|
| 26 |
+
case 4:
|
| 27 |
+
most_compatible = tenant_lines.sort_values(ascending = False)
|
| 28 |
+
most_compatible = dataframe.loc[requested_tenants]
|
| 29 |
+
print(f"Most compatible tenants registers\n {most_compatible}")
|
| 30 |
+
|
| 31 |
+
def view_kmeans_results(results,cluster_center):
    """Print the shape of ``results`` and scatter-plot it against itself.

    Args:
        results: Object exposing ``.shape`` that is passed to ``plt.scatter``
            as both x and y.
        cluster_center: Not used by the current body.
    """
    # TODO FUNCTION TO VIEW KMEANS RESULTS
    print(f"Starting kmeans viewing \n Cluster length: {results.shape}")
    # NOTE(review): `plt` is not imported in the visible module (only pandas
    # is), so this line raises NameError unless it is defined elsewhere.
    # Presumably matplotlib.pyplot was intended; it is not listed in
    # requirements.txt - confirm.
    plt.scatter(results,results)
    plt.show()
|
| 36 |
+
|
| 37 |
+
def tenant_inference(similarity_matrix, requested_tenants, dataframe, top_n=4):
    """Return the ``top_n`` tenants most similar to the requested one.

    Used at inference time. The previous version called ``.head(4)`` before
    sorting, so it always returned the first four rows of the matrix rather
    than the four best matches, and it could return the requested tenant as
    its own match.

    Args:
        similarity_matrix: Square ``DataFrame`` of pairwise similarities,
            indexed and labelled by tenant identifier.
        requested_tenants: Identifier (a single column label) of the tenant
            to find matches for.
        dataframe: Tenant records indexed like ``similarity_matrix``; must
            contain the columns 'Names', 'Age', 'Smoking' and 'Email'.
        top_n: Number of matches to return. Defaults to 4.

    Returns:
        A list of ``(record, similarity)`` tuples ordered from most to least
        similar, where ``record`` is a ``Series`` with the four columns above
        and ``similarity`` is a ``float``.

    Raises:
        KeyError: If ``requested_tenants`` is not a column of the matrix.
    """
    scores = similarity_matrix[requested_tenants]
    # A tenant is always maximally similar to itself; leave it out.
    candidates = scores.drop(labels=requested_tenants, errors="ignore")
    # Stable sort keeps tied scores in their original row order.
    best_matches = candidates.sort_values(ascending=False, kind="stable").head(top_n)

    return [
        (dataframe.loc[tenant, ['Names', 'Age', 'Smoking', 'Email']], float(score))
        for tenant, score in best_matches.items()
    ]
|
| 52 |
+
|
| 53 |
+
def algo_start(id):
    """Run the similarity pipeline and return the best matches for ``id``.

    Args:
        id: Identifier of the requesting tenant; it is used as a column label
            of the similarity matrix. The name shadows the builtin but is
            kept for backward compatibility.

    Returns:
        Whatever ``tenant_inference`` returns for that tenant.
    """
    # `sm` was referenced without ever being imported (NameError at call
    # time); bind it to the sibling `similarity` module here.
    import similarity as sm

    # The dataset is reloaded and the full pairwise matrix rebuilt on every
    # call; similarities are rescaled into the 0-100 range.
    dataframe, original_dataframe = sm.data_preparing()
    similarity_matrix = sm.encoder_matrix(dataframe, min_range=0, max_range=100)
    return tenant_inference(similarity_matrix, id, original_dataframe)
|
| 60 |
+
|
| 61 |
+
#tenant_visualization(similarity_matrix, [20,40,50,18,15])
|
similarity.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from sklearn.preprocessing import OneHotEncoder
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
def data_preparing(csv_path=None):
    """Load the users CSV and split off the feature columns.

    Args:
        csv_path: CSV to read. Defaults to ``data/users_dataframe.csv`` next
            to this module.

    Returns:
        ``(dataframe, original_dataframe)``: the feature-only view and the
        full table. Both are indexed by the CSV's ``id`` column, so labels in
        the similarity matrix built from them are real tenant ids. Previously
        the default 0-based RangeIndex was used, one off from the 1-based
        ids written by the dataset generator.

    Raises:
        KeyError: If one of the feature columns is missing from the CSV.
    """
    from pathlib import Path

    if csv_path is None:
        # Module-relative instead of the CWD-relative './MLSystem/...'.
        # NOTE(review): assumes this module sits in that folder - confirm.
        csv_path = Path(__file__).resolve().parent / "data" / "users_dataframe.csv"

    original_dataframe = pd.read_csv(csv_path, index_col="id")
    columns = ['Age', 'Worktimes', 'Schedules', 'Studies level', 'Pets', 'Cooking', 'Sport', 'Smoking', 'Organized']
    dataframe = original_dataframe[columns]
    return dataframe, original_dataframe
|
| 10 |
+
|
| 11 |
+
def data_checking(dataframe):
    """Report on stdout, column by column, whether any value is missing."""
    for col in dataframe.columns:
        missing_count = dataframe[col].isnull().sum()
        message = (
            f"Missing values in {col} column"
            if missing_count > 0
            else f"No missing values in column {col}"
        )
        print(message)
|
| 17 |
+
|
| 18 |
+
def encoder_matrix(dataframe, min_range, max_range):
    """Build a pairwise similarity matrix rescaled to ``[min_range, max_range]``.

    Every column of ``dataframe`` is one-hot encoded, so the dot product of
    two encoded rows counts the columns on which they hold the same value.
    The counts are then min-max rescaled.

    Args:
        dataframe: Feature table, one tenant per row.
        min_range: Value assigned to the lowest raw similarity.
        max_range: Value assigned to the highest raw similarity.

    Returns:
        A square ``DataFrame`` whose index and columns are
        ``dataframe.index``. Its size grows quadratically with the row count.
    """
    encoder = OneHotEncoder(sparse_output=False)
    data_encoded = encoder.fit_transform(dataframe)

    raw_similarity = np.dot(data_encoded, data_encoded.T)

    lowest = np.min(raw_similarity)
    spread = np.max(raw_similarity) - lowest
    if spread == 0:
        # All pairs are equally similar (e.g. a single row): the min-max
        # formula would divide by zero and fill the matrix with NaN.
        # NOTE(review): max_range is used as the constant value - confirm.
        rescaled = np.full(raw_similarity.shape, float(max_range))
    else:
        rescaled = ((raw_similarity - lowest) / spread) * (max_range - min_range) + min_range

    return pd.DataFrame(rescaled, index=dataframe.index, columns=dataframe.index)
|
| 31 |
+
|