|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from typing import Dict, List |
|
|
|
|
|
|
|
|
def safe_normalize(v: np.ndarray) -> np.ndarray:
    """Scale vectors to unit L2 length without dividing by zero.

    Normalization runs along the last axis, so both a single 1-D vector and
    a 2-D batch of row vectors are accepted.

    Args:
        v: Vector or batch of row vectors (anything ``np.asarray`` accepts).

    Returns:
        A new array with the same shape as ``v``. All-zero vectors are
        returned as zeros rather than NaN.
    """
    v = np.asarray(v)
    # axis=-1 matches the previous axis=1 behaviour for 2-D input and also
    # supports a lone 1-D vector.
    norm = np.linalg.norm(v, axis=-1, keepdims=True)
    # A zero norm means every component is zero; any non-zero divisor keeps
    # the result at zero and avoids 0/0.
    norm[norm == 0] = 1e-6
    return v / norm
|
|
|
|
|
|
|
|
class RoommateMatcher:
    """Rank candidate roommates by similarity to a given user.

    Each profile is turned into three L2-normalized feature blocks:

    * text      - sentence embedding of description, occupation and social
                  preferences;
    * financial - one-hot encoding of the ``financials`` field;
    * numeric   - min-max scaled location coordinates and budget range.

    The blocks are weighted, concatenated, and compared with cosine
    similarity.
    """

    # Block weights applied to BOTH the query user and the candidates.
    # The earlier code weighted candidates 0.6/0.1/0.3 but the query user
    # 0.6/0.2/0.2, which skewed the cosine score; a single set is used now.
    TEXT_WEIGHT = 0.6
    FINANCIAL_WEIGHT = 0.1
    NUMERIC_WEIGHT = 0.3

    def __init__(self):
        """Load the embedding model and prepare the encoders."""
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Values other than the two fitted below are ignored at transform
        # time (handle_unknown="ignore") and encode as an all-zero row.
        self.financial_encoder = OneHotEncoder(
            sparse_output=False, handle_unknown="ignore"
        )
        self.scaler = MinMaxScaler()
        # Tracks whether the numeric scaler has been fitted; the financial
        # encoder is fitted right here on its fixed category set.
        self.is_fitted = False

        self.financial_encoder.fit([["split-rent"], ["single-payment"]])

    def predict(
        self, current_user: Dict, other_users: List[Dict], top_k: int = 10
    ) -> List[Dict]:
        """Return the candidates most similar to ``current_user``.

        Args:
            current_user: Profile dict. Reads ``financials``, ``location``,
                ``budget_min`` and ``budget_max`` (required) plus the optional
                ``personal_description``, ``occupation`` and
                ``social_preference``.
            other_users: Candidate profile dicts with the same keys.
            top_k: Maximum number of matches to return (default 10).

        Returns:
            Up to ``top_k`` candidate records sorted by descending
            ``similarity`` (cosine similarity * 100, rounded to 2 decimals).
            Each record also carries the ``combined_text`` used for the
            embedding. An empty candidate list yields ``[]``.

        Raises:
            KeyError: If a required numeric key is missing from a profile, or
                ``financials`` is missing from ``current_user``.
        """
        # Nothing to rank; also avoids running the pipeline on an empty frame.
        if not other_users:
            return []

        # NOTE: the scaler is fitted once, on the first non-empty candidate
        # list, and reused for every later call.
        if not self.is_fitted:
            self._fit_encoders(other_users)

        others_df = pd.DataFrame(other_users)

        # --- text block ---
        others_df['combined_text'] = [
            self._profile_text(user) for user in other_users
        ]
        text_embeds = self.text_model.encode(others_df['combined_text'].tolist())
        text_block = safe_normalize(text_embeds)

        # --- financial block ---
        # Pass a plain array: the encoder was fitted without feature names,
        # so handing it a DataFrame triggers a feature-name warning.
        fin_block = safe_normalize(
            self.financial_encoder.transform(others_df[['financials']].to_numpy())
        )

        # --- numeric block ---
        num_block = safe_normalize(
            self.scaler.transform(self._numeric_features(other_users))
        )

        # --- the same three blocks for the query user ---
        current_text = self.text_model.encode(self._profile_text(current_user))
        current_text = safe_normalize(current_text.reshape(1, -1))

        current_fin = safe_normalize(
            self.financial_encoder.transform([[current_user['financials']]])
        )

        current_num = safe_normalize(
            self.scaler.transform(self._numeric_features([current_user]))
        )

        combined_existing = self._weighted_profile(text_block, fin_block, num_block)
        current_block = self._weighted_profile(current_text, current_fin, current_num)

        others_df['similarity'] = np.round(
            cosine_similarity(current_block, combined_existing)[0] * 100, 2
        )

        return (
            others_df.sort_values('similarity', ascending=False)
            .head(top_k)
            .to_dict('records')
        )

    def _fit_encoders(self, users: List[Dict]):
        """Fit the numeric scaler on the users' location and budget values."""
        self.scaler.fit(self._numeric_features(users))
        self.is_fitted = True

    def _weighted_profile(self, text_block, fin_block, num_block):
        """Concatenate the three feature blocks using the class weights."""
        return np.hstack([
            text_block * self.TEXT_WEIGHT,
            fin_block * self.FINANCIAL_WEIGHT,
            num_block * self.NUMERIC_WEIGHT,
        ])

    @staticmethod
    def _numeric_features(users: List[Dict]) -> np.ndarray:
        """Build one ``[*location, budget_min, budget_max]`` row per user."""
        return np.array(
            [
                [*user['location'], user['budget_min'], user['budget_max']]
                for user in users
            ],
            dtype=float,
        )

    @staticmethod
    def _profile_text(user: Dict) -> str:
        """Join a user's free-text fields into one string for embedding.

        Missing, ``None`` and empty values are skipped, so they do not leak
        the literal text "None" into the embedding input.
        """
        parts = [user.get('personal_description'), user.get('occupation')]
        prefs = user.get('social_preference') or []
        # A bare string is one preference, not a sequence of characters.
        if isinstance(prefs, str):
            prefs = [prefs]
        parts.extend(prefs)
        return " ".join(
            str(part) for part in parts if part is not None and str(part) != ""
        )
|
|
|