from typing import Any, Dict, List

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


def safe_normalize(v: np.ndarray) -> np.ndarray:
    """Scale every row of a 2-D array to unit L2 norm.

    Rows whose norm is exactly zero are divided by 1e-6 instead of 0, so
    they stay all-zero rather than turning into NaN/inf.
    """
    norm = np.linalg.norm(v, axis=1, keepdims=True)
    norm[norm == 0] = 1e-6  # prevent division by 0
    return v / norm


def _is_missing(value: Any) -> bool:
    """Return True for None and float NaN (pandas' filler for absent fields)."""
    return value is None or (isinstance(value, float) and np.isnan(value))


class RoommateMatcher:
    """Rank candidate roommates by similarity to a given user profile.

    Each profile is turned into three row-normalized feature blocks:

    * text      - sentence embedding of description, occupation and
                  social preferences,
    * financial - one-hot encoding of the ``financials`` field,
    * numeric   - min-max scaled ``location`` (two coordinates) and
                  ``budget_min`` / ``budget_max``.

    The blocks are scaled, concatenated, and compared with cosine similarity.
    """

    # Per-block scale factors, applied identically to the query user and to
    # the candidates.  Earlier revisions scaled candidates by (0.6, 0.1, 0.3)
    # but the query by (0.6, 0.2, 0.2); with mismatched scales two identical
    # profiles could never reach a score of 100.  Using the geometric mean of
    # the two old weights on both sides keeps each block's contribution to
    # the dot product (0.36 / 0.02 / 0.06) while fixing the normalization.
    _TEXT_SCALE = 0.6
    _FINANCIAL_SCALE = float(np.sqrt(0.1 * 0.2))
    _NUMERIC_SCALE = float(np.sqrt(0.3 * 0.2))

    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.financial_encoder = OneHotEncoder(
            sparse_output=False,
            handle_unknown="ignore",
        )
        self.scaler = MinMaxScaler()
        # Tracks whether ``self.scaler`` has been fitted yet.
        self.is_fitted = False

        # Fit encoder in advance with known categories to avoid all-zero rows
        self.financial_encoder.fit([["split-rent"], ["single-payment"]])

    def predict(
        self,
        current_user: Dict,
        other_users: List[Dict],
        top_k: int = 10,
    ) -> List[Dict]:
        """Return the candidates most similar to ``current_user``.

        Args:
            current_user: Profile to match. Must provide ``financials``,
                ``location`` (two coordinates), ``budget_min`` and
                ``budget_max``; ``personal_description``, ``occupation``
                and ``social_preference`` are optional.
            other_users: Candidate profiles with the same fields.
            top_k: Maximum number of matches to return (default 10).

        Returns:
            Up to ``top_k`` candidate records sorted by descending
            similarity. Each record holds the candidate's own fields plus
            ``combined_text`` (the text that was embedded) and
            ``similarity`` (cosine similarity * 100, rounded to 2 decimals).
            An empty list is returned when there are no candidates.

        Note:
            The numeric scaler is fitted once, on the first non-empty
            candidate list, and reused by every later call.
        """
        if not other_users:
            return []
        if not self.is_fitted:
            self._fit_encoders(other_users)

        others_df = pd.DataFrame(other_users)

        # === TEXT VECTOR ===
        # Built from the raw dicts so that fields absent from some profiles
        # are skipped instead of showing up as pandas NaN values.
        others_df['combined_text'] = [
            self._profile_text(user) for user in other_users
        ]
        text_embeds = self.text_model.encode(
            others_df['combined_text'].tolist()
        )
        text_block = safe_normalize(text_embeds)

        # === FINANCIAL VECTOR ===
        # The encoder was fitted on plain lists; hand it a plain array so
        # scikit-learn does not warn about unexpected feature names.
        fin_block = safe_normalize(
            self.financial_encoder.transform(
                others_df[['financials']].to_numpy()
            )
        )

        # === NUMERIC VECTOR ===
        num_features = np.hstack([
            np.array(others_df['location'].tolist()),
            others_df[['budget_min', 'budget_max']].to_numpy(),
        ])
        num_block = safe_normalize(self.scaler.transform(num_features))

        # === CURRENT USER VECTORS ===
        current_text = np.asarray(
            self.text_model.encode(self._profile_text(current_user))
        )
        current_text = safe_normalize(current_text.reshape(1, -1))

        current_fin = safe_normalize(
            self.financial_encoder.transform([[current_user['financials']]])
        )

        current_num = safe_normalize(self.scaler.transform([[
            current_user['location'][0],
            current_user['location'][1],
            current_user['budget_min'],
            current_user['budget_max'],
        ]]))

        # === STACK FEATURES ===
        combined_existing = self._stack_blocks(text_block, fin_block, num_block)
        current_block = self._stack_blocks(current_text, current_fin, current_num)

        # === SIMILARITY ===
        others_df['similarity'] = np.round(
            cosine_similarity(current_block, combined_existing)[0] * 100,
            2,
        )

        ranked = others_df.sort_values('similarity', ascending=False)
        return ranked.head(top_k).to_dict('records')

    @classmethod
    def _stack_blocks(
        cls,
        text_block: np.ndarray,
        fin_block: np.ndarray,
        num_block: np.ndarray,
    ) -> np.ndarray:
        """Concatenate the three feature blocks using the shared scales."""
        return np.hstack([
            text_block * cls._TEXT_SCALE,
            fin_block * cls._FINANCIAL_SCALE,
            num_block * cls._NUMERIC_SCALE,
        ])

    @staticmethod
    def _profile_text(user: Dict) -> str:
        """Join a profile's free-text fields into one space-separated string.

        Missing values (absent key, None, NaN) and empty strings are
        skipped. ``social_preference`` may be a single value or an iterable
        of values; a plain string is kept whole rather than split into
        characters.
        """
        parts = [user.get('personal_description'), user.get('occupation')]

        prefs = user.get('social_preference')
        if not _is_missing(prefs):
            if isinstance(prefs, (str, bytes)) or not hasattr(prefs, '__iter__'):
                parts.append(prefs)
            else:
                parts.extend(prefs)

        return " ".join(
            str(part) for part in parts
            if not _is_missing(part) and str(part)
        )

    def _fit_encoders(self, users: List[Dict]):
        """Fit the min-max scaler on the users' location and budget columns.

        Every user must provide ``location``, ``budget_min`` and
        ``budget_max``. Sets ``is_fitted`` so the fit happens only once.
        """
        locations = np.array([u['location'] for u in users])
        budgets = np.array([[u['budget_min'], u['budget_max']] for u in users])
        numeric_block = np.hstack([locations, budgets])
        self.scaler.fit(numeric_block)
        self.is_fitted = True