File size: 3,712 Bytes
7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 0b6fe9c 7958e55 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from typing import Dict, List
def safe_normalize(v: np.ndarray) -> np.ndarray:
    """Scale every row of ``v`` to unit L2 length without dividing by zero.

    The norm is taken along axis 1, so ``v`` must be at least 2-D. A row
    whose norm is exactly zero is divided by ``1e-6`` instead, which leaves
    it all-zero rather than turning it into NaN.
    """
    row_norms = np.linalg.norm(v, axis=1, keepdims=True)
    # Substitute a tiny epsilon for exact-zero norms before dividing.
    denominators = np.where(row_norms == 0, 1e-6, row_norms)
    return v / denominators
class RoommateMatcher:
    """Rank candidate roommates by similarity to a given user.

    Every profile is converted into three unit-length feature blocks:

    * a sentence embedding of the free-text fields,
    * a one-hot encoding of the ``financials`` preference,
    * the min-max scaled ``location`` pair and budget range.

    The blocks are weighted, concatenated, and compared with cosine
    similarity.
    """

    # Relative weight of each feature block. The SAME weights are applied to
    # the current user and to the candidates; otherwise two identical
    # profiles would not reach a similarity of 100.
    # NOTE(review): the previous code used 0.6/0.1/0.3 for candidates and
    # 0.6/0.2/0.2 for the current user. The candidate-side values were kept;
    # confirm this is the intended weighting.
    TEXT_WEIGHT = 0.6
    FINANCIAL_WEIGHT = 0.1
    NUMERIC_WEIGHT = 0.3

    # Maximum number of matches returned by ``predict``.
    TOP_K = 10

    def __init__(self):
        """Load the sentence-embedding model and set up the encoders."""
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.financial_encoder = OneHotEncoder(
            sparse_output=False, handle_unknown="ignore"
        )
        self.scaler = MinMaxScaler()
        # Tracks whether ``self.scaler`` has been fitted yet.
        self.is_fitted = False
        # Fit on the known categories up front so the encoding does not
        # depend on which values appear in a batch. Any other value is
        # encoded as an all-zero row because of handle_unknown="ignore".
        self.financial_encoder.fit([["split-rent"], ["single-payment"]])

    def predict(self, current_user: Dict, other_users: List[Dict]) -> List[Dict]:
        """Return the candidates most similar to ``current_user``.

        Args:
            current_user: Profile to match. Must contain ``financials``,
                ``location`` (indexable with 0 and 1), ``budget_min`` and
                ``budget_max``. ``personal_description``, ``occupation``
                and ``social_preference`` are optional.
            other_users: Candidate profiles with the same fields.

        Returns:
            Up to ``TOP_K`` candidate records sorted by descending
            similarity. Each record is the candidate's fields plus
            ``combined_text`` (the text that was embedded) and
            ``similarity`` (cosine similarity times 100, rounded to two
            decimals). An empty list is returned when there are no
            candidates.

        Raises:
            KeyError: If ``current_user`` lacks one of the required keys.

        Note:
            The numeric scaler is fitted once, on the candidates of the
            first call that has any, and reused by later calls.
        """
        # Nothing to rank; also avoids failing on an empty DataFrame below.
        if not other_users:
            return []
        if not self.is_fitted:
            self._fit_encoders(other_users)

        others_df = pd.DataFrame(other_users)

        # === TEXT VECTOR ===
        # Built from the raw dicts (not DataFrame rows) so keys missing on
        # some users do not show up as NaN.
        others_df['combined_text'] = [
            self._profile_text(user) for user in other_users
        ]
        text_embeds = self.text_model.encode(others_df['combined_text'].tolist())
        text_block = safe_normalize(text_embeds)

        # === FINANCIAL VECTOR ===
        # Pass a plain array: the encoder was fitted without feature names,
        # so handing it a DataFrame triggers a scikit-learn warning.
        fin_block = safe_normalize(
            self.financial_encoder.transform(others_df[['financials']].to_numpy())
        )

        # === NUMERIC VECTOR ===
        num_features = np.hstack([
            np.array(others_df['location'].tolist()),
            others_df[['budget_min', 'budget_max']].to_numpy(),
        ])
        num_block = safe_normalize(self.scaler.transform(num_features))

        # === CURRENT USER VECTORS ===
        current_text = self.text_model.encode(self._profile_text(current_user))
        current_text = safe_normalize(current_text.reshape(1, -1))

        current_fin = safe_normalize(
            self.financial_encoder.transform([[current_user['financials']]])
        )

        current_num = safe_normalize(self.scaler.transform([[
            current_user['location'][0],
            current_user['location'][1],
            current_user['budget_min'],
            current_user['budget_max'],
        ]]))

        # === STACK FEATURES ===
        combined_existing = self._combine_blocks(text_block, fin_block, num_block)
        current_block = self._combine_blocks(current_text, current_fin, current_num)

        # === SIMILARITY ===
        others_df['similarity'] = np.round(
            cosine_similarity(current_block, combined_existing)[0] * 100, 2
        )
        ranked = others_df.sort_values('similarity', ascending=False)
        return ranked.head(self.TOP_K).to_dict('records')

    def _fit_encoders(self, users: List[Dict]):
        """Fit the numeric scaler on the users' location and budget columns."""
        locations = np.array([u['location'] for u in users])
        budgets = np.array([[u['budget_min'], u['budget_max']] for u in users])
        numeric_block = np.hstack([locations, budgets])
        self.scaler.fit(numeric_block)
        self.is_fitted = True

    def _combine_blocks(
        self,
        text_block: np.ndarray,
        fin_block: np.ndarray,
        num_block: np.ndarray,
    ) -> np.ndarray:
        """Weight the three feature blocks and concatenate them column-wise."""
        return np.hstack([
            text_block * self.TEXT_WEIGHT,
            fin_block * self.FINANCIAL_WEIGHT,
            num_block * self.NUMERIC_WEIGHT,
        ])

    @staticmethod
    def _profile_text(user: Dict) -> str:
        """Join description, occupation and social preferences into one string.

        Missing keys, ``None``, NaN and empty strings are skipped. A
        ``social_preference`` given as a single string is kept whole rather
        than being split into characters.
        """
        parts = [user.get('personal_description'), user.get('occupation')]

        social = user.get('social_preference')
        if isinstance(social, str) or not hasattr(social, '__iter__'):
            # Scalar (or absent) preference: treat it as a single item.
            social = [social]
        parts.extend(social)

        words = []
        for part in parts:
            # ``part != part`` is True only for NaN.
            if part is None or (isinstance(part, float) and part != part):
                continue
            text = str(part)
            if text:
                words.append(text)
        return " ".join(words)
|