Spaces:

Janick1
/

matches

Sleeping

matches / app /model.py

Joseph Ibochi

update:model mod for handling NAN

0b6fe9c 7 months ago

3.71 kB

	import numpy as np
	import pandas as pd
	from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
	from sklearn.metrics.pairwise import cosine_similarity
	from sentence_transformers import SentenceTransformer
	from typing import Dict, List


	def safe_normalize(v: np.ndarray) -> np.ndarray:
	    """Scale every row of ``v`` to unit L2 length without dividing by zero.

	    Args:
	        v: 2-D array; the norm is taken along axis 1 (one norm per row).

	    Returns:
	        A new array of the same shape as ``v``. Rows whose norm is exactly
	        zero are divided by 1e-6 instead, so they stay all-zero rather
	        than turning into NaN/inf.
	    """
	    row_lengths = np.linalg.norm(v, axis=1, keepdims=True)
	    # Swap exact-zero norms for a tiny divisor; ``row_lengths`` is a fresh
	    # array, so overwriting it in place does not touch the caller's data.
	    np.copyto(row_lengths, 1e-6, where=(row_lengths == 0))
	    return v / row_lengths


	class RoommateMatcher:
	    """Rank candidate roommates by similarity to a given user.

	    Each user dict is turned into three row-normalised feature blocks:
	    a sentence embedding of the free-text fields, a one-hot encoding of
	    ``financials`` and min-max scaled ``location`` / budget numbers. The
	    blocks are weighted, concatenated and compared with cosine similarity.
	    """

	    # Weight of each feature block in the combined vector. The same weights
	    # are applied to the current user and to the candidates; weighting the
	    # two sides differently would compare vectors from differently scaled
	    # spaces and skew the cosine score. Tune the matching here.
	    TEXT_WEIGHT = 0.6
	    FINANCIAL_WEIGHT = 0.2
	    NUMERIC_WEIGHT = 0.2

	    # Maximum number of matches returned by ``predict``.
	    MAX_RESULTS = 10

	    def __init__(self):
	        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
	        self.financial_encoder = OneHotEncoder(
	            sparse_output=False, handle_unknown="ignore"
	        )
	        self.scaler = MinMaxScaler()
	        self.is_fitted = False

	        # Fit encoder in advance with known categories to avoid all-zero rows
	        self.financial_encoder.fit([["split-rent"], ["single-payment"]])

	    def predict(self, current_user: Dict, other_users: List[Dict]) -> List[Dict]:
	        """Return up to ``MAX_RESULTS`` candidates most similar to ``current_user``.

	        Args:
	            current_user: Profile to match against. Reads the keys
	                ``personal_description``, ``occupation``,
	                ``social_preference``, ``financials``, ``location``,
	                ``budget_min`` and ``budget_max``.
	            other_users: Candidate profiles using the same keys.

	        Returns:
	            The candidate records (as produced by ``pandas.DataFrame``),
	            each extended with ``combined_text`` and a ``similarity``
	            percentage rounded to two decimals, sorted best first. An
	            empty list when ``other_users`` is empty.

	        Raises:
	            KeyError: If a profile has no ``location`` entry.
	        """
	        # Nothing to rank; also avoids transforming with an unfitted scaler.
	        if not other_users:
	            return []

	        # NOTE: the scaler is fitted once, on the first non-empty batch, and
	        # then reused for every later call.
	        if not self.is_fitted:
	            self._fit_encoders(other_users)

	        others_df = pd.DataFrame(other_users)
	        # Build the text from the raw dicts, not from DataFrame rows: pandas
	        # fills missing keys with NaN, which would otherwise end up in the
	        # text as the literal word "nan".
	        others_df['combined_text'] = [
	            self._profile_text(user) for user in other_users
	        ]

	        candidate_block = self._feature_block(
	            others_df['combined_text'].tolist(), other_users
	        )
	        current_block = self._feature_block(
	            [self._profile_text(current_user)], [current_user]
	        )

	        others_df['similarity'] = np.round(
	            cosine_similarity(current_block, candidate_block)[0] * 100, 2
	        )

	        ranked = others_df.sort_values('similarity', ascending=False)
	        return ranked.head(self.MAX_RESULTS).to_dict('records')

	    def _fit_encoders(self, users: List[Dict]):
	        """Fit the min-max scaler on the location/budget values of ``users``."""
	        if not users:
	            return
	        self.scaler.fit(self._numeric_matrix(users))
	        self.is_fitted = True

	    def _feature_block(self, texts: List[str], users: List[Dict]) -> np.ndarray:
	        """Build the weighted feature matrix (one row per user)."""
	        text_block = safe_normalize(np.asarray(self.text_model.encode(texts)))

	        # Plain lists keep the input type identical to what the encoder was
	        # fitted on; unknown labels encode to an all-zero row.
	        fin_block = safe_normalize(self.financial_encoder.transform(
	            [[self._financial_label(user)] for user in users]
	        ))

	        # A missing budget arrives as NaN and is passed through by the
	        # scaler; zero it so it contributes nothing instead of turning the
	        # whole similarity score into NaN.
	        scaled = self.scaler.transform(self._numeric_matrix(users))
	        num_block = safe_normalize(np.nan_to_num(scaled, nan=0.0))

	        return np.hstack([
	            text_block * self.TEXT_WEIGHT,
	            fin_block * self.FINANCIAL_WEIGHT,
	            num_block * self.NUMERIC_WEIGHT,
	        ])

	    @staticmethod
	    def _is_missing(value) -> bool:
	        """Return True for ``None`` and float NaN."""
	        return value is None or (isinstance(value, float) and np.isnan(value))

	    @staticmethod
	    def _profile_text(user: Dict) -> str:
	        """Join the free-text fields of ``user`` into one space-separated string."""
	        parts = []
	        for key in ('personal_description', 'occupation'):
	            value = user.get(key)
	            if not RoommateMatcher._is_missing(value):
	                parts.append(str(value))

	        prefs = user.get('social_preference')
	        if isinstance(prefs, str):
	            # A bare string is one preference, not a sequence of characters.
	            prefs = [prefs]
	        elif RoommateMatcher._is_missing(prefs) or not hasattr(prefs, '__iter__'):
	            prefs = []
	        parts.extend(
	            str(p) for p in prefs if not RoommateMatcher._is_missing(p)
	        )

	        # Drop empty fragments so they do not add stray separators.
	        return " ".join(part for part in parts if part)

	    @staticmethod
	    def _financial_label(user: Dict) -> str:
	        """Return the ``financials`` value as a string, or ``""`` when missing."""
	        value = user.get('financials')
	        return "" if RoommateMatcher._is_missing(value) else str(value)

	    @staticmethod
	    def _numeric_matrix(users: List[Dict]) -> np.ndarray:
	        """Stack location values followed by budget_min/budget_max, one row per user."""
	        rows = [
	            [*user['location'], user.get('budget_min'), user.get('budget_max')]
	            for user in users
	        ]
	        # dtype=float converts a missing (None) budget to NaN.
	        return np.array(rows, dtype=float)