File size: 3,712 Bytes
7958e55
 
 
 
 
 
 
0b6fe9c
 
 
 
 
 
 
 
7958e55
 
 
0b6fe9c
 
 
7958e55
 
 
0b6fe9c
 
 
7958e55
 
 
 
 
0b6fe9c
 
7958e55
 
 
 
 
 
 
 
0b6fe9c
7958e55
0b6fe9c
7958e55
0b6fe9c
7958e55
0b6fe9c
7958e55
 
 
 
0b6fe9c
7958e55
0b6fe9c
7958e55
 
 
 
 
0b6fe9c
7958e55
 
0b6fe9c
7958e55
 
 
 
 
 
 
0b6fe9c
7958e55
0b6fe9c
7958e55
 
 
 
 
 
0b6fe9c
7958e55
 
 
 
0b6fe9c
7958e55
 
 
 
 
 
 
 
 
0b6fe9c
 
7958e55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from typing import Dict, List


def safe_normalize(v: np.ndarray) -> np.ndarray:
    """Scale vectors to unit L2 length along the last axis.

    Works for a single 1-D vector as well as a 2-D (or higher) stack of
    row vectors. All-zero vectors are returned as all-zero vectors instead
    of producing NaN/inf from a division by zero.

    Args:
        v: Array (or array-like) whose last axis holds the vector components.

    Returns:
        An array of the same shape as ``v`` with each vector divided by its
        Euclidean norm.
    """
    arr = np.asarray(v)
    # axis=-1 is identical to axis=1 for 2-D input, but also accepts 1-D input.
    norm = np.linalg.norm(arr, axis=-1, keepdims=True)
    # A zero norm means the vector itself is all zeros; dividing by a tiny
    # epsilon keeps the result at zero rather than NaN.
    norm[norm == 0] = 1e-6
    return arr / norm


class RoommateMatcher:
    """Rank candidate roommates by similarity to a given user.

    Each user is turned into three unit-length feature blocks (free-text
    embedding, one-hot financial arrangement, scaled location/budget), the
    blocks are weighted and concatenated, and candidates are ranked by
    cosine similarity to the current user.
    """

    # Relative weight of each feature block. The SAME weights must be applied
    # to the current user and to the candidates; otherwise two identical
    # profiles cannot reach a similarity of 100.
    # NOTE(review): 0.6/0.1/0.3 is the weighting previously applied to the
    # candidate side — confirm this is the intended balance.
    TEXT_WEIGHT = 0.6
    FINANCIAL_WEIGHT = 0.1
    NUMERIC_WEIGHT = 0.3

    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.financial_encoder = OneHotEncoder(
            sparse_output=False, handle_unknown="ignore"
        )
        self.scaler = MinMaxScaler()
        self.is_fitted = False

        # Fit encoder in advance with known categories to avoid all-zero rows
        self.financial_encoder.fit([["split-rent"], ["single-payment"]])

    def predict(
        self,
        current_user: Dict,
        other_users: List[Dict],
        top_k: int = 10,
    ) -> List[Dict]:
        """Return the candidates most similar to ``current_user``.

        Args:
            current_user: Profile dict; must contain ``financials``,
                ``location``, ``budget_min`` and ``budget_max``. The optional
                keys ``personal_description``, ``occupation`` and
                ``social_preference`` feed the text embedding.
            other_users: Candidate profile dicts with the same keys.
            top_k: Maximum number of candidates to return (default 10).

        Returns:
            Up to ``top_k`` candidate records sorted by descending similarity.
            Each record carries the candidate's original fields plus
            ``combined_text`` and ``similarity`` (a percentage rounded to two
            decimals). An empty list is returned when there are no candidates.
        """
        # Nothing to rank; also avoids transforming with an unfitted scaler.
        if not other_users:
            return []

        # NOTE: the scaler is fitted once, on the first non-empty candidate
        # list, and reused for every later call.
        if not self.is_fitted:
            self._fit_encoders(other_users)

        others_df = pd.DataFrame(other_users)

        # === TEXT VECTOR ===
        # Build the text from the original dicts rather than DataFrame rows:
        # a key missing from one dict becomes NaN in the frame, which would
        # otherwise be embedded as the literal text "nan".
        others_df['combined_text'] = [self._build_text(u) for u in other_users]
        text_embeds = np.asarray(
            self.text_model.encode(others_df['combined_text'].tolist())
        )
        text_block = safe_normalize(text_embeds)

        # === FINANCIAL VECTOR ===
        # The encoder was fitted on plain lists, so pass a plain array to
        # avoid scikit-learn's feature-name mismatch warning.
        fin_block = safe_normalize(
            self.financial_encoder.transform(others_df[['financials']].to_numpy())
        )

        # === NUMERIC VECTOR ===
        num_block = safe_normalize(
            self.scaler.transform(self._numeric_features(other_users))
        )

        # === CURRENT USER VECTORS ===
        current_embed = np.asarray(
            self.text_model.encode(self._build_text(current_user))
        )
        current_text = safe_normalize(current_embed.reshape(1, -1))

        current_fin = safe_normalize(
            self.financial_encoder.transform([[current_user['financials']]])
        )

        current_num = safe_normalize(
            self.scaler.transform(self._numeric_features([current_user]))
        )

        # === STACK FEATURES ===
        combined_existing = self._weighted_stack(text_block, fin_block, num_block)
        current_block = self._weighted_stack(current_text, current_fin, current_num)

        # === SIMILARITY ===
        others_df['similarity'] = np.round(
            cosine_similarity(current_block, combined_existing)[0] * 100, 2
        )

        ranked = others_df.sort_values('similarity', ascending=False)
        return ranked.head(top_k).to_dict('records')

    def _fit_encoders(self, users: List[Dict]):
        """Fit the min-max scaler on the location and budget columns of ``users``.

        Raises:
            ValueError: If ``users`` is empty.
        """
        if not users:
            raise ValueError("Cannot fit encoders on an empty list of users")
        self.scaler.fit(self._numeric_features(users))
        self.is_fitted = True

    @staticmethod
    def _build_text(user: Dict) -> str:
        """Join a user's description, occupation and social preferences into one string."""
        prefs = user.get('social_preference') or []
        if isinstance(prefs, str):
            # A bare string would otherwise be split into single characters.
            prefs = [prefs]
        raw_parts = [
            user.get('personal_description'),
            user.get('occupation'),
            *prefs,
        ]
        # Skip missing values and empty strings so they add no stray tokens.
        parts = [str(p) for p in raw_parts if p is not None]
        return " ".join(filter(None, parts))

    @staticmethod
    def _numeric_features(users: List[Dict]) -> np.ndarray:
        """Return one row per user: location coordinates, then budget_min and budget_max."""
        return np.array(
            [[*u['location'], u['budget_min'], u['budget_max']] for u in users],
            dtype=float,
        )

    @classmethod
    def _weighted_stack(
        cls, text: np.ndarray, fin: np.ndarray, num: np.ndarray
    ) -> np.ndarray:
        """Concatenate the three feature blocks column-wise using the class weights."""
        return np.hstack([
            text * cls.TEXT_WEIGHT,
            fin * cls.FINANCIAL_WEIGHT,
            num * cls.NUMERIC_WEIGHT,
        ])