File size: 4,118 Bytes
b9a4372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _multi_hot(values, vocab):
    """Return a 0/1 vector marking which entries of ``vocab`` occur in ``values``.

    ``values`` may be ``None`` (e.g. a JSON ``null``), which is treated as empty.
    """
    present = set(values or [])
    return np.array([1.0 if item in present else 0.0 for item in vocab])


def _unit_fraction(value, scale, default):
    """Return ``value / scale`` clamped to [0, 1]; ``None`` falls back to ``default``."""
    if value is None:
        value = default
    return min(max(value / scale, 0.0), 1.0)


def profile_to_embedding(profile):
    """
    Convert a normalized user profile into a fixed-length numeric embedding.

    Embedding structure (in order):
        [diet (3)] + [allergies (6)] + [region (6)] +
        [nutritional goals (4)] + [preferred_main (8)] + [cooking_time (1)]
    Total dimension: 28.

    Args:
        profile: dict-like profile. Missing sections, and sections or fields
            explicitly set to ``None`` (JSON ``null``), are treated as absent
            instead of raising.

    Returns:
        1-D float ``numpy.ndarray`` of length 28 with every component in [0, 1].
    """
    vecs = []

    # 1. Diet (one-hot). A missing key defaults to "flexible"; an unknown value
    #    leaves the one-hot segment all zeros.
    diet_types = ["vegetarian", "flexible", "non_vegetarian"]
    diet_vec = np.zeros(len(diet_types))
    diet_value = (profile.get("diet") or {}).get("vegetarian_type", "flexible")
    if diet_value in diet_types:
        diet_vec[diet_types.index(diet_value)] = 1
    vecs.append(diet_vec)

    # 2. Allergies (multi-hot)
    allergy_vocab = ["milk", "gluten", "peanut", "shrimp", "egg", "soy"]
    vecs.append(_multi_hot(profile.get("allergies"), allergy_vocab))

    # 3. Region preferences (multi-hot)
    region_vocab = ["North America", "Latin America", "Europe", "Asia", "Middle East", "Africa"]
    vecs.append(_multi_hot(profile.get("region_preference"), region_vocab))

    # 4. Nutritional goals, scaled by 4000 (calories) and 300 (protein).
    #    Both bounds are clamped so an out-of-range "min" cannot leave [0, 1].
    ng = profile.get("nutritional_goals") or {}
    cal = ng.get("calories") or {}
    pro = ng.get("protein") or {}
    vecs.append(np.array([
        _unit_fraction(cal.get("min"), 4000, 0),
        _unit_fraction(cal.get("max"), 4000, 9999),  # no upper bound given -> saturates at 1
        _unit_fraction(pro.get("min"), 300, 0),
        _unit_fraction(pro.get("max"), 300, 999),
    ]))

    # 5. Preferred main ingredients (multi-hot)
    other = profile.get("other_preferences") or {}
    main_vocab = ["chicken", "tofu", "beef", "salmon", "eggs", "pork", "beans", "mushroom"]
    vecs.append(_multi_hot(other.get("preferred_main"), main_vocab))

    # 6. Cooking time max (normalized to [0, 1], assume 120 min upper bound).
    #    An absent limit is encoded as 0, same as the other "not specified" slots.
    vecs.append(np.array([_unit_fraction(other.get("cooking_time_max"), 120, 0)]))

    return np.concatenate(vecs)


def profile_similarity(profile_a, profile_b):
    """Return the cosine similarity between the embeddings of two user profiles."""
    # cosine_similarity works on 2-D inputs, so each 1-D embedding is lifted
    # to a single-row matrix before the comparison.
    row_a, row_b = (
        profile_to_embedding(p)[np.newaxis, :] for p in (profile_a, profile_b)
    )
    similarity_matrix = cosine_similarity(row_a, row_b)
    # One row against one row yields a 1x1 matrix; return its only entry.
    return similarity_matrix[0, 0]

def find_most_similar_user(target_user_id, user_data_dir="recipe_recommendation/user_data", threshold=0.85):
    """
    Find the most similar existing user based on profile embeddings.

    Every sub-directory of ``user_data_dir`` other than ``target_user_id`` that
    contains a ``user_profile.json`` is embedded and compared to the target
    profile with cosine similarity.

    Args:
        target_user_id: directory name of the user to find a match for.
        user_data_dir: directory holding one sub-directory per user.
        threshold: minimum similarity the best candidate must reach.

    Returns:
        ``(best_match_user_id, similarity_score)``, or ``(None, -1)`` when no
        candidate reaches ``threshold``.

    Raises:
        FileNotFoundError: if the target user has no ``user_profile.json``.
    """
    target_profile_path = os.path.join(user_data_dir, target_user_id, "user_profile.json")
    if not os.path.isfile(target_profile_path):
        raise FileNotFoundError(f"[embedding] No profile found for user {target_user_id}")

    with open(target_profile_path, "r", encoding="utf-8") as f:
        target_profile = json.load(f)
    target_emb = profile_to_embedding(target_profile).reshape(1, -1)

    best_match, best_score = None, -1

    # os.listdir returns entries in arbitrary order; sorting makes the winner
    # deterministic when several users tie on similarity.
    for uid in sorted(os.listdir(user_data_dir)):
        if uid == target_user_id:
            continue
        profile_path = os.path.join(user_data_dir, uid, "user_profile.json")
        if not os.path.isfile(profile_path):
            continue

        # A broken profile of some other user must not abort the search for
        # this one: report it and move on. json.JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        try:
            with open(profile_path, "r", encoding="utf-8") as f:
                other_profile = json.load(f)
        except (OSError, ValueError) as exc:
            print(f"[embedding] Skipping unreadable profile for user {uid}: {exc}")
            continue
        if not isinstance(other_profile, dict):
            print(f"[embedding] Skipping malformed profile for user {uid}")
            continue

        other_emb = profile_to_embedding(other_profile).reshape(1, -1)
        sim = cosine_similarity(target_emb, other_emb)[0, 0]
        if sim > best_score:
            best_match, best_score = uid, sim

    if best_match is not None and best_score >= threshold:
        print(f"[embedding] Found similar user: {best_match} (similarity={best_score:.3f})")
        return best_match, best_score

    return None, -1