Iris314 committed on
Commit
b9a4372
·
verified ·
1 Parent(s): dce0f27

Upload 8 files

Browse files
recipe_recommendation/src/__init__.py ADDED
File without changes
recipe_recommendation/src/candidate.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from .feature import extract_features
4
+ from .io import load_ingredient_map
5
+ import joblib
6
+
7
+ # Load ingredient map globally to avoid repeated I/O
8
+ INGREDIENT_MAP = load_ingredient_map()
9
+ PARENTS = INGREDIENT_MAP["parents"]
10
+ CHILDREN = INGREDIENT_MAP["children"]
11
+
12
def extract_user_parents(user_ingredients):
    """Map each user ingredient to its parent category.

    Ingredients found in CHILDREN resolve to their recorded parent;
    ingredients that are themselves parents map to themselves; anything
    unknown is silently ignored.
    """
    parents = set()
    for raw in user_ingredients:
        name = raw.lower().strip()
        if name in CHILDREN:
            parents.add(CHILDREN[name]["parent"])
        elif name in PARENTS:
            parents.add(name)
    return parents
23
+
24
+
25
def hard_filter(recipe: dict, user_profile: dict, debug=False) -> bool:
    """
    Minimal hard filter: reject recipes that violate the user's diet
    (vegan / vegetarian) or contain a disliked main ingredient.

    Args:
        recipe: recipe record; reads "name", "is_vegan_safe",
            "is_vegetarian_safe" and "main_parent".
        user_profile: user profile; reads diet.vegetarian_type and
            other_preferences.disliked_main.
        debug: when True, print why a recipe was rejected or kept.

    Returns:
        True when the recipe passes the filter, False otherwise.
    """
    recipe_name = recipe.get("name", "Unknown")

    # --- Dietary filter ---
    diet = user_profile.get("diet", {}).get("vegetarian_type", "").lower()
    if diet == "vegan" and not recipe.get("is_vegan_safe", True):
        if debug:
            print(f"❌ {recipe_name}: Not vegan-safe")
        return False
    if diet in ["vegetarian", "flexible_vegetarian"] and not recipe.get("is_vegetarian_safe", True):
        if debug:
            print(f"❌ {recipe_name}: Not vegetarian-safe")
        return False

    # --- Disliked main ingredients filter ---
    disliked_main = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
    if disliked_main:
        recipe_main = recipe.get("main_parent", set())
        # Accept any common collection type. Previously only list/set were
        # handled; a tuple or frozenset fell through to an empty set and
        # silently bypassed the disliked-main filter.
        if isinstance(recipe_main, (list, tuple, set, frozenset)):
            recipe_main = set(recipe_main)
        else:
            recipe_main = set()

        overlap = recipe_main & disliked_main
        if overlap:
            if debug:
                print(f"❌ {recipe_name}: Contains disliked {overlap}")
            return False

    if debug:
        print(f"✅ {recipe_name}: PASS hard filter")

    return True
69
+
70
+
71
+
72
# Relative weights for the coarse (stage-2) ranking signals.
COARSE_WEIGHTS = {
    "main_match_ratio": 1.0,          # pantry coverage of main ingredients
    "staple_match_ratio": 0.3,        # pantry coverage of staples
    "other_match_ratio": 0.6,         # pantry coverage of other ingredients
    "low_calorie_penalty": 0.2,       # bonus when calories fall under the threshold
    "preferred_course_overlap": 0.1,  # overlap with preferred course types
    "region_match": 0.8,              # recipe region matches user preference
    "preferred_main_overlap": 1,      # overlap with user's preferred mains
}


def coarse_score(features, weights=COARSE_WEIGHTS):
    """Weighted sum of the coarse-ranking features present in `features`.

    Weight keys that are missing from `features` contribute nothing.
    """
    return sum((w * features[k] for k, w in weights.items() if k in features), 0.0)
89
+
90
+
91
def coarse_rank_candidates(recipes, user_parents, user_profile, top_n=30000, weights=COARSE_WEIGHTS):
    """
    Stage 2: coarse ranking (NumPy vectorized).

    Scores every recipe by pantry-coverage ratios (main / staple / other),
    calorie fit, preferred course types, region preference and preferred
    main ingredients, then returns the best candidates (score > 0 and at
    least 50% of the top score) as a list of the original recipe dicts.
    """
    if not recipes:
        return []

    # --- 1. Parent vocabulary shared by the main/staple/other fields ---
    vocab = sorted({
        parent
        for recipe in recipes
        for field in ("main_parent", "staple_parent", "other_parent")
        for parent in (recipe.get(field) or [])
    })
    col_of = {parent: col for col, parent in enumerate(vocab)}
    n_recipes, n_parents = len(recipes), len(vocab)

    # --- 2. Multi-hot matrices: one row per recipe, one column per parent ---
    mats = {
        field: np.zeros((n_recipes, n_parents), dtype=np.uint8)
        for field in ("main_parent", "staple_parent", "other_parent")
    }
    for row, recipe in enumerate(recipes):
        for field, mat in mats.items():
            for parent in recipe.get(field, []):
                if parent in col_of:
                    mat[row, col_of[parent]] = 1
    main_mat = mats["main_parent"]
    staple_mat = mats["staple_parent"]
    other_mat = mats["other_parent"]

    # --- 3. User pantry as a binary mask over the vocabulary ---
    pantry = np.zeros(n_parents, dtype=np.uint8)
    for parent in user_parents:
        if parent in col_of:
            pantry[col_of[parent]] = 1

    # --- 4. Coverage ratios per category (guard against empty categories) ---
    main_ratio = (main_mat @ pantry) / np.maximum(main_mat.sum(axis=1), 1)
    staple_ratio = (staple_mat @ pantry) / np.maximum(staple_mat.sum(axis=1), 1)
    other_ratio = (other_mat @ pantry) / np.maximum(other_mat.sum(axis=1), 1)

    # --- 5. Additional signals: calories, course types, region, mains ---
    calories = np.array([r.get("calories", 0) for r in recipes], dtype=float)
    # Despite the weight's name, this is a *bonus* for recipes at or under
    # the user's calorie threshold.
    low_calorie_bonus = (calories <= user_profile.get("calorie_threshold", 9999)).astype(float)

    wanted_courses = set(user_profile.get("preferred_course_types", []))
    course_overlap = np.array(
        [len(set(r.get("cuisine_attr", [])) & wanted_courses) for r in recipes],
        dtype=float,
    )

    wanted_regions = set(user_profile.get("region_preference", []))

    def _regions_of(recipe):
        # "region" may be a collection or a bare string/missing value.
        value = recipe.get("region")
        return value if isinstance(value, (list, set)) else [recipe.get("region", "")]

    region_match = np.array(
        [1.0 if any(reg in wanted_regions for reg in _regions_of(r)) else 0.0
         for r in recipes],
        dtype=float,
    )

    preferred_main = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
    if preferred_main:
        preferred_main_overlap = np.array(
            [len(set(r.get("main_parent", [])) & preferred_main) for r in recipes],
            dtype=float,
        )
    else:
        preferred_main_overlap = np.zeros(len(recipes))

    # --- 6. Weighted sum of all signals ---
    scores = (
        weights["main_match_ratio"] * main_ratio
        + weights["staple_match_ratio"] * staple_ratio
        + weights["other_match_ratio"] * other_ratio
        + weights["low_calorie_penalty"] * low_calorie_bonus
        + weights["preferred_course_overlap"] * course_overlap
        + weights.get("region_match", 0) * region_match
        + weights.get("preferred_main_overlap", 0) * preferred_main_overlap
    )

    # --- 7. Keep positive scores, then dynamic threshold at 50% of the max ---
    valid_idx = np.where(scores > 0)[0]
    if valid_idx.size == 0:
        return []
    topk = min(top_n, valid_idx.size)

    keep_idx = valid_idx[scores[valid_idx] >= scores[valid_idx].max() * 0.5]
    if keep_idx.size == 0:
        return []

    best_first = np.argsort(scores[keep_idx])[::-1]
    # Return the original recipe dicts for the top candidates.
    return [recipes[i] for i in keep_idx[best_first[:topk]]]
220
+
221
+
222
def rule_generate_candidates(df, user_parents, user_profile):
    """
    Step 3: rule-based reranking of coarse candidates (vectorized).

    Builds the same per-recipe feature frame used for ML inference,
    combines a fixed set of weighted rules into `match_score`, drops
    non-positive scores and returns the surviving rows sorted best-first.
    """
    if df.empty:
        return df

    pantry = set(user_parents)
    recipes_for_inference = [
        {
            "main": row.get("main_parent", set()),
            "staple": row.get("staple_parent", set()),
            "other": row.get("other_parent", set()),
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(row.get("main_parent", set()) & pantry),
            "matched_staple": len(row.get("staple_parent", set()) & pantry),
            "matched_other": len(row.get("other_parent", set()) & pantry),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        }
        for _, row in df.iterrows()
    ]

    feats = extract_features(recipes_for_inference, user_profile)

    # Base score: weighted ingredient coverage.
    scores = (
        2.0 * feats["main_match_ratio"]
        + 1.0 * feats["staple_match_ratio"]
        + 1.0 * feats["other_match_ratio"]
    )

    # Optional nutrition-driven adjustments from the user profile.
    if user_profile.get("low_calorie", False):
        scores += 0.5 * feats["low_calorie_penalty"]
    if user_profile.get("high_protein", False):
        scores += 0.3 * (feats["protein_ratio"] > 0.25)
    if user_profile.get("low_fat", False):
        scores -= 0.3 * (feats["fat_ratio"] > 0.35)

    # Preference / constraint adjustments.
    scores += 0.5 * feats["region_match"]
    scores += 0.4 * feats["preferred_course_overlap"]
    scores += 0.3 * feats["preferred_main_overlap"]
    scores += 0.3 * feats["within_cooking_time"]
    scores -= 0.2 * feats["missing_main_count"]

    result = df.copy()
    result["match_score"] = np.maximum(scores, 0.0)

    result = result[result["match_score"] > 0]
    if result.empty:
        return result
    return result.sort_values("match_score", ascending=False).reset_index(drop=True)
282
+
283
+
284
def ml_generate_candidates(coarse_candidates, user_parents, user_profile, model_path, topk=5):
    """
    Step 3: ML-based reranking of the coarse (Step 2) candidates.

    Builds features in the same layout used at training time, scores each
    candidate with the persisted model loaded from `model_path`, min-max
    normalizes the scores, and returns the top-k rows sorted by `ml_score`.
    """
    # Handle empty / missing input up front.
    if coarse_candidates is None or len(coarse_candidates) == 0:
        print("No candidates provided for ML reranking.")
        return pd.DataFrame()

    # Coarse candidates may arrive as a list of dicts or as a DataFrame.
    if isinstance(coarse_candidates, list):
        df = pd.DataFrame(coarse_candidates)
    else:
        df = coarse_candidates.copy()
    if df.empty:
        print("Coarse candidates DataFrame is empty.")
        return df

    # Load the trained model from disk.
    model = joblib.load(model_path)

    pantry = set(user_parents)
    recipes_for_inference = [
        {
            "main": row.get("main_parent", set()),
            "staple": row.get("staple_parent", set()),
            "other": row.get("other_parent", set()),
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(row.get("main_parent", set()) & pantry),
            "matched_staple": len(row.get("staple_parent", set()) & pantry),
            "matched_other": len(row.get("other_parent", set()) & pantry),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        }
        for _, row in df.iterrows()
    ]

    feature_df = extract_features(recipes_for_inference, user_profile)

    # Classifiers expose probabilities; regressors fall back to predict().
    if hasattr(model, "predict_proba"):
        df["ml_score"] = model.predict_proba(feature_df)[:, 1]
    else:
        df["ml_score"] = model.predict(feature_df)

    # Min-max normalize to [0, 1] when the scores are not all identical.
    low, high = df["ml_score"].min(), df["ml_score"].max()
    if len(df) > 0 and high > low:
        df["ml_score"] = (df["ml_score"] - low) / (high - low)

    return df.sort_values("ml_score", ascending=False).head(topk).reset_index(drop=True)
344
+
345
+
346
+
recipe_recommendation/src/coldstart.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import json
4
+ import random
5
+ import pandas as pd
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import warnings
9
+
10
+ from .candidate import coarse_rank_candidates, hard_filter, rule_generate_candidates
11
+ from .feature import build_features
12
+ from .io import load_recipes_csv, load_ingredient_map
13
+
14
+ RECIPES_PATH = load_recipes_csv()
15
+ INGREDIENT_MAP = load_ingredient_map()
16
+ PARENTS = INGREDIENT_MAP["parents"]
17
+ CHILDREN = INGREDIENT_MAP["children"]
18
+
19
def parse_list(x):
    """Safely convert a stringified list into a Python list.

    Returns [] for missing/empty/unparseable values; lists pass through.
    """
    # The isinstance check MUST come before pd.isna: pd.isna() on a
    # multi-element list returns an element-wise boolean array, and using
    # that array in `if ... or ...` raises ValueError (ambiguous truth value).
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == "":
        return []
    try:
        return ast.literal_eval(x)
    except Exception:
        return []
29
+
30
def parse_set(x):
    """Safely convert a stringified collection into a Python set.

    Sets pass through; lists/tuples are converted; strings are parsed with
    ast.literal_eval (falling back to a singleton of the stripped string);
    missing values become an empty set; anything else becomes a singleton.
    """
    # Collection checks MUST come before pd.isna: pd.isna() on a
    # multi-element sequence returns an element-wise boolean array, and
    # using that array in `if ... or ...` raises ValueError.
    if isinstance(x, set):
        return x
    if isinstance(x, (list, tuple)):
        return set(x)
    if pd.isna(x) or x == "":
        return set()
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            if isinstance(v, (list, tuple, set)):
                return set(v)
            return {v}
        except Exception:
            return {x.strip()}
    return {x}
47
+
48
+ def _parents_pool_from_df(df: pd.DataFrame):
49
+ cols = ["main_parent", "staple_parent", "other_parent", "seasoning_parent"]
50
+ pool = set()
51
+ for c in cols:
52
+ if c in df.columns:
53
+ for s in df[c]:
54
+ pool |= set(s) if isinstance(s, (set, list, tuple)) else set()
55
+ return sorted(pool)
56
+
57
+
58
def sample_user_parents(parents_pool,
                        user_profile=None,
                        prev_inventory=None,
                        min_items=3, max_items=10,
                        keep_ratio=0.6, reset_interval=20, round_idx=0):
    """Sample a synthetic pantry for one cold-start round.

    Liked mains are drawn with 3x weight, disliked/forbidden parents are
    excluded, and a fraction of the previous round's inventory is carried
    over unless the round index hits a periodic reset.
    """
    prefs = (user_profile or {}).get("other_preferences", {})
    liked = set(prefs.get("preferred_main", []))
    disliked = set(prefs.get("disliked_main", []))
    forbidden = set((user_profile or {}).get("forbidden_parents", [])) | disliked

    # Build the sampling pool, weighting liked parents more heavily.
    pool, weights = [], []
    for parent in parents_pool:
        if parent in forbidden:
            continue
        pool.append(parent)
        weights.append(3.0 if parent in liked else 1.0)
    if not pool:
        # Everything was forbidden: fall back to the full pool, unweighted.
        pool = parents_pool[:]
        weights = [1.0] * len(parents_pool)

    inventory = set()
    force_reset = (round_idx % reset_interval == 0)
    if prev_inventory and not force_reset:
        carried = list(prev_inventory)
        random.shuffle(carried)
        keep_k = max(0, int(len(carried) * keep_ratio))
        inventory.update(carried[:keep_k])

    # Top the inventory up with weighted draws (duplicates collapse in the set).
    target = random.randint(min_items, max_items)
    remaining = max(0, target - len(inventory))
    for _ in range(min(remaining, len(pool))):
        pick = random.choices(range(len(pool)), weights=weights, k=1)[0]
        inventory.add(pool[pick])
    return list(inventory)
89
+
90
+
91
+ def _weighted_pick3(indexes, scores, temperature=1.0):
92
+ idxs = list(indexes)
93
+ scs = np.array(scores, dtype=float)
94
+ if np.any(scs < 0):
95
+ scs = scs - scs.min()
96
+ if scs.sum() == 0:
97
+ scs = np.ones_like(scs)
98
+ picks = []
99
+ for _ in range(min(3, len(idxs))):
100
+ probs = np.exp(scs / max(temperature, 1e-6))
101
+ probs = probs / probs.sum()
102
+ choice = np.random.choice(len(idxs), p=probs)
103
+ picks.append(idxs[choice])
104
+ idxs.pop(choice)
105
+ scs = np.delete(scs, choice)
106
+ if len(idxs) == 0:
107
+ break
108
+ return picks
109
+
110
+
111
# ---------- Main cold-start ----------
def cold_start_ranker(user_id: str,
                      n_rounds: int = 2000,
                      topn_coarse: int = 5000,
                      topk_rule: int = 3,
                      batch_size: int = 5000,
                      switch_interval: int = 100):
    """
    Generate cold-start learning-to-rank training data for one user.

    Each round samples a synthetic pantry, runs the coarse (Step 2) and
    rule-based (Step 3) ranking stages, deterministically scores the top
    candidates (pantry coverage, region, cooking time, calories, protein,
    liked/disliked mains), labels the best three with relevance 3/2/1 and
    appends one feature row per candidate. Writes `user_features_rank.csv`
    under the user's data directory and returns its path, or None when no
    usable rows were produced.
    """
    # Resolve the user's data directory (primary, then fallback location).
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    if not os.path.exists(base_dir):
        base_dir = os.path.join("recipe_recommendation", "input_user_data", user_id)
    if not os.path.exists(base_dir):
        raise FileNotFoundError(
            f"❌ User profile not found for '{user_id}' in either 'recipe_recommendation/user_data' or 'recipe_recommendation/input_user_data'."
        )
    print(f"[cold_start_ranker] Using base_dir = {base_dir}")

    profile_path = os.path.join(base_dir, "user_profile.json")
    features_path = os.path.join(base_dir, "user_features_rank.csv")

    # Skip regeneration when the features file already exists.
    if os.path.exists(features_path):
        print(f"[cold_start] Features already exist at {features_path}")
        return features_path

    with open(profile_path, "r", encoding="utf-8") as f:
        user_profile = json.load(f)

    # Load recipes and normalize the stringified collection columns.
    df_all = pd.read_csv(RECIPES_PATH)
    for col in ["main_parent", "staple_parent", "other_parent", "seasoning_parent", "cuisine_attr"]:
        if col in df_all.columns:
            df_all[col] = df_all[col].apply(parse_set)
    for col in ["ingredients"]:
        if col in df_all.columns:
            df_all[col] = df_all[col].apply(parse_list)

    # Step 1: hard dietary / dislike filter (best-effort: never abort on error).
    if hard_filter is not None:
        try:
            before = len(df_all)
            keep = df_all.apply(lambda r: hard_filter(r.to_dict(), user_profile), axis=1)
            df_all = df_all[keep]
            print(f"[cold_start] Step1 hard filter applied: {before} -> {len(df_all)}")
        except Exception as e:
            warnings.warn(f"[cold_start] hard_filter failed, skip. err={e}")

    # Rotate through recipe chunks so consecutive rounds see different slices.
    n_chunks = (len(df_all) // batch_size) + 1
    chunks = np.array_split(df_all, n_chunks)
    parents_pool = _parents_pool_from_df(df_all)
    rows = []
    prev_inventory = None

    for i in tqdm(range(n_rounds), desc="Cold-start rounds"):
        df_chunk = chunks[(i // switch_interval) % n_chunks].copy()

        # Sample this round's pantry (partially carried over between rounds).
        user_parents = sample_user_parents(
            parents_pool,
            user_profile=user_profile,
            prev_inventory=prev_inventory,
            round_idx=i
        )
        prev_inventory = user_parents

        # Step 2: coarse recall.
        coarse_list = coarse_rank_candidates(
            recipes=df_chunk.to_dict(orient="records"),
            user_parents=user_parents,
            user_profile=user_profile,
            top_n=min(topn_coarse, len(df_chunk))
        )
        if not coarse_list:
            continue

        # Step 3: rule rerank, then keep the head as this round's slate.
        rule_df = rule_generate_candidates(
            pd.DataFrame(coarse_list),
            user_parents=user_parents,
            user_profile=user_profile
        )
        if rule_df.empty or len(rule_df) < topk_rule:
            continue
        top5 = rule_df.head(topk_rule).copy()

        # ===== Deterministic scoring: feasibility + region + soft constraints =====
        user_set = set(user_parents)
        scored_candidates = []

        # Nutrition goals from the profile.
        ng = user_profile.get("nutritional_goals", {})
        cal_min = ng.get("calories", {}).get("min", 0)
        cal_max = ng.get("calories", {}).get("max", 1e9)
        pro_min = ng.get("protein", {}).get("min", 0)
        pro_max = ng.get("protein", {}).get("max", 1e9)

        # Ingredient preferences and time budget.
        liked = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
        disliked = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
        max_cooking_time = user_profile.get("other_preferences", {}).get("cooking_time_max", None)

        for idx, row in top5.iterrows():
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))

            # 1) Feasibility: require >= 50% coverage of main + staple needs.
            main_match = len(main_set & user_set)
            staple_match = len(staple_set & user_set)
            total_needed = max(1, len(main_set) + len(staple_set))
            coverage_ratio = (main_match + staple_match) / total_needed
            if coverage_ratio < 0.5:
                continue

            # 2) Region preference.
            region_score = 1.0 if row.get("region_match", 0) else 0.0

            # 3) Cooking time soft constraint (full credit within +/-20%;
            #    no time budget or no duration means full credit).
            time_score = 1.0
            time_val = row.get("minutes", None)
            if max_cooking_time and time_val is not None:
                try:
                    t_val = float(time_val)
                    t_max = float(max_cooking_time)
                    if 0.8 * t_max <= t_val <= 1.2 * t_max:
                        time_score = 1.0
                    else:
                        time_score = max(0.0, 1.0 - abs(t_val - t_max) / t_max)
                except (TypeError, ValueError):
                    time_score = 0.0

            # 4) Calories soft constraint (full credit within 30% of goal center).
            cal_score = 1.0
            cal_val = row.get("calories", None)
            if cal_val is not None and cal_min < cal_max:
                try:
                    c_val = float(cal_val)
                    cal_center = 0.5 * (cal_min + cal_max)
                    tol = 0.3 * cal_center
                    if cal_center - tol <= c_val <= cal_center + tol:
                        cal_score = 1.0
                    else:
                        cal_score = max(0.0, 1.0 - abs(c_val - cal_center) / cal_center)
                except (TypeError, ValueError):
                    cal_score = 0.0

            # 4b) Protein soft constraint (full credit within 20% of goal center).
            protein_score = 1.0
            protein_val = row.get("protein", None)
            if protein_val is not None and pro_min < pro_max:
                try:
                    p_val = float(protein_val)
                    pro_center = 0.5 * (pro_min + pro_max)
                    tol = 0.2 * pro_center
                    if pro_center - tol <= p_val <= pro_center + tol:
                        protein_score = 1.0
                    else:
                        protein_score = max(0.0, 1.0 - abs(p_val - pro_center) / pro_center)
                except (TypeError, ValueError):
                    protein_score = 0.0

            # 5) Liked / disliked main bonuses.
            like_bonus = 1.0 if main_set & liked else 0.0
            dislike_penalty = 1.0 if main_set & disliked else 0.0

            # 6) Weighted total.
            score = (
                0.5 * coverage_ratio
                + 0.15 * region_score
                + 0.1 * time_score
                + 0.1 * cal_score
                + 0.05 * protein_score
                + 0.05 * like_bonus
                - 0.05 * dislike_penalty
            )
            scored_candidates.append((idx, score))

        # Best three candidates get graded relevance labels 3 / 2 / 1.
        scored_candidates.sort(key=lambda item: item[1], reverse=True)
        picked_idxs = [idx for idx, _ in scored_candidates[:3]]
        labels = {idx: 0 for idx in top5.index}
        for rank, idx in enumerate(picked_idxs):
            labels[idx] = 3 - rank

        # Build feature rows for every candidate in the slate.
        for idx, row in top5.iterrows():
            up = set(user_parents)
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            other_set = set(row.get("other_parent", set()))

            recipe_dict = {
                "main": main_set,
                "staple": staple_set,
                "other": other_set,
                "seasoning": set(row.get("seasoning_parent", set())),
                "matched_main": len(main_set & up),
                "matched_staple": len(staple_set & up),
                "matched_other": len(other_set & up),
                "calories": row.get("calories", 0),
                "protein": row.get("protein", 0),
                "fat": row.get("fat", 0),
                "region": row.get("region", ""),
                "cuisine_attr": row.get("cuisine_attr", []),
                "ingredients": row.get("ingredients", []),
                "minutes": row.get("minutes", None),
            }

            feats = build_features(recipe_dict, user_profile)
            feats["relevance"] = float(labels[idx])
            feats["qid"] = int(i)
            rows.append(feats)

    out = pd.DataFrame(rows)
    if out.empty or "qid" not in out.columns:
        print(f"[cold_start] No valid training data generated for {user_id}, skipping save.")
        return None

    # Keep only query groups with more than one candidate (required for ranking).
    group_sizes = out.groupby("qid").size()
    out = out[out["qid"].isin(group_sizes[group_sizes > 1].index)].reset_index(drop=True)

    os.makedirs(base_dir, exist_ok=True)
    out_path = os.path.join(base_dir, "user_features_rank.csv")
    out.to_csv(out_path, index=False)
    print(f"[cold_start] Saved {len(out)} rows to {out_path}")
    return out_path
377
+
378
+
379
if __name__ == "__main__":
    # NOTE: earlier revisions passed coverage_penalty=0.15 and
    # temperature=0.5 here, but cold_start_ranker() accepts no such
    # parameters — the call raised TypeError before doing any work.
    cold_start_ranker(
        user_id="user_1",
        n_rounds=10000,
        topn_coarse=20000,
        topk_rule=5,
    )
recipe_recommendation/src/embedding.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
def profile_to_embedding(profile):
    """
    Encode a normalized user profile as a fixed-length numeric vector.

    Layout: diet one-hot (3) | allergies multi-hot (6) | regions multi-hot (6)
    | nutrition goals (4) | preferred mains multi-hot (8) | cooking time (1)
    = 28 dimensions.
    """
    # 1. Diet one-hot. Values outside the vocabulary encode as all zeros.
    # NOTE(review): other modules compare against "vegan" and
    # "flexible_vegetarian", which are not in this vocabulary — confirm the
    # intended diet value set.
    diet_types = ["vegetarian", "flexible", "non_vegetarian"]
    diet_vec = np.zeros(len(diet_types))
    diet_value = profile.get("diet", {}).get("vegetarian_type", "flexible")
    if diet_value in diet_types:
        diet_vec[diet_types.index(diet_value)] = 1

    # 2. Allergies multi-hot over a fixed vocabulary.
    allergy_vocab = ["milk", "gluten", "peanut", "shrimp", "egg", "soy"]
    allergies = set(profile.get("allergies", []))
    allergy_vec = np.array([float(a in allergies) for a in allergy_vocab])

    # 3. Region preferences multi-hot.
    region_vocab = ["North America", "Latin America", "Europe", "Asia", "Middle East", "Africa"]
    regions = set(profile.get("region_preference", []))
    region_vec = np.array([float(r in regions) for r in region_vocab])

    # 4. Nutrition goals, normalized by fixed caps (4000 kcal / 300 g protein).
    goals = profile.get("nutritional_goals", {})
    cal = goals.get("calories", {})
    pro = goals.get("protein", {})
    nutrition_vec = np.array([
        cal.get("min", 0) / 4000,
        min(cal.get("max", 9999), 4000) / 4000,
        pro.get("min", 0) / 300,
        min(pro.get("max", 999), 300) / 300,
    ])

    # 5. Preferred main ingredients multi-hot.
    main_vocab = ["chicken", "tofu", "beef", "salmon", "eggs", "pork", "beans", "mushroom"]
    mains = set(profile.get("other_preferences", {}).get("preferred_main", []))
    main_vec = np.array([float(m in mains) for m in main_vocab])

    # 6. Max cooking time, normalized against a 120-minute cap (0 if unset).
    t = profile.get("other_preferences", {}).get("cooking_time_max")
    time_vec = np.array([min(t / 120, 1)]) if t is not None else np.array([0])

    return np.concatenate([diet_vec, allergy_vec, region_vec,
                           nutrition_vec, main_vec, time_vec])
60
+
61
+
62
def profile_similarity(profile_a, profile_b):
    """Cosine similarity between the embeddings of two user profiles."""
    vec_a = profile_to_embedding(profile_a).reshape(1, -1)
    vec_b = profile_to_embedding(profile_b).reshape(1, -1)
    return cosine_similarity(vec_a, vec_b)[0, 0]
67
+
68
def find_most_similar_user(target_user_id, user_data_dir="recipe_recommendation/user_data", threshold=0.85):
    """
    Scan `user_data_dir` for the user whose profile embedding is most
    similar to `target_user_id`'s.

    Returns (user_id, similarity) when the best cosine similarity reaches
    `threshold`, otherwise (None, -1). Raises FileNotFoundError when the
    target user has no profile.
    """
    target_profile_path = os.path.join(user_data_dir, target_user_id, "user_profile.json")
    if not os.path.exists(target_profile_path):
        raise FileNotFoundError(f"[embedding] No profile found for user {target_user_id}")

    with open(target_profile_path, "r", encoding="utf-8") as f:
        target_emb = profile_to_embedding(json.load(f)).reshape(1, -1)

    best_match, best_score = None, -1
    for uid in os.listdir(user_data_dir):
        if uid == target_user_id:
            continue
        profile_path = os.path.join(user_data_dir, uid, "user_profile.json")
        if not os.path.exists(profile_path):
            continue  # skip directories without a profile file
        with open(profile_path, "r", encoding="utf-8") as f:
            candidate_profile = json.load(f)
        candidate_emb = profile_to_embedding(candidate_profile).reshape(1, -1)
        sim = cosine_similarity(target_emb, candidate_emb)[0, 0]
        if sim > best_score:
            best_match, best_score = uid, sim

    if best_match and best_score >= threshold:
        print(f"[embedding] Found similar user: {best_match} (similarity={best_score:.3f})")
        return best_match, best_score

    return None, -1
recipe_recommendation/src/feature.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from .io import load_ingredient_map
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ # Load ingredient map globally to avoid repeated I/O
7
+ INGREDIENT_MAP = load_ingredient_map()
8
+ PARENTS = INGREDIENT_MAP["parents"]
9
+ CHILDREN = INGREDIENT_MAP["children"]
10
+
11
# Canonical ordered list of feature columns for the ranking model.
# extract_features() reindexes every feature frame to exactly this schema so
# training and inference matrices always line up column-for-column.
FEATURE_COLS = [
    "main_match_ratio", "other_match_ratio", "staple_match_ratio",
    "missing_main_count", "missing_other_count", "missing_staple_count",
    "calories", "protein", "fat", "protein_ratio", "fat_ratio",
    "region_match",
    "is_vegan_safe", "is_vegetarian_safe_absolute", "is_flexible_safe_absolute", "is_user_diet_safe",
    "preferred_main_overlap", "disliked_main_overlap",
    "preferred_course_overlap",
    "within_cooking_time", "cooking_time_over",
    "calories_value", "calories_deviation",
    "protein_value", "protein_deviation",
]
23
+
24
+
25
def extract_features(recipes: list[dict], user_profile: dict):
    """
    Convert a list of recipes into a feature matrix for the ML model.

    Each recipe is turned into one feature row via build_features(), and the
    resulting frame is reindexed to FEATURE_COLS so the column order is fixed.
    """
    feature_rows = []
    for recipe in recipes:
        feature_rows.append(build_features(recipe, user_profile))

    frame = pd.DataFrame(feature_rows).reindex(columns=FEATURE_COLS, fill_value=0)
    assert list(frame.columns) == FEATURE_COLS, "Feature columns mismatch!"
    return frame
35
+
36
def is_recipe_vegetarian_safe(ingredients: list[str], veg_type: str) -> bool:
    """
    Check if the recipe is safe for a given dietary type.
    Supported veg_type: "vegan", "vegetarian", "flexible_vegetarian", "" (none).
    """
    for raw in ingredients:
        key = raw.strip().lower()

        if key in CHILDREN:
            entry = CHILDREN[key]
        elif key in PARENTS:
            entry = PARENTS[key]
        else:
            # Ingredients missing from the map are treated as safe by default.
            continue

        if veg_type == "vegan":
            if not entry.get("vegan_safe", True):
                return False
        elif veg_type == "vegetarian":
            if not entry.get("vegetarian_safe", True):
                return False
        elif veg_type == "flexible_vegetarian":
            # Flexible vegetarians allow most ingredients except explicit meat;
            # vegetarian_safe serves as a proxy for that flexibility here.
            if not entry.get("vegetarian_safe", True):
                return False

    return True
61
+
62
+
63
def build_features(recipe: dict, user_profile: dict) -> dict:
    """
    Build a feature dictionary for ML ranker and rule-based scoring.
    All features are numeric scalars or counts.

    The keys produced here must match FEATURE_COLS (extract_features reindexes
    to that list).  Feature groups: ingredient-match ratios, nutrition values,
    region match, diet safety flags, preference overlaps, cooking-time fit,
    and calorie/protein deviation from the user's nutritional goals.
    """
    features = {}

    # ======================================================
    # 1. Ingredient matching ratios
    # ======================================================
    # NOTE(review): assumes recipe["matched_main"/"matched_other"/"matched_staple"]
    # were populated upstream by the candidate matcher — confirm against caller.
    total_main = len(recipe.get("main", []))
    total_other = len(recipe.get("other", []))
    total_staple = len(recipe.get("staple", []))

    # max(..., 1) guards division by zero for recipes with an empty category.
    features["main_match_ratio"] = recipe.get("matched_main", 0) / max(total_main, 1)
    features["other_match_ratio"] = recipe.get("matched_other", 0) / max(total_other, 1)
    features["staple_match_ratio"] = recipe.get("matched_staple", 0) / max(total_staple, 1)

    features["missing_main_count"] = total_main - recipe.get("matched_main", 0)
    features["missing_other_count"] = total_other - recipe.get("matched_other", 0)
    features["missing_staple_count"] = total_staple - recipe.get("matched_staple", 0)

    # ======================================================
    # 2. Basic nutrition info
    # ======================================================
    # `or 0.0` also coerces explicit None values (not just missing keys) to 0.
    calories = recipe.get("calories", 0.0) or 0.0
    protein = recipe.get("protein", 0.0) or 0.0
    fat = recipe.get("fat", 0.0) or 0.0

    features["calories"] = calories
    features["protein"] = protein
    features["fat"] = fat
    features["protein_ratio"] = protein / max(calories, 1)
    features["fat_ratio"] = fat / max(calories, 1)

    # ======================================================
    # 3. Region preference
    # ======================================================
    # Recipe region may be a scalar string, a collection, NaN (from pandas), or
    # missing — normalize everything to a list of strings first.
    recipe_region = recipe.get("region", [])
    if recipe_region is None or recipe_region == "" or (isinstance(recipe_region, float) and np.isnan(recipe_region)):
        recipe_regions = []
    elif isinstance(recipe_region, (set, list, tuple)):
        recipe_regions = list(recipe_region)
    else:
        recipe_regions = [recipe_region]

    user_regions = user_profile.get("region_preference", [])
    if isinstance(user_regions, str):
        user_regions = [user_regions]

    # Case-insensitive comparison of the two region sets.
    recipe_regions_norm = {str(r).strip().lower() for r in recipe_regions if r}
    user_regions_norm = {str(r).strip().lower() for r in user_regions if r}

    features["region_match"] = int(len(recipe_regions_norm & user_regions_norm) > 0)

    # ======================================================
    # 4. Diet constraints
    # ======================================================
    ingredients_all = recipe.get("ingredients", [])

    # Absolute safety flags (independent of this user's diet) plus one flag
    # specific to the user's declared vegetarian type.
    features["is_vegan_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegan"))
    features["is_vegetarian_safe_absolute"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegetarian"))
    features["is_flexible_safe_absolute"] = int(is_recipe_vegetarian_safe(ingredients_all, "flexible_vegetarian"))

    veg_type = (user_profile.get("diet", {}).get("vegetarian_type", "") or "").lower()
    features["is_user_diet_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, veg_type))

    # ======================================================
    # 5. Preferred & disliked main
    # ======================================================
    recipe_main = set(recipe.get("main", []))
    preferred_main = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
    disliked_main = set(user_profile.get("other_preferences", {}).get("disliked_main", []))

    features["preferred_main_overlap"] = 1.0 if recipe_main & preferred_main else 0.0
    features["disliked_main_overlap"] = 1.0 if recipe_main & disliked_main else 0.0

    # ======================================================
    # 6. Course type preference
    # ======================================================
    recipe_types = set(recipe.get("cuisine_attr", []))
    preferred_types = set(user_profile.get("preferred_course_types", []))
    features["preferred_course_overlap"] = len(recipe_types & preferred_types)

    # ======================================================
    # 7. Cooking time features
    # ======================================================
    max_time = user_profile.get("other_preferences", {}).get("cooking_time_max", None)
    recipe_time = recipe.get("minutes", None)

    if max_time is not None and recipe_time is not None:
        try:
            recipe_time_val = float(recipe_time)
            max_time_val = float(max_time)
            features["within_cooking_time"] = 1.0 if recipe_time_val <= max_time_val else 0.0
            features["cooking_time_over"] = max(0.0, recipe_time_val - max_time_val)
        except (TypeError, ValueError):
            # Unparseable times: treat as not-within, no overage.
            features["within_cooking_time"] = 0.0
            features["cooking_time_over"] = 0.0
    else:
        # No time constraint (or no recipe time) counts as within limits.
        features["within_cooking_time"] = 1.0
        features["cooking_time_over"] = 0.0

    # ======================================================
    # 8. Calories / Protein deviation features
    # ======================================================
    # Deviation is measured relative to the midpoint of the user's [min, max]
    # goal range; when no valid range exists, both features default to 0.
    ng = user_profile.get("nutritional_goals", {})
    cal_min = ng.get("calories", {}).get("min", 0)
    cal_max = ng.get("calories", {}).get("max", 1e9)
    pro_min = ng.get("protein", {}).get("min", 0)
    pro_max = ng.get("protein", {}).get("max", 1e9)

    # --- Calories deviation ---
    if calories is not None and cal_min < cal_max:
        try:
            cal_center = 0.5 * (cal_min + cal_max)
            features["calories_value"] = float(calories)
            features["calories_deviation"] = (float(calories) - cal_center) / cal_center
        except (TypeError, ValueError):
            features["calories_value"] = 0.0
            features["calories_deviation"] = 0.0
    else:
        features["calories_value"] = 0.0
        features["calories_deviation"] = 0.0

    # --- Protein deviation ---
    if protein is not None and pro_min < pro_max:
        try:
            pro_center = 0.5 * (pro_min + pro_max)
            features["protein_value"] = float(protein)
            features["protein_deviation"] = (float(protein) - pro_center) / pro_center
        except (TypeError, ValueError):
            features["protein_value"] = 0.0
            features["protein_deviation"] = 0.0
    else:
        features["protein_value"] = 0.0
        features["protein_deviation"] = 0.0

    return features
202
+
203
+
204
def build_cluster_features(candidates):
    """
    Build simple ingredient + cuisine based feature vectors for KMeans clustering.
    This is separate from model training features.

    Args:
        candidates (list[dict]): list of recipe dicts.

    Returns:
        np.ndarray: binary feature matrix (num_candidates, num_features)
    """
    # Fields contributing to the vector, in fixed column-group order.
    field_names = ("main_parent", "staple_parent", "other_parent", "cuisine_attr")

    # 1. Collect a sorted vocabulary per field.
    vocabs = []
    for field in field_names:
        values = set()
        for recipe in candidates:
            values.update(recipe.get(field, []) or [])
        vocabs.append(sorted(values))

    # 2. Assign each vocabulary term a column in one flat index space.
    column_maps = []
    offset = 0
    for vocab in vocabs:
        column_maps.append({term: offset + i for i, term in enumerate(vocab)})
        offset += len(vocab)

    X = np.zeros((len(candidates), offset), dtype=np.uint8)

    # 3. Set a 1 for every term present in each candidate.
    for row, recipe in enumerate(candidates):
        for field, columns in zip(field_names, column_maps):
            for term in recipe.get(field, []) or []:
                if term in columns:
                    X[row, columns[term]] = 1

    return X
recipe_recommendation/src/highlight.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.cluster import KMeans
3
+ from sklearn.preprocessing import StandardScaler
4
+ import numpy as np
5
+
6
+
7
def print_candidates(candidates, user_parents, topk=10):
    """
    Pretty-print the top-k candidate recipes with score, region, cuisine,
    calories, and per-category ingredient coverage.

    Args:
        candidates: DataFrame of ranked recipes; must contain 'match_score'
            and 'name' columns (other columns are optional).
        user_parents: set of parent-ingredient names the user has on hand;
            ingredients in this set are marked with a check, others a cross.
        topk: number of rows to print.
    """
    # Scores are displayed relative to the best candidate, so the top recipe
    # shows as ~100%.  (Removed unused locals: `shown` counter and `min_score`.)
    max_score = candidates['match_score'].max()

    for _, row in candidates.head(topk).iterrows():
        scaled_score = 100 * row['match_score'] / (max_score + 1e-9)
        print(f"{row['name']} (score {scaled_score:.1f}%)")

        # ----- Region -----
        region = row.get("region", None)
        if pd.notna(region) and isinstance(region, str) and region.strip() and region.lower() != "unavailable":
            print(f"  region: {region}")

        # ----- Cuisine Attributes -----
        cuisine = row.get("cuisine_attr", None)
        if cuisine is not None and not (isinstance(cuisine, float) and pd.isna(cuisine)):
            # Convert set to list for printing
            if isinstance(cuisine, set):
                cuisine = list(cuisine)
            elif isinstance(cuisine, str):
                cuisine = [cuisine]

            if isinstance(cuisine, list) and len(cuisine) > 0:
                print(f"  cuisine: {', '.join(cuisine)}")

        # ----- Nutrition -----
        print(f"  calories: {row.get('calories', 'N/A')}")

        # ----- Ingredient Marking -----
        def mark_list(lst):
            return [("✅ " + ing) if ing in user_parents else ("❌ " + ing) for ing in lst]

        print(f"  staple: {mark_list(row.get('staple_parent', []))}")
        print(f"  main: {mark_list(row.get('main_parent', []))}")
        print(f"  seasoning: {row.get('seasoning_parent', [])}")
        print(f"  other: {mark_list(row.get('other_parent', []))}")
        print("-" * 40)
47
+
48
def diversify_topk_with_min_clusters(
    ranked_candidates,
    feature_matrix,
    top_k=5,
    n_clusters=20,
    min_clusters=3,
    random_state=42
):
    """
    Diversify top-k displayed recipes using KMeans clustering.
    Ensures that the final top_k contains at least `min_clusters` distinct
    clusters, then fills the remaining slots purely by rank order.

    Args:
        ranked_candidates: candidates sorted best-first; parallel to the rows
            of feature_matrix.
        feature_matrix: numeric feature rows used only for clustering.
        top_k: number of candidates to return.
        n_clusters: KMeans cluster count (capped at len(ranked_candidates)).
        min_clusters: minimum distinct clusters in the result.
        random_state: KMeans seed for reproducibility.
    """
    if len(ranked_candidates) == 0:
        return []

    n_clusters = min(n_clusters, len(ranked_candidates))
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(feature_matrix)

    # KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    cluster_ids = kmeans.fit_predict(X_scaled)

    # Step 1: take the highest-ranked candidate of each new cluster until
    # min_clusters distinct clusters (or top_k picks) are reached.
    picked_idx = []
    picked_clusters = set()
    for i, c in enumerate(cluster_ids):
        if c not in picked_clusters:
            picked_idx.append(i)
            picked_clusters.add(c)
            if len(picked_clusters) >= min_clusters or len(picked_idx) >= top_k:
                break

    # Step 2: fill the rest purely by rank order.
    # Fix: track picked POSITIONS in a set (O(1) membership) instead of the
    # old `candidate not in picked` equality test, which was O(n) per check
    # and wrongly skipped distinct candidates that compare equal
    # (e.g. duplicate recipe dicts).
    if len(picked_idx) < top_k:
        already = set(picked_idx)
        for i in range(len(ranked_candidates)):
            if i not in already:
                picked_idx.append(i)
                if len(picked_idx) >= top_k:
                    break

    return [ranked_candidates[i] for i in picked_idx]
90
+
91
+
recipe_recommendation/src/io.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ # Hugging Face ID
6
+ REPO_ID = "Iris314/recipe-cleaned"
7
+
8
+ ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
9
+ DATA_DIR = os.path.join(ROOT_DIR, "data")
10
+ os.makedirs(DATA_DIR, exist_ok=True)
11
+
12
+
13
def download_file(filename: str) -> str:
    """
    Ensure `filename` from the HF dataset repo exists under DATA_DIR and
    return its local path.  Downloads only when the file is missing.
    """
    local_path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(local_path):
        # Fix: messages previously printed a literal "(unknown)" placeholder
        # instead of the actual file name.
        print(f"Downloading {filename} from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
            local_dir=DATA_DIR,
            local_dir_use_symlinks=False
        )
    else:
        print(f"{filename} already exists locally.")
    return local_path
28
+
29
+
30
def load_recipes_csv() -> str:
    """Return the local path to the cached recipes.csv, downloading it on first use."""
    return download_file("recipes.csv")
32
+
33
+
34
def load_ingredient_map() -> dict:
    """Download (if needed) and parse the ingredient map JSON file."""
    map_path = download_file("ingredient_map.data")
    with open(map_path, "r", encoding="utf-8") as fh:
        raw = fh.read()
    return json.loads(raw)
recipe_recommendation/src/trainmodel.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import joblib
3
+ import warnings
4
+ import numpy as np
5
+ import pandas as pd
6
+ from typing import List, Tuple, Sequence, Optional
7
+ from xgboost import XGBRanker
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import ndcg_score
10
+ from pandas.api.types import is_numeric_dtype
11
+ from .feature import FEATURE_COLS
12
+ from datetime import datetime
13
+
14
+
15
+
16
+ # ----------------------------- Helpers -----------------------------
17
+ def _pick_feature_cols(df: pd.DataFrame, drop_cols: Sequence[str]) -> List[str]:
18
+ """
19
+ Pick numeric feature columns robustly, excluding drop_cols.
20
+ Uses pandas is_numeric_dtype to correctly include nullable ints/floats/bools.
21
+ """
22
+ cols = []
23
+ for c in df.columns:
24
+ if c in drop_cols:
25
+ continue
26
+ if is_numeric_dtype(df[c]):
27
+ cols.append(c)
28
+ return cols
29
+
30
+
31
+ def _sort_and_pack_by_qid(
32
+ X: pd.DataFrame, y: pd.Series, qid: pd.Series, feature_cols: List[str]
33
+ ) -> Tuple[pd.DataFrame, np.ndarray, List[int], np.ndarray]:
34
+ """
35
+ Sort rows by qid so that group sizes match the sample order.
36
+ Returns:
37
+ X_sorted, y_sorted, groups, qid_sorted (aligned with X_sorted/y_sorted)
38
+ """
39
+ packed = X.copy()
40
+ packed["_label"] = y.values
41
+ packed["_qid"] = qid.values
42
+ packed = packed.sort_values("_qid").reset_index(drop=True)
43
+
44
+ groups = packed.groupby("_qid").size().tolist()
45
+ X_sorted = packed[feature_cols].copy()
46
+ y_sorted = packed["_label"].astype(float).values
47
+ qid_sorted = packed["_qid"].values
48
+ return X_sorted, y_sorted, groups, qid_sorted
49
+
50
+
51
def _eval_mean_ndcg(
    model: XGBRanker,
    X_val: pd.DataFrame,
    y_val,  # can be np.ndarray or pd.Series
    qid_val,  # aligned with X_val/y_val
    ks: Sequence[int] = (5, 10),
) -> dict:
    """
    Compute mean NDCG@k for each k in ks over validation queries.
    Accepts numpy arrays or pandas Series.
    """
    # Respect the early-stopping best iteration when the model exposes one
    # (xgboost >= 2.0); otherwise fall back to a plain predict.
    try:
        preds = model.predict(X_val, iteration_range=(0, model.best_iteration + 1))
    except Exception:
        preds = model.predict(X_val)

    labels = np.asarray(y_val)
    queries = np.asarray(qid_val)

    results = {}
    for k in ks:
        per_query = []
        for q in np.unique(queries):
            sel = queries == q
            # NDCG is undefined for single-document queries; skip them.
            if sel.sum() >= 2:
                per_query.append(ndcg_score([labels[sel]], [preds[sel]], k=k))
        results[f"NDCG@{k}"] = float(np.mean(per_query)) if per_query else 0.0
    return results
81
+
82
+
83
+
84
+ # ----------------------------- Main Trainer -----------------------------
85
def train_model_ranker(
    user_id: str = "user_1",
    features_path: Optional[str] = None,
    save_model: bool = True,
    model_params: Optional[dict] = None,
    val_ratio: float = 0.2,
    random_state: int = 42,
    max_rows: Optional[int] = None,
):
    """
    Train an XGBoost Learning-to-Rank model (XGBRanker) on cold-start generated data.

    Expected input CSV (from cold_start.py):
        - qid: query id (one round of pantry sampling = one query)
        - relevance: graded relevance label (e.g., 3/2/1/0)
        - features: numeric columns produced by build_features (and any extra numeric signals)

    The function:
        1) Reads the CSV
        2) Aligns feature columns to FEATURE_COLS
        3) Splits train/val by qid to avoid leakage
        4) Sorts each split by qid and builds group sizes aligned to sample order
        5) Trains XGBRanker and reports NDCG@5/10 (evaluated and logged ONCE —
           the previous version computed metrics twice and saved the model
           twice, ignoring save_model=False)
        6) Saves model to user_data/<user_id>/ranker.pkl only when save_model=True

    Returns:
        (model, metrics, feature_cols)
    """
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    os.makedirs(base_dir, exist_ok=True)

    # Resolve features path
    if features_path is None:
        features_path = os.path.join(base_dir, "user_features_rank.csv")
    if not os.path.exists(features_path):
        raise FileNotFoundError(
            f"[train_model_ranker] Cold-start features not found at: {features_path}\n"
            f"Please run cold_start_ranker(user_id='{user_id}') first."
        )

    # Load data (optionally subsampled for quick iterations)
    df = pd.read_csv(features_path)
    if max_rows is not None and len(df) > max_rows:
        df = df.sample(max_rows, random_state=random_state).reset_index(drop=True)

    # Basic validation
    if "qid" not in df.columns or "relevance" not in df.columns:
        raise ValueError("Input CSV must contain 'qid' and 'relevance' columns.")

    # Fill NaNs in label/qid (should not happen, but defensive)
    df["qid"] = pd.to_numeric(df["qid"], errors="coerce").fillna(-1).astype(int)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(float)

    # Use the canonical feature schema so train/serve matrices always align
    feature_cols = FEATURE_COLS.copy()
    df = df.reindex(columns=["qid", "relevance"] + feature_cols, fill_value=0)

    # Ensure numeric + finite values only (replace inf/nan with 0)
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Split by qid to avoid leakage across queries
    unique_qids = df["qid"].unique()
    if len(unique_qids) < 2:
        warnings.warn("Only one unique qid found — ranking training may be ineffective.")
        train_mask = np.ones(len(df), dtype=bool)
        val_mask = np.zeros(len(df), dtype=bool)
    else:
        train_qids, val_qids = train_test_split(
            unique_qids, test_size=val_ratio, random_state=random_state
        )
        train_mask = df["qid"].isin(train_qids)
        val_mask = df["qid"].isin(val_qids)

    # Split dataframes AFTER defining masks
    X_train_raw = df.loc[train_mask, feature_cols]
    y_train_raw = df.loc[train_mask, "relevance"]
    qid_train = df.loc[train_mask, "qid"]

    X_val_raw = df.loc[val_mask, feature_cols]
    y_val_raw = df.loc[val_mask, "relevance"]
    qid_val = df.loc[val_mask, "qid"]

    # Sort by qid and build group sizes aligned with sample order (CRITICAL for XGBRanker)
    X_train, y_train, group_train, _ = _sort_and_pack_by_qid(
        X_train_raw, y_train_raw, qid_train, feature_cols
    )
    X_val, y_val, group_val, qid_val_sorted = _sort_and_pack_by_qid(
        X_val_raw, y_val_raw, qid_val, feature_cols
    )

    print(f"[ranker] #Train groups: {len(group_train)} | #Val groups: {len(group_val)}")
    print(f"[ranker] Train rows: {len(X_train)} | Val rows: {len(X_val)} | #Features: {len(feature_cols)}")

    # Default model params (overridable via model_params)
    default_params = dict(
        objective="rank:ndcg",
        eval_metric="ndcg",
        n_estimators=400,
        learning_rate=0.08,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        tree_method="hist",
        reg_lambda=1.0,
        reg_alpha=0.0,
    )
    if model_params:
        default_params.update(model_params)

    model = XGBRanker(**default_params)

    # Fit model (XGBRanker requires group sizes for eval_set as well)
    fit_kwargs = dict(
        X=X_train,
        y=y_train,
        group=group_train,
        eval_set=[(X_val, y_val)],
        eval_group=[group_val],
        verbose=False,
    )

    try:
        # Newer xgboost versions (some builds) support early_stopping_rounds on Ranker
        model.fit(early_stopping_rounds=50, **fit_kwargs)  # maximize=True is inferred by 'ndcg'
    except TypeError:
        # Fallback to callback API (older versions)
        try:
            from xgboost.callback import EarlyStopping
            model.fit(callbacks=[EarlyStopping(rounds=50, save_best=True, maximize=True)], **fit_kwargs)
        except Exception:
            # Last resort: train without early stopping
            model.fit(**fit_kwargs)

    # Evaluate mean NDCG@5/10 exactly once
    metrics = _eval_mean_ndcg(model, X_val, y_val, qid_val_sorted, ks=(5, 10))
    print("[ranker] Validation metrics:", " ".join(f"{k}={v:.4f}" for k, v in metrics.items()))

    # Append metrics to the per-user training log (datetime imported at module top)
    log_path = os.path.join(base_dir, "training_log.txt")
    with open(log_path, "a", encoding="utf-8") as f:
        ndcg5 = metrics.get("NDCG@5", 0.0)
        ndcg10 = metrics.get("NDCG@10", 0.0)
        f.write(f"{datetime.now().isoformat()} | NDCG@5={ndcg5:.4f}, NDCG@10={ndcg10:.4f}\n")
    print(f"[ranker] Logged metrics to {log_path}")

    # Save the model only when requested
    if save_model:
        model_path = os.path.join(base_dir, "ranker.pkl")
        joblib.dump(model, model_path)
        print(f"[ranker] Model saved to {model_path}")

    return model, metrics, feature_cols
249
+
250
+
251
+
252
+
253
if __name__ == "__main__":
    # Example run: trains a ranker for user_1 with default hyperparameters.
    train_model_ranker(
        user_id="user_1",
        save_model=True,
        val_ratio=0.2,
        random_state=42,
        max_rows=None,  # or set an upper bound for quick iterations, e.g., 200_000
        model_params=None,  # override defaults if desired
    )