Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- recipe_recommendation/src/__init__.py +0 -0
- recipe_recommendation/src/candidate.py +346 -0
- recipe_recommendation/src/coldstart.py +387 -0
- recipe_recommendation/src/embedding.py +100 -0
- recipe_recommendation/src/feature.py +257 -0
- recipe_recommendation/src/highlight.py +91 -0
- recipe_recommendation/src/io.py +37 -0
- recipe_recommendation/src/trainmodel.py +262 -0
recipe_recommendation/src/__init__.py
ADDED
|
File without changes
|
recipe_recommendation/src/candidate.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from .feature import extract_features
|
| 4 |
+
from .io import load_ingredient_map
|
| 5 |
+
import joblib
|
| 6 |
+
|
| 7 |
+
# Load ingredient map globally to avoid repeated I/O
# The map exposes two sub-dicts:
#   "children" - child ingredient name -> record with a "parent" key
#                (used below as CHILDREN[name]["parent"])
#   "parents"  - parent-category lookup (structure not shown here;
#                presumably name-keyed -- verify in io.load_ingredient_map)
INGREDIENT_MAP = load_ingredient_map()
PARENTS = INGREDIENT_MAP["parents"]
CHILDREN = INGREDIENT_MAP["children"]
|
| 11 |
+
|
| 12 |
+
def extract_user_parents(user_ingredients):
    """Map the user's ingredient names to their parent categories.

    Each name is lower-cased and stripped, then resolved through the
    global CHILDREN mapping (child -> parent); names that are already
    parent categories are kept as-is. Names found in neither mapping
    are silently dropped.
    """
    def _resolve(raw_name):
        key = raw_name.lower().strip()
        if key in CHILDREN:
            return CHILDREN[key]["parent"]
        if key in PARENTS:
            return key
        return None

    return {parent for parent in map(_resolve, user_ingredients) if parent is not None}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# def hard_filter(recipe, user_profile):
|
| 26 |
+
# diet = user_profile.get("diet", {}).get("vegetarian_type", "").lower()
|
| 27 |
+
# if diet == "vegan" and not recipe.get("is_vegan_safe", True):
|
| 28 |
+
# return False
|
| 29 |
+
# if diet in ["vegetarian", "flexible_vegetarian"] and not recipe.get("is_vegetarian_safe", True):
|
| 30 |
+
# return False
|
| 31 |
+
# return True
|
| 32 |
+
|
| 33 |
+
def hard_filter(recipe: dict, user_profile: dict, debug=False) -> bool:
    """
    Minimal hard filter: only vegan/vegetarian safety & disliked mains.

    A recipe is rejected when:
      * the user is vegan and the recipe is not vegan-safe,
      * the user is (flexible) vegetarian and the recipe is not
        vegetarian-safe,
      * the recipe's main parent ingredients intersect the user's
        disliked mains.

    Missing safety flags default to True, so unknown recipes pass.
    """
    recipe_name = recipe.get("name", "Unknown")
    diet = user_profile.get("diet", {}).get("vegetarian_type", "").lower()

    # --- Dietary gate ---
    if diet == "vegan" and not recipe.get("is_vegan_safe", True):
        if debug:
            print(f"❌ {recipe_name}: Not vegan-safe")
        return False
    if diet in ("vegetarian", "flexible_vegetarian") and not recipe.get("is_vegetarian_safe", True):
        if debug:
            print(f"❌ {recipe_name}: Not vegetarian-safe")
        return False

    # --- Disliked main ingredients gate ---
    disliked_main = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
    if disliked_main:
        raw_main = recipe.get("main_parent", set())
        # Normalize to a set; any other type (e.g. a stray string) is
        # treated as "no mains", matching the original behavior.
        main_parents = set(raw_main) if isinstance(raw_main, (list, set)) else set()
        overlap = main_parents & disliked_main
        if overlap:
            if debug:
                print(f"❌ {recipe_name}: Contains disliked {overlap}")
            return False

    if debug:
        print(f"✅ {recipe_name}: PASS hard filter")

    return True
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Weights for the coarse (Stage 2) linear scoring model.
COARSE_WEIGHTS = {
    "main_match_ratio": 1.0,
    "staple_match_ratio": 0.3,
    "other_match_ratio": 0.6,
    "low_calorie_penalty": 0.2,
    "preferred_course_overlap": 0.1,
    "region_match": 0.8,
    "preferred_main_overlap": 1
}


def coarse_score(features, weights=COARSE_WEIGHTS):
    """Return the weighted linear combination of the features present in
    ``weights``; weight keys absent from ``features`` contribute nothing."""
    return sum(
        (weight * features[name] for name, weight in weights.items() if name in features),
        0.0,
    )
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def coarse_rank_candidates(recipes, user_parents, user_profile, top_n=30000, weights=COARSE_WEIGHTS):
    """
    Stage 2: Coarse Ranking (NumPy vectorized implementation)
    ---------------------------------------------------------
    Quickly retrieves a subset of candidate recipes by computing
    ingredient coverage ratios (main / staple / other) between
    the user's pantry and the recipes using vectorized operations.

    This function replaces the original Python loop version
    for significant speedup during cold start and real-time ranking.

    Args:
        recipes: list of recipe dicts with *_parent iterables plus
            "calories", "cuisine_attr", "region" fields.
        user_parents: iterable of the user's pantry parent ingredients.
        user_profile: profile dict; reads "calorie_threshold",
            "preferred_course_types", "region_preference" and
            other_preferences.preferred_main.
        top_n: maximum number of candidates returned.
        weights: linear scoring weights (defaults to COARSE_WEIGHTS).

    Returns:
        List of the original recipe dicts, best-scoring first; empty list
        when no recipe scores above zero.

    NOTE(review): despite the name, "low_calorie_penalty" is a *bonus*
    (1.0 when calories <= threshold), consistent with its positive weight.
    """
    if not recipes:
        return []

    # === 1. Build parent vocabulary ===
    # Extract all unique parent ingredients across main/staple/other fields.
    all_parents = sorted({
        p for r in recipes
        for k in ["main_parent", "staple_parent", "other_parent"]
        for p in (r.get(k) or [])
    })
    parent_index = {p: i for i, p in enumerate(all_parents)}
    num_recipes = len(recipes)
    num_parents = len(all_parents)

    # === 2. Construct multi-hot matrices for main, staple, other ===
    # Each row corresponds to a recipe; each column to a parent ingredient.
    main_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)
    staple_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)
    other_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)

    for i, r in enumerate(recipes):
        for p in r.get("main_parent", []):
            if p in parent_index:
                main_mat[i, parent_index[p]] = 1
        for p in r.get("staple_parent", []):
            if p in parent_index:
                staple_mat[i, parent_index[p]] = 1
        for p in r.get("other_parent", []):
            if p in parent_index:
                other_mat[i, parent_index[p]] = 1

    # === 3. Encode user pantry as a binary mask ===
    user_mask = np.zeros(num_parents, dtype=np.uint8)
    for p in user_parents:
        if p in parent_index:
            user_mask[parent_index[p]] = 1

    # === 4. Compute ingredient match ratios in batch ===
    # main_ratio = (# of matched main ingredients) / (# of total main ingredients)
    main_total = main_mat.sum(axis=1)
    staple_total = staple_mat.sum(axis=1)
    other_total = other_mat.sum(axis=1)

    # Matrix-vector product counts matched parents per recipe.
    main_match = (main_mat @ user_mask)
    staple_match = (staple_mat @ user_mask)
    other_match = (other_mat @ user_mask)

    # np.maximum(..., 1) guards against division by zero for recipes
    # with no ingredients in a category (their ratio becomes 0).
    main_ratio = main_match / np.maximum(main_total, 1)
    staple_ratio = staple_match / np.maximum(staple_total, 1)
    other_ratio = other_match / np.maximum(other_total, 1)

    # === 5. Additional coarse ranking signals ===
    # Low-calorie preference & preferred cuisine overlap
    calories = np.array([r.get("calories", 0) for r in recipes], dtype=float)
    calorie_threshold = user_profile.get("calorie_threshold", 9999)
    # 1.0 when within the calorie budget, else 0.0 (a bonus, see NOTE above).
    low_calorie_penalty = (calories <= calorie_threshold).astype(float)

    preferred_course_types = set(user_profile.get("preferred_course_types", []))
    preferred_overlap = np.array([
        len(set(r.get("cuisine_attr", [])) & preferred_course_types)
        for r in recipes
    ], dtype=float)

    # Region preference matching
    # "region" may be a scalar or a list/set; both are handled.
    preferred_regions = set(user_profile.get("region_preference", []))
    region_match = np.array([
        1.0 if any(region in preferred_regions for region in
                   (r.get("region", []) if isinstance(r.get("region"), (list, set))
                    else [r.get("region", "")]))
        else 0.0
        for r in recipes
    ], dtype=float)

    # === Preferred main ingredients ===
    preferred_main = set(user_profile.get("other_preferences", {}).get("preferred_main", []))

    if preferred_main:
        preferred_main_overlap = np.array([
            len(set(r.get("main_parent", [])) & preferred_main)
            for r in recipes
        ], dtype=float)
        # print(f"[coarse_rank] Preferred main: {preferred_main}, matches: {np.sum(preferred_main_overlap > 0)}")
    else:
        preferred_main_overlap = np.zeros(len(recipes))

    # === 6. Compute coarse ranking scores ===
    scores = (
        weights["main_match_ratio"] * main_ratio +
        weights["staple_match_ratio"] * staple_ratio +
        weights["other_match_ratio"] * other_ratio +
        weights["low_calorie_penalty"] * low_calorie_penalty +
        weights["preferred_course_overlap"] * preferred_overlap +
        weights.get("region_match", 0) * region_match +
        weights.get("preferred_main_overlap", 0) * preferred_main_overlap
    )

    # === 7. Select top-N candidates ===
    valid_idx = np.where(scores > 0)[0]
    if valid_idx.size == 0:
        return []

    scores_valid = scores[valid_idx]
    # NOTE: topk is computed before the dynamic threshold below, so it is
    # an upper bound on the returned count, not an exact size.
    topk = min(top_n, valid_idx.size)

    # Optional dynamic thresholding: keep candidates with score >= 50% of max
    max_score = scores_valid.max()
    keep_mask = scores_valid >= max_score * 0.5
    keep_idx = valid_idx[keep_mask]

    if keep_idx.size == 0:
        return []

    # Sort the surviving candidates by score, best first.
    order = np.argsort(scores[keep_idx])[::-1]
    top_idx = keep_idx[order[:topk]]

    # Return the original recipe dicts corresponding to the top candidates
    return [recipes[i] for i in top_idx]
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def rule_generate_candidates(df, user_parents, user_profile):
    """
    Step 3: Rule-based reranking of coarse candidates (vectorized).
    This replaces the slow df.apply(score) loop with one-shot feature extraction.

    Args:
        df: DataFrame of coarse candidates; *_parent columns hold sets.
        user_parents: iterable of the user's pantry parent ingredients.
        user_profile: profile dict; "low_calorie" / "high_protein" /
            "low_fat" flags toggle the corresponding score adjustments.

    Returns:
        Copy of ``df`` with a "match_score" column, filtered to strictly
        positive scores and sorted descending; may be empty.
    """

    if df.empty:
        return df

    # Hoisted loop invariant: set(user_parents) used to be rebuilt three
    # times for every row.
    user_set = set(user_parents)

    # Build per-recipe dicts in the schema extract_features expects.
    recipes_for_inference = []
    for _, row in df.iterrows():
        main = row.get("main_parent", set())
        staple = row.get("staple_parent", set())
        other = row.get("other_parent", set())
        recipes_for_inference.append({
            "main": main,
            "staple": staple,
            "other": other,
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(main & user_set),
            "matched_staple": len(staple & user_set),
            "matched_other": len(other & user_set),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        })

    feats_df = extract_features(recipes_for_inference, user_profile)

    # Base score: weighted pantry coverage.
    scores = (
        2.0 * feats_df["main_match_ratio"] +
        1.0 * feats_df["staple_match_ratio"] +
        1.0 * feats_df["other_match_ratio"]
    )

    # Optional nutrition adjustments driven by profile flags.
    if user_profile.get("low_calorie", False):
        scores += 0.5 * feats_df["low_calorie_penalty"]

    if user_profile.get("high_protein", False):
        scores += 0.3 * (feats_df["protein_ratio"] > 0.25)

    if user_profile.get("low_fat", False):
        scores -= 0.3 * (feats_df["fat_ratio"] > 0.35)

    # Always-on preference signals.
    scores += 0.5 * feats_df["region_match"]
    scores += 0.4 * feats_df["preferred_course_overlap"]
    scores += 0.3 * feats_df["preferred_main_overlap"]
    scores += 0.3 * feats_df["within_cooking_time"]
    scores -= 0.2 * feats_df["missing_main_count"]

    df = df.copy()
    # Clamp at zero so the positivity filter below also drops net-negative rows.
    df["match_score"] = np.maximum(scores, 0.0)

    df = df[df["match_score"] > 0]
    if df.empty:
        return df

    df = df.sort_values("match_score", ascending=False).reset_index(drop=True)
    return df
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def ml_generate_candidates(coarse_candidates, user_parents, user_profile, model_path, topk=5):
    """
    Step 3: ML-based reranking (directly after Step 2).
    Instead of rule-based prefiltering, use the coarse-ranked candidates (Step 2 output),
    build features in the same format as training, and apply the trained ML model to rerank.

    Args:
        coarse_candidates: list of recipe dicts (coarse_rank_candidates
            output) or a DataFrame with the same columns.
        user_parents: iterable of the user's pantry parent ingredients.
        user_profile: profile dict forwarded to extract_features.
        model_path: path to a joblib-serialized sklearn-style model.
        topk: number of top candidates to return.

    Returns:
        DataFrame of top-k candidates sorted by "ml_score" (min-max
        normalized to [0, 1] unless all scores are equal); empty
        DataFrame when there is nothing to rank.
    """

    # Handle empty input
    if coarse_candidates is None or len(coarse_candidates) == 0:
        print("No candidates provided for ML reranking.")
        return pd.DataFrame()

    # If input is a list of dicts (from coarse_rank_candidates), convert to DataFrame
    if isinstance(coarse_candidates, list):
        df = pd.DataFrame(coarse_candidates)
    else:
        df = coarse_candidates.copy()

    if df.empty:
        print("Coarse candidates DataFrame is empty.")
        return df

    # Load trained model.
    # NOTE(review): the model is re-loaded from disk on every call; cache it
    # at module level if this function sits on a hot path.
    model = joblib.load(model_path)

    # Hoisted loop invariant: set(user_parents) used to be rebuilt three
    # times for every row.
    user_set = set(user_parents)

    # Build feature rows in the same schema used at training time.
    recipes_for_inference = []
    for _, row in df.iterrows():
        main = row.get("main_parent", set())
        staple = row.get("staple_parent", set())
        other = row.get("other_parent", set())
        recipes_for_inference.append({
            "main": main,
            "staple": staple,
            "other": other,
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(main & user_set),
            "matched_staple": len(staple & user_set),
            "matched_other": len(other & user_set),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        })

    feature_df = extract_features(recipes_for_inference, user_profile)

    # Predict ML scores (positive-class probability when available).
    if hasattr(model, "predict_proba"):
        df["ml_score"] = model.predict_proba(feature_df)[:, 1]
    else:
        df["ml_score"] = model.predict(feature_df)

    # Min-max normalize to [0, 1]; skipped when all scores are equal to
    # avoid division by zero.
    if len(df) > 0 and df["ml_score"].max() > df["ml_score"].min():
        df["ml_score"] = (df["ml_score"] - df["ml_score"].min()) / (df["ml_score"].max() - df["ml_score"].min())

    # Sort by ML score and return top-k candidates
    return df.sort_values("ml_score", ascending=False).head(topk).reset_index(drop=True)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
|
recipe_recommendation/src/coldstart.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import ast
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
from .candidate import coarse_rank_candidates, hard_filter, rule_generate_candidates
|
| 11 |
+
from .feature import build_features
|
| 12 |
+
from .io import load_recipes_csv, load_ingredient_map
|
| 13 |
+
|
| 14 |
+
# Module-level data loaded once at import time.
# NOTE(review): load_recipes_csv() appears to return a *path*, not a
# DataFrame (RECIPES_PATH is passed to pd.read_csv inside
# cold_start_ranker) -- confirm against io.load_recipes_csv.
RECIPES_PATH = load_recipes_csv()
INGREDIENT_MAP = load_ingredient_map()
# Parent-category and child-ingredient lookups; not referenced by the
# code visible in this module, kept for parity with candidate.py.
PARENTS = INGREDIENT_MAP["parents"]
CHILDREN = INGREDIENT_MAP["children"]
|
| 18 |
+
|
| 19 |
+
def parse_list(x):
    """Convert a stringified list into a Python list safely.

    Handles values that are already lists (returned unchanged), NaN/None
    and empty strings (-> []), and stringified literals such as
    "['a', 'b']" (parsed with ast.literal_eval). Anything unparsable
    yields [].
    """
    # FIX: the "already a list" check must come first. pd.isna() on a
    # multi-element list returns an element-wise boolean array, and using
    # that in `if` raises "truth value of an array is ambiguous" -- the
    # previous check order crashed on real list input.
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == "":
        return []
    try:
        return ast.literal_eval(x)
    except Exception:
        return []
|
| 29 |
+
|
| 30 |
+
def parse_set(x):
    """Convert a stringified collection into a Python set safely.

    Sets pass through; lists/tuples are converted; strings are parsed
    with ast.literal_eval (falling back to a singleton of the stripped
    string); NaN/None and "" become the empty set; any other scalar
    becomes a singleton set.
    """
    # FIX: type checks must precede pd.isna(). On a list/tuple with more
    # than one element pd.isna() returns an element-wise boolean array
    # whose truth value is ambiguous, so the previous check order raised
    # ValueError on real collection input.
    if isinstance(x, set):
        return x
    if isinstance(x, (list, tuple)):
        return set(x)
    if isinstance(x, str):
        if x == "":
            return set()
        try:
            v = ast.literal_eval(x)
        except Exception:
            return {x.strip()}
        if isinstance(v, (list, tuple, set)):
            return set(v)
        return {v}
    if pd.isna(x):
        return set()
    return {x}
|
| 47 |
+
|
| 48 |
+
def _parents_pool_from_df(df: pd.DataFrame):
|
| 49 |
+
cols = ["main_parent", "staple_parent", "other_parent", "seasoning_parent"]
|
| 50 |
+
pool = set()
|
| 51 |
+
for c in cols:
|
| 52 |
+
if c in df.columns:
|
| 53 |
+
for s in df[c]:
|
| 54 |
+
pool |= set(s) if isinstance(s, (set, list, tuple)) else set()
|
| 55 |
+
return sorted(pool)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def sample_user_parents(parents_pool,
                        user_profile=None,
                        prev_inventory=None,
                        min_items=3, max_items=10,
                        keep_ratio=0.6, reset_interval=20, round_idx=0):
    """Simulate a user's pantry for one cold-start round.

    Draws a weighted random sample of parent ingredients from
    ``parents_pool``: preferred mains are weighted 3x, disliked and
    forbidden parents are excluded, and (unless this round is a forced
    reset) roughly ``keep_ratio`` of the previous inventory is carried
    over so consecutive rounds look like an evolving pantry.

    Args:
        parents_pool: list of all candidate parent ingredients.
        user_profile: profile dict; reads other_preferences.preferred_main,
            other_preferences.disliked_main and forbidden_parents.
        prev_inventory: pantry returned by the previous round, or None.
        min_items / max_items: bounds for the target pantry size.
        keep_ratio: fraction of the previous inventory carried over.
        reset_interval: every this many rounds the pantry is resampled
            from scratch (round_idx % reset_interval == 0).
        round_idx: index of the current round.

    Returns:
        List of sampled parent ingredient names (order unspecified; the
        final size may be below the target because draws are with
        replacement and duplicates collapse in the set).
    """
    liked = set((user_profile or {}).get("other_preferences", {}).get("preferred_main", []))
    disliked = set((user_profile or {}).get("other_preferences", {}).get("disliked_main", []))
    forbidden = set((user_profile or {}).get("forbidden_parents", [])) | disliked

    # Build the sampling pool with per-item weights (liked mains upweighted).
    pool, weights = [], []
    for p in parents_pool:
        if p in forbidden:
            continue
        w = 3.0 if p in liked else 1.0
        pool.append(p); weights.append(w)
    if not pool:
        # Everything was forbidden -- fall back to the unfiltered pool.
        pool, weights = parents_pool[:], [1.0] * len(parents_pool)

    inventory = set()
    # NOTE: round 0 always counts as a forced reset (0 % reset_interval == 0).
    force_reset = (round_idx % reset_interval == 0)
    if prev_inventory and not force_reset:
        prev_list = list(prev_inventory); random.shuffle(prev_list)
        keep_k = max(0, int(len(prev_list) * keep_ratio))
        inventory |= set(prev_list[:keep_k])

    # Top up with weighted draws until the target size (or pool exhaustion).
    k = random.randint(min_items, max_items)
    remain = max(0, k - len(inventory))
    for _ in range(min(remain, len(pool))):
        idx = random.choices(range(len(pool)), weights=weights, k=1)[0]
        inventory.add(pool[idx])
    return list(inventory)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _weighted_pick3(indexes, scores, temperature=1.0):
|
| 92 |
+
idxs = list(indexes)
|
| 93 |
+
scs = np.array(scores, dtype=float)
|
| 94 |
+
if np.any(scs < 0):
|
| 95 |
+
scs = scs - scs.min()
|
| 96 |
+
if scs.sum() == 0:
|
| 97 |
+
scs = np.ones_like(scs)
|
| 98 |
+
picks = []
|
| 99 |
+
for _ in range(min(3, len(idxs))):
|
| 100 |
+
probs = np.exp(scs / max(temperature, 1e-6))
|
| 101 |
+
probs = probs / probs.sum()
|
| 102 |
+
choice = np.random.choice(len(idxs), p=probs)
|
| 103 |
+
picks.append(idxs[choice])
|
| 104 |
+
idxs.pop(choice)
|
| 105 |
+
scs = np.delete(scs, choice)
|
| 106 |
+
if len(idxs) == 0:
|
| 107 |
+
break
|
| 108 |
+
return picks
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ---------- Main cold-start ----------
|
| 113 |
+
def cold_start_ranker(user_id: str,
                      n_rounds: int = 2000,
                      topn_coarse: int = 5000,
                      topk_rule: int = 3,
                      batch_size: int = 5000,
                      switch_interval: int = 100):
    """
    Cold-start data generation for learning-to-rank.
    Top-5 selection prioritizes user pantry coverage deterministically:
        1. Fully covered recipes first (missing_count == 0)
        2. Then few missing (esp. staple/other)
        3. Heavy penalty for missing main ingredients.

    Args:
        user_id: subdirectory name under user_data / input_user_data
            containing user_profile.json.
        n_rounds: number of simulated pantry rounds (one qid each).
        topn_coarse: cap on Stage-2 coarse recall per round.
        topk_rule: number of rule-ranked candidates labeled per round.
        batch_size: approximate recipes per chunk; rounds rotate through
            chunks every ``switch_interval`` rounds.
        switch_interval: rounds between chunk switches.

    Returns:
        Path of the written user_features_rank.csv, or None when no
        valid training rows were produced.

    NOTE(review): the docstring says "Top-5" but the actual group size is
    ``topk_rule`` (default 3) -- the top5 variable below is misleadingly named.
    """

    # Resolve the user directory, preferring user_data over input_user_data.
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    if not os.path.exists(base_dir):
        base_dir = os.path.join("recipe_recommendation", "input_user_data", user_id)

    if not os.path.exists(base_dir):
        raise FileNotFoundError(
            f"❌ User profile not found for '{user_id}' in either 'recipe_recommendation/user_data' or 'recipe_recommendation/input_user_data'."
        )

    print(f"[cold_start_ranker] Using base_dir = {base_dir}")

    profile_path = os.path.join(base_dir, "user_profile.json")
    features_path = os.path.join(base_dir, "user_features_rank.csv")

    # Idempotency: skip regeneration when the output already exists.
    if os.path.exists(features_path):
        print(f"[cold_start] Features already exist at {features_path}")
        return features_path

    with open(profile_path, "r", encoding="utf-8") as f:
        user_profile = json.load(f)

    # Load and parse recipes
    df_all = pd.read_csv(RECIPES_PATH)
    to_set = ["main_parent", "staple_parent", "other_parent", "seasoning_parent", "cuisine_attr"]
    to_list = ["ingredients"]
    for c in to_set:
        if c in df_all.columns:
            df_all[c] = df_all[c].apply(parse_set)
    for c in to_list:
        if c in df_all.columns:
            df_all[c] = df_all[c].apply(parse_list)

    # Step 1 hard filter
    # NOTE(review): `hard_filter is not None` is always true for an
    # imported name; the guard is effectively dead.
    if hard_filter is not None:
        try:
            before = len(df_all)
            mask = df_all.apply(lambda r: hard_filter(r.to_dict(), user_profile), axis=1)
            df_all = df_all[mask]
            after = len(df_all)
            print(f"[cold_start] Step1 hard filter applied: {before} -> {after}")
        except Exception as e:
            # Best-effort: a failing filter should not abort data generation.
            warnings.warn(f"[cold_start] hard_filter failed, skip. err={e}")

    # Split the catalog into chunks; rounds rotate through them so each
    # round only scores a slice of the full catalog.
    n_chunks = (len(df_all) // batch_size) + 1
    chunks = np.array_split(df_all, n_chunks)
    parents_pool = _parents_pool_from_df(df_all)
    rows = []
    prev_inventory = None

    for i in tqdm(range(n_rounds), desc="Cold-start rounds"):
        chunk_id = (i // switch_interval) % n_chunks
        df_chunk = chunks[chunk_id].copy()

        # pantry sampling
        user_parents = sample_user_parents(
            parents_pool,
            user_profile=user_profile,
            prev_inventory=prev_inventory,
            round_idx=i
        )
        prev_inventory = user_parents

        # Step 2: coarse recall
        coarse_list = coarse_rank_candidates(
            recipes=df_chunk.to_dict(orient="records"),
            user_parents=user_parents,
            user_profile=user_profile,
            top_n=min(topn_coarse, len(df_chunk))
        )
        if not coarse_list:
            continue

        coarse_df = pd.DataFrame(coarse_list)

        # Step 3: rule rerank → Top-5 candidates (just for selecting the 5)
        rule_df = rule_generate_candidates(
            coarse_df,
            user_parents=user_parents,
            user_profile=user_profile
        )
        if rule_df.empty or len(rule_df) < topk_rule:
            continue

        # Despite the name, this holds topk_rule rows (see NOTE in docstring).
        top5 = rule_df.head(topk_rule).copy()

        # ===== Deterministic scoring with feasibility + region + soft constraints =====
        user_set = set(user_parents)
        scored_candidates = []

        # Nutrition goals (from profile)
        ng = user_profile.get("nutritional_goals", {})
        cal_min = ng.get("calories", {}).get("min", 0)
        cal_max = ng.get("calories", {}).get("max", 1e9)
        pro_min = ng.get("protein", {}).get("min", 0)
        pro_max = ng.get("protein", {}).get("max", 1e9)

        # Preferences
        liked = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
        disliked = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
        max_cooking_time = user_profile.get("other_preferences", {}).get("cooking_time_max", None)

        for idx, row in top5.iterrows():
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            # NOTE(review): other_set is computed but never used in the
            # scoring below.
            other_set = set(row.get("other_parent", set()))

            main_total = len(main_set)
            staple_total = len(staple_set)
            main_match = len(main_set & user_set)
            staple_match = len(staple_set & user_set)

            # === 1) Feasibility check ===
            total_needed = max(1, main_total + staple_total)
            total_have = main_match + staple_match
            coverage_ratio = total_have / total_needed

            # Candidates covering less than half of main+staple are unlabeled.
            if coverage_ratio < 0.5:
                continue

            # === 2) Region preference ===
            region_score = 1.0 if row.get("region_match", 0) else 0.0

            # === 3) Cooking time soft constraint ===
            # 1.0 within ±20% of the user's max; linear decay with relative
            # deviation otherwise; 1.0 when no constraint applies.
            time_val = row.get("minutes", None)
            time_score = 0.0
            if max_cooking_time and time_val is not None:
                try:
                    t_val = float(time_val)
                    t_max = float(max_cooking_time)
                    lower_bound = 0.8 * t_max
                    upper_bound = 1.2 * t_max
                    if lower_bound <= t_val <= upper_bound:
                        time_score = 1.0
                    else:
                        deviation = abs(t_val - t_max) / t_max
                        time_score = max(0.0, 1.0 - deviation)
                except (TypeError, ValueError):
                    time_score = 0.0
            else:
                time_score = 1.0

            # === 4) Calories soft constraint ===
            # 1.0 within ±30% of the goal-range midpoint; linear decay otherwise.
            cal_val = row.get("calories", None)
            cal_score = 1.0
            if cal_val is not None and cal_min < cal_max:
                try:
                    c_val = float(cal_val)
                    cal_center = 0.5 * (cal_min + cal_max)
                    tol = 0.3 * cal_center
                    lower_bound = cal_center - tol
                    upper_bound = cal_center + tol
                    if lower_bound <= c_val <= upper_bound:
                        cal_score = 1.0
                    else:
                        deviation = abs(c_val - cal_center) / cal_center
                        cal_score = max(0.0, 1.0 - deviation)
                except (TypeError, ValueError):
                    cal_score = 0.0

            # === 4b) Protein soft constraint ===
            # Same shape as calories but with a tighter ±20% tolerance.
            protein_val = row.get("protein", None)
            protein_score = 1.0
            if protein_val is not None and pro_min < pro_max:
                try:
                    p_val = float(protein_val)
                    pro_center = 0.5 * (pro_min + pro_max)
                    tol = 0.2 * pro_center
                    lower_bound = pro_center - tol
                    upper_bound = pro_center + tol
                    if lower_bound <= p_val <= upper_bound:
                        protein_score = 1.0
                    else:
                        deviation = abs(p_val - pro_center) / pro_center
                        protein_score = max(0.0, 1.0 - deviation)
                except (TypeError, ValueError):
                    protein_score = 0.0

            # === 5) Liked / Disliked main ===
            like_bonus = 1.0 if main_set & liked else 0.0
            dislike_penalty = 1.0 if main_set & disliked else 0.0

            # === 6) Final scoring ===
            score = (
                0.5 * coverage_ratio +
                0.15 * region_score +
                0.1 * time_score +
                0.1 * cal_score +
                0.05 * protein_score +
                0.05 * like_bonus -
                0.05 * dislike_penalty
            )

            scored_candidates.append((idx, score))

        # Sort and pick top3 for relevance
        scored_candidates.sort(key=lambda x: x[1], reverse=True)
        picked_idxs = [idx for idx, _ in scored_candidates[:3]]

        # relevance labels 3 / 2 / 1 (everything else in the group gets 0)
        labels = {idx: 0 for idx in top5.index}
        if len(picked_idxs) > 0:
            labels[picked_idxs[0]] = 3
        if len(picked_idxs) > 1:
            labels[picked_idxs[1]] = 2
        if len(picked_idxs) > 2:
            labels[picked_idxs[2]] = 1

        # build features for all 5 candidates
        for idx, row in top5.iterrows():
            up = set(user_parents)
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            other_set = set(row.get("other_parent", set()))

            recipe_dict = {
                "main": main_set,
                "staple": staple_set,
                "other": other_set,
                "seasoning": set(row.get("seasoning_parent", set())),
                "matched_main": len(main_set & up),
                "matched_staple": len(staple_set & up),
                "matched_other": len(other_set & up),
                "calories": row.get("calories", 0),
                "protein": row.get("protein", 0),
                "fat": row.get("fat", 0),
                "region": row.get("region", ""),
                "cuisine_attr": row.get("cuisine_attr", []),
                "ingredients": row.get("ingredients", []),
                "minutes": row.get("minutes", None),
            }

            feats = build_features(recipe_dict, user_profile)
            feats["relevance"] = float(labels[idx])
            # The round index serves as the learning-to-rank query id.
            feats["qid"] = int(i)
            rows.append(feats)

    out = pd.DataFrame(rows)
    if "qid" not in out.columns or out.empty:
        print(f"[cold_start] No valid training data generated for {user_id}, skipping save.")
        return None

    # Drop degenerate groups: a qid with a single row cannot be ranked.
    valid_qids = out.groupby("qid").size()
    keep_qids = valid_qids[valid_qids > 1].index
    out = out[out["qid"].isin(keep_qids)].reset_index(drop=True)

    os.makedirs(base_dir, exist_ok=True)
    out_path = os.path.join(base_dir, "user_features_rank.csv")
    out.to_csv(out_path, index=False)
    print(f"[cold_start] Saved {len(out)} rows to {out_path}")
    return out_path
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
if __name__ == "__main__":
    # Manual smoke-run entry point.
    # FIX: the previous invocation passed coverage_penalty=0.15 and
    # temperature=0.5, which are not parameters of cold_start_ranker and
    # raised "TypeError: got an unexpected keyword argument" before any
    # work started. They have been removed.
    cold_start_ranker(
        user_id="user_1",
        n_rounds=10000,
        topn_coarse=20000,
        topk_rule=5,
    )
|
recipe_recommendation/src/embedding.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
|
| 6 |
+
def profile_to_embedding(profile):
    """
    Convert a normalized user profile into a fixed-length numeric embedding.

    Embedding layout (concatenated in this order):
        [diet one-hot (3)] + [allergies multi-hot (6)] + [regions multi-hot (6)] +
        [nutritional goals, normalized (4)] + [preferred_main multi-hot (8)] +
        [cooking_time_max, normalized (1)]
    Total dim = 28.
    """
    parts = []

    # 1. Diet (one-hot); unknown values produce an all-zero slot.
    diet_types = ["vegetarian", "flexible", "non_vegetarian"]
    chosen_diet = profile.get("diet", {}).get("vegetarian_type", "flexible")
    parts.append(np.array([1.0 if d == chosen_diet else 0.0 for d in diet_types]))

    # 2. Allergies (multi-hot over a fixed vocabulary).
    allergy_vocab = ["milk", "gluten", "peanut", "shrimp", "egg", "soy"]
    user_allergies = set(profile.get("allergies", []))
    parts.append(np.array([int(a in user_allergies) for a in allergy_vocab]))

    # 3. Region preferences (multi-hot over a fixed vocabulary).
    region_vocab = ["North America", "Latin America", "Europe", "Asia", "Middle East", "Africa"]
    user_regions = set(profile.get("region_preference", []))
    parts.append(np.array([int(r in user_regions) for r in region_vocab]))

    # 4. Nutritional goals, scaled to roughly [0, 1]
    #    (calories capped at 4000 kcal, protein at 300 g).
    goals = profile.get("nutritional_goals", {})
    cal = goals.get("calories", {})
    pro = goals.get("protein", {})
    parts.append(np.array([
        cal.get("min", 0) / 4000,
        min(cal.get("max", 9999), 4000) / 4000,
        pro.get("min", 0) / 300,
        min(pro.get("max", 999), 300) / 300,
    ]))

    # 5. Preferred main ingredients (multi-hot over a fixed vocabulary).
    main_vocab = ["chicken", "tofu", "beef", "salmon", "eggs", "pork", "beans", "mushroom"]
    liked_mains = set(profile.get("other_preferences", {}).get("preferred_main", []))
    parts.append(np.array([int(m in liked_mains) for m in main_vocab]))

    # 6. Max cooking time, normalized to [0, 1] with a 120-minute upper bound;
    #    an unset preference maps to 0.
    max_minutes = profile.get("other_preferences", {}).get("cooking_time_max")
    if max_minutes is None:
        parts.append(np.array([0]))
    else:
        parts.append(np.array([min(max_minutes / 120, 1)]))

    return np.concatenate(parts)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def profile_similarity(profile_a, profile_b):
    """Compute cosine similarity between two user profiles."""
    vec_a = profile_to_embedding(profile_a)
    vec_b = profile_to_embedding(profile_b)
    # cosine_similarity expects 2-D inputs; return the single scalar cell.
    return cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))[0, 0]
|
| 67 |
+
|
| 68 |
+
def find_most_similar_user(target_user_id, user_data_dir="recipe_recommendation/user_data", threshold=0.85):
    """
    Find the most similar existing user based on profile embeddings.
    Returns (best_match_user_id, similarity_score) or (None, -1) if no match.

    Args:
        target_user_id: directory name of the user to match.
        user_data_dir: root directory containing one sub-directory per user,
            each expected to hold a ``user_profile.json`` file.
        threshold: minimum cosine similarity required to report a match.

    Raises:
        FileNotFoundError: if the target user has no stored profile.
    """
    target_profile_path = os.path.join(user_data_dir, target_user_id, "user_profile.json")
    if not os.path.exists(target_profile_path):
        raise FileNotFoundError(f"[embedding] No profile found for user {target_user_id}")

    with open(target_profile_path, "r", encoding="utf-8") as f:
        target_profile = json.load(f)
    target_emb = profile_to_embedding(target_profile).reshape(1, -1)

    best_match, best_score = None, -1

    # Linear scan over every other user directory; entries without a
    # profile file (or the target itself) are skipped.
    for uid in os.listdir(user_data_dir):
        if uid == target_user_id:
            continue
        profile_path = os.path.join(user_data_dir, uid, "user_profile.json")
        if not os.path.exists(profile_path):
            continue
        with open(profile_path, "r", encoding="utf-8") as f:
            other_profile = json.load(f)
        other_emb = profile_to_embedding(other_profile).reshape(1, -1)
        sim = cosine_similarity(target_emb, other_emb)[0, 0]
        if sim > best_score:
            best_match, best_score = uid, sim

    # Only report a match when it clears the threshold; otherwise signal
    # "no usable neighbour" with (None, -1), even if some best match exists.
    if best_match and best_score >= threshold:
        print(f"[embedding] Found similar user: {best_match} (similarity={best_score:.3f})")
        return best_match, best_score

    return None, -1
|
recipe_recommendation/src/feature.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from .io import load_ingredient_map
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
# Load ingredient map globally to avoid repeated I/O
# (fetched once from the Hugging Face dataset repo via src.io at import time).
INGREDIENT_MAP = load_ingredient_map()
PARENTS = INGREDIENT_MAP["parents"]      # parent-level ingredient metadata
CHILDREN = INGREDIENT_MAP["children"]    # child-level ingredient metadata

# Canonical, ordered list of feature columns produced by build_features().
# extract_features() reindexes to this exact list, and the ranking model
# is trained against these columns in this order — do not reorder.
FEATURE_COLS = [
    "main_match_ratio", "other_match_ratio", "staple_match_ratio",
    "missing_main_count", "missing_other_count", "missing_staple_count",
    "calories", "protein", "fat", "protein_ratio", "fat_ratio",
    "region_match",
    "is_vegan_safe", "is_vegetarian_safe_absolute", "is_flexible_safe_absolute", "is_user_diet_safe",
    "preferred_main_overlap", "disliked_main_overlap",
    "preferred_course_overlap",
    "within_cooking_time", "cooking_time_over",
    "calories_value", "calories_deviation",
    "protein_value", "protein_deviation",
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def extract_features(recipes: list[dict], user_profile: dict):
    """
    Convert a list of recipes into a feature matrix for the ML model.

    Args:
        recipes: recipe dicts as produced by the candidate pipeline.
        user_profile: normalized user profile dict.

    Returns:
        pd.DataFrame with columns exactly FEATURE_COLS (missing features
        filled with 0), one row per recipe, in input order.
    """
    rows = [build_features(r, user_profile) for r in recipes]
    df = pd.DataFrame(rows)
    # reindex guarantees both the names AND the order of the columns match
    # FEATURE_COLS, so the downstream ranker always sees a stable layout.
    # (The former `assert` re-checking this was dead code by construction
    # and would be stripped under `python -O` anyway.)
    return df.reindex(columns=FEATURE_COLS, fill_value=0)
|
| 35 |
+
|
| 36 |
+
def is_recipe_vegetarian_safe(ingredients: list[str], veg_type: str) -> bool:
    """
    Check if the recipe is safe for a given dietary type.
    Supported veg_type: "vegan", "vegetarian", "flexible_vegetarian", "" (none).
    Any other value (including the empty string) imposes no restriction.
    """
    for raw in ingredients:
        name = raw.strip().lower()
        # Child entries take precedence over parent entries; an ingredient
        # missing from both maps is treated as safe by default.
        info = CHILDREN.get(name)
        if info is None:
            info = PARENTS.get(name)
        if info is None:
            continue

        if veg_type == "vegan":
            if not info.get("vegan_safe", True):
                return False
        elif veg_type in ("vegetarian", "flexible_vegetarian"):
            # Flexible vegetarians allow most ingredients except explicit
            # meat, so vegetarian_safe doubles as the flexibility proxy.
            if not info.get("vegetarian_safe", True):
                return False
    return True
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def build_features(recipe: dict, user_profile: dict) -> dict:
    """
    Build a feature dictionary for ML ranker and rule-based scoring.
    All features are numeric scalars or counts.

    Args:
        recipe: candidate recipe dict. Keys read: main/staple/other
            (ingredient collections), matched_main/matched_staple/matched_other
            (pantry-overlap counts), calories/protein/fat, region,
            cuisine_attr, ingredients, minutes.
        user_profile: normalized user profile dict. Keys read:
            region_preference, diet.vegetarian_type,
            other_preferences.{preferred_main,disliked_main,cooking_time_max},
            preferred_course_types, nutritional_goals.{calories,protein}.

    Returns:
        dict of numeric features; keys are a superset of FEATURE_COLS.
    """
    features = {}

    # ======================================================
    # 1. Ingredient matching ratios
    # ======================================================
    # Ratios use max(total, 1) so an empty ingredient group yields 0,
    # not a ZeroDivisionError.
    total_main = len(recipe.get("main", []))
    total_other = len(recipe.get("other", []))
    total_staple = len(recipe.get("staple", []))

    features["main_match_ratio"] = recipe.get("matched_main", 0) / max(total_main, 1)
    features["other_match_ratio"] = recipe.get("matched_other", 0) / max(total_other, 1)
    features["staple_match_ratio"] = recipe.get("matched_staple", 0) / max(total_staple, 1)

    features["missing_main_count"] = total_main - recipe.get("matched_main", 0)
    features["missing_other_count"] = total_other - recipe.get("matched_other", 0)
    features["missing_staple_count"] = total_staple - recipe.get("matched_staple", 0)

    # ======================================================
    # 2. Basic nutrition info
    # ======================================================
    # `or 0.0` coerces None (and other falsy values) to 0.0.
    calories = recipe.get("calories", 0.0) or 0.0
    protein = recipe.get("protein", 0.0) or 0.0
    fat = recipe.get("fat", 0.0) or 0.0

    features["calories"] = calories
    features["protein"] = protein
    features["fat"] = fat
    # Per-calorie densities; max(calories, 1) guards the zero-calorie case.
    features["protein_ratio"] = protein / max(calories, 1)
    features["fat_ratio"] = fat / max(calories, 1)

    # ======================================================
    # 3. Region preference
    # ======================================================
    # Region may arrive as None, "", NaN (from pandas), a scalar string,
    # or a collection — normalize all of these to a list first.
    recipe_region = recipe.get("region", [])
    if recipe_region is None or recipe_region == "" or (isinstance(recipe_region, float) and np.isnan(recipe_region)):
        recipe_regions = []
    elif isinstance(recipe_region, (set, list, tuple)):
        recipe_regions = list(recipe_region)
    else:
        recipe_regions = [recipe_region]

    user_regions = user_profile.get("region_preference", [])
    if isinstance(user_regions, str):
        user_regions = [user_regions]

    # Case/whitespace-insensitive comparison on both sides.
    recipe_regions_norm = {str(r).strip().lower() for r in recipe_regions if r}
    user_regions_norm = {str(r).strip().lower() for r in user_regions if r}

    features["region_match"] = int(len(recipe_regions_norm & user_regions_norm) > 0)

    # ======================================================
    # 4. Diet constraints
    # ======================================================
    ingredients_all = recipe.get("ingredients", [])

    # Absolute diet-safety flags are computed for every recipe so the
    # ranker can learn diet preferences independent of the current user.
    features["is_vegan_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegan"))
    features["is_vegetarian_safe_absolute"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegetarian"))
    features["is_flexible_safe_absolute"] = int(is_recipe_vegetarian_safe(ingredients_all, "flexible_vegetarian"))

    veg_type = (user_profile.get("diet", {}).get("vegetarian_type", "") or "").lower()
    features["is_user_diet_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, veg_type))

    # ======================================================
    # 5. Preferred & disliked main
    # ======================================================
    recipe_main = set(recipe.get("main", []))
    preferred_main = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
    disliked_main = set(user_profile.get("other_preferences", {}).get("disliked_main", []))

    # Binary overlap indicators (not counts).
    features["preferred_main_overlap"] = 1.0 if recipe_main & preferred_main else 0.0
    features["disliked_main_overlap"] = 1.0 if recipe_main & disliked_main else 0.0

    # ======================================================
    # 6. Course type preference
    # ======================================================
    recipe_types = set(recipe.get("cuisine_attr", []))
    preferred_types = set(user_profile.get("preferred_course_types", []))
    # Count of overlapping course types (unlike the binary flags above).
    features["preferred_course_overlap"] = len(recipe_types & preferred_types)

    # ======================================================
    # 7. Cooking time features
    # ======================================================
    max_time = user_profile.get("other_preferences", {}).get("cooking_time_max", None)
    recipe_time = recipe.get("minutes", None)

    if max_time is not None and recipe_time is not None:
        try:
            recipe_time_val = float(recipe_time)
            max_time_val = float(max_time)
            features["within_cooking_time"] = 1.0 if recipe_time_val <= max_time_val else 0.0
            features["cooking_time_over"] = max(0.0, recipe_time_val - max_time_val)
        except (TypeError, ValueError):
            # Unparseable time values are treated as over-limit (0.0 flag).
            features["within_cooking_time"] = 0.0
            features["cooking_time_over"] = 0.0
    else:
        # No limit or unknown duration -> assume the recipe fits.
        features["within_cooking_time"] = 1.0
        features["cooking_time_over"] = 0.0

    # ======================================================
    # 8. Calories / Protein deviation features
    # ======================================================
    # Deviation is measured relative to the midpoint of the user's goal
    # range; only computed when a non-degenerate range (min < max) exists.
    ng = user_profile.get("nutritional_goals", {})
    cal_min = ng.get("calories", {}).get("min", 0)
    cal_max = ng.get("calories", {}).get("max", 1e9)
    pro_min = ng.get("protein", {}).get("min", 0)
    pro_max = ng.get("protein", {}).get("max", 1e9)

    # --- Calories deviation ---
    if calories is not None and cal_min < cal_max:
        try:
            cal_center = 0.5 * (cal_min + cal_max)
            features["calories_value"] = float(calories)
            features["calories_deviation"] = (float(calories) - cal_center) / cal_center
        except (TypeError, ValueError):
            features["calories_value"] = 0.0
            features["calories_deviation"] = 0.0
    else:
        features["calories_value"] = 0.0
        features["calories_deviation"] = 0.0

    # --- Protein deviation ---
    if protein is not None and pro_min < pro_max:
        try:
            pro_center = 0.5 * (pro_min + pro_max)
            features["protein_value"] = float(protein)
            features["protein_deviation"] = (float(protein) - pro_center) / pro_center
        except (TypeError, ValueError):
            features["protein_value"] = 0.0
            features["protein_deviation"] = 0.0
    else:
        features["protein_value"] = 0.0
        features["protein_deviation"] = 0.0

    return features
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def build_cluster_features(candidates):
    """
    Build simple ingredient + cuisine based feature vectors for KMeans clustering.
    This is separate from model training features.

    Args:
        candidates (list[dict]): list of recipe dicts.

    Returns:
        np.ndarray: multi-hot feature matrix (num_candidates, num_features)
    """
    field_names = ("main_parent", "staple_parent", "other_parent", "cuisine_attr")

    # 1. Collect a sorted vocabulary per field.
    vocabs = []
    for field in field_names:
        values = set()
        for rec in candidates:
            values.update(rec.get(field, []) or [])
        vocabs.append(sorted(values))

    # 2. Build per-field token -> global-column index maps, each field's
    #    columns offset past the previous fields' vocabularies.
    index_maps = []
    offset = 0
    for vocab in vocabs:
        index_maps.append({token: offset + i for i, token in enumerate(vocab)})
        offset += len(vocab)

    # 3. Fill the multi-hot feature matrix.
    X = np.zeros((len(candidates), offset), dtype=np.uint8)
    for row, rec in enumerate(candidates):
        for field, idx_map in zip(field_names, index_maps):
            for token in rec.get(field, []) or []:
                if token in idx_map:
                    X[row, idx_map[token]] = 1
    return X
|
recipe_recommendation/src/highlight.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.cluster import KMeans
|
| 3 |
+
from sklearn.preprocessing import StandardScaler
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def print_candidates(candidates, user_parents, topk=10):
    """
    Pretty-print the top-k candidate recipes with pantry-match markers.

    Args:
        candidates: DataFrame sorted by 'match_score' (descending), with
            recipe columns such as name/region/cuisine_attr/calories and
            the *_parent ingredient lists.
        user_parents: collection of parent-level ingredients the user has;
            each ingredient is marked available (✅) or missing (❌).
        topk: number of rows to display.
    """
    # Scores are displayed relative to the best candidate (100% = top score);
    # the epsilon guards against a zero max score.
    # (Removed dead code: `min_score` and the `shown` counter were never used.)
    max_score = candidates['match_score'].max()

    for _, row in candidates.head(topk).iterrows():
        scaled_score = 100 * row['match_score'] / (max_score + 1e-9)
        print(f"{row['name']} (score {scaled_score:.1f}%)")

        # ----- Region -----
        region = row.get("region", None)
        if pd.notna(region) and isinstance(region, str) and region.strip() and region.lower() != "unavailable":
            print(f" region: {region}")

        # ----- Cuisine Attributes -----
        cuisine = row.get("cuisine_attr", None)
        if cuisine is not None and not (isinstance(cuisine, float) and pd.isna(cuisine)):
            # Normalize set/str to a list for printing
            if isinstance(cuisine, set):
                cuisine = list(cuisine)
            elif isinstance(cuisine, str):
                cuisine = [cuisine]

            if isinstance(cuisine, list) and len(cuisine) > 0:
                print(f" cuisine: {', '.join(cuisine)}")

        # ----- Nutrition -----
        print(f" calories: {row.get('calories', 'N/A')}")

        # ----- Ingredient Marking -----
        def mark_list(lst):
            return [("✅ " + ing) if ing in user_parents else ("❌ " + ing) for ing in lst]

        print(f" staple: {mark_list(row.get('staple_parent', []))}")
        print(f" main: {mark_list(row.get('main_parent', []))}")
        print(f" seasoning: {row.get('seasoning_parent', [])}")
        print(f" other: {mark_list(row.get('other_parent', []))}")
        print("-" * 40)
|
| 47 |
+
|
| 48 |
+
def diversify_topk_with_min_clusters(
    ranked_candidates,
    feature_matrix,
    top_k=5,
    n_clusters=20,
    min_clusters=3,
    random_state=42
):
    """
    Diversify top-k displayed recipes using KMeans clustering.
    Ensures that the final top_k contains at least `min_clusters` distinct clusters.

    Args:
        ranked_candidates: candidates ordered best-first; rank order is preserved.
        feature_matrix: array-like (len(ranked_candidates), n_features), used
            only for clustering.
        top_k: number of candidates to return.
        n_clusters: clusters to fit (capped at the number of candidates).
        min_clusters: minimum number of distinct clusters represented in the result.
        random_state: KMeans seed for reproducibility.

    Returns:
        list of selected candidates (at most top_k).
    """
    if len(ranked_candidates) == 0:
        return []

    n_clusters = min(n_clusters, len(ranked_candidates))
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(feature_matrix)

    # KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    cluster_ids = kmeans.fit_predict(X_scaled)

    # Step 1: walk down the ranking, taking the first candidate of each new
    # cluster until min_clusters are covered (or top_k is already full).
    picked = []
    picked_idx = set()       # positions already taken
    picked_clusters = set()
    for i, c in enumerate(cluster_ids):
        if c not in picked_clusters:
            picked.append(ranked_candidates[i])
            picked_idx.add(i)
            picked_clusters.add(c)
            if len(picked_clusters) >= min_clusters or len(picked) >= top_k:
                break

    # Step 2: fill the rest purely by rank order.
    # Bug fix: the previous `ranked_candidates[i] not in picked` dedup used
    # object equality — O(k) per check, and it could silently drop a distinct
    # recipe that happened to compare equal to one already picked. Tracking
    # picked *positions* is O(1) and identity-safe.
    if len(picked) < top_k:
        for i in range(len(ranked_candidates)):
            if i not in picked_idx:
                picked.append(ranked_candidates[i])
                picked_idx.add(i)
                if len(picked) >= top_k:
                    break

    return picked
|
| 90 |
+
|
| 91 |
+
|
recipe_recommendation/src/io.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from huggingface_hub import hf_hub_download
|
| 4 |
+
|
| 5 |
+
# Hugging Face dataset repo ID hosting the cleaned recipe data.
REPO_ID = "Iris314/recipe-cleaned"

# Local cache directory: <package root>/data, created eagerly on import.
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
DATA_DIR = os.path.join(ROOT_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def download_file(filename: str) -> str:
    """
    Ensure `filename` from the REPO_ID dataset exists locally; return its path.

    Downloads into DATA_DIR on first use; later calls reuse the local copy.

    Args:
        filename: file name inside the Hugging Face dataset repo.

    Returns:
        Path to the local copy under DATA_DIR.
    """
    local_path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(local_path):
        # Bug fix: both messages were f-strings without a placeholder and
        # printed a literal "(unknown)" instead of the actual file name.
        print(f"Downloading {filename} from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
            local_dir=DATA_DIR,
            local_dir_use_symlinks=False  # NOTE: deprecated/ignored in recent huggingface_hub
        )
    else:
        print(f"{filename} already exists locally.")
    return local_path
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_recipes_csv() -> str:
    """Return the local path of the recipes CSV, downloading it if needed."""
    return download_file("recipes.csv")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_ingredient_map() -> dict:
    """Download (if needed) and parse the JSON-encoded ingredient map."""
    local_path = download_file("ingredient_map.data")
    # The .data extension is historical; the file content is plain JSON.
    with open(local_path, "r", encoding="utf-8") as fh:
        return json.load(fh)
|
recipe_recommendation/src/trainmodel.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import joblib
|
| 3 |
+
import warnings
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import List, Tuple, Sequence, Optional
|
| 7 |
+
from xgboost import XGBRanker
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.metrics import ndcg_score
|
| 10 |
+
from pandas.api.types import is_numeric_dtype
|
| 11 |
+
from .feature import FEATURE_COLS
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ----------------------------- Helpers -----------------------------
|
| 17 |
+
def _pick_feature_cols(df: pd.DataFrame, drop_cols: Sequence[str]) -> List[str]:
|
| 18 |
+
"""
|
| 19 |
+
Pick numeric feature columns robustly, excluding drop_cols.
|
| 20 |
+
Uses pandas is_numeric_dtype to correctly include nullable ints/floats/bools.
|
| 21 |
+
"""
|
| 22 |
+
cols = []
|
| 23 |
+
for c in df.columns:
|
| 24 |
+
if c in drop_cols:
|
| 25 |
+
continue
|
| 26 |
+
if is_numeric_dtype(df[c]):
|
| 27 |
+
cols.append(c)
|
| 28 |
+
return cols
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _sort_and_pack_by_qid(
|
| 32 |
+
X: pd.DataFrame, y: pd.Series, qid: pd.Series, feature_cols: List[str]
|
| 33 |
+
) -> Tuple[pd.DataFrame, np.ndarray, List[int], np.ndarray]:
|
| 34 |
+
"""
|
| 35 |
+
Sort rows by qid so that group sizes match the sample order.
|
| 36 |
+
Returns:
|
| 37 |
+
X_sorted, y_sorted, groups, qid_sorted (aligned with X_sorted/y_sorted)
|
| 38 |
+
"""
|
| 39 |
+
packed = X.copy()
|
| 40 |
+
packed["_label"] = y.values
|
| 41 |
+
packed["_qid"] = qid.values
|
| 42 |
+
packed = packed.sort_values("_qid").reset_index(drop=True)
|
| 43 |
+
|
| 44 |
+
groups = packed.groupby("_qid").size().tolist()
|
| 45 |
+
X_sorted = packed[feature_cols].copy()
|
| 46 |
+
y_sorted = packed["_label"].astype(float).values
|
| 47 |
+
qid_sorted = packed["_qid"].values
|
| 48 |
+
return X_sorted, y_sorted, groups, qid_sorted
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _eval_mean_ndcg(
    model: XGBRanker,
    X_val: pd.DataFrame,
    y_val,  # can be np.ndarray or pd.Series
    qid_val,  # aligned with X_val/y_val
    ks: Sequence[int] = (5, 10),
) -> dict:
    """
    Compute mean NDCG@k for each k in ks over validation queries.
    Accepts numpy arrays or pandas Series.
    """
    # Respect the early-stopping best iteration when available (xgboost>=2.0);
    # otherwise fall back to a plain predict over all trees.
    try:
        preds = model.predict(X_val, iteration_range=(0, model.best_iteration + 1))
    except Exception:
        preds = model.predict(X_val)

    labels = np.asarray(y_val)
    queries = np.asarray(qid_val)

    scores = {}
    for k in ks:
        per_query = []
        for q in np.unique(queries):
            mask = (queries == q)
            # NDCG is undefined for single-document groups; skip them.
            if mask.sum() >= 2:
                per_query.append(ndcg_score([labels[mask]], [preds[mask]], k=k))
        scores[f"NDCG@{k}"] = float(np.mean(per_query)) if per_query else 0.0
    return scores
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ----------------------------- Main Trainer -----------------------------
|
| 85 |
+
def train_model_ranker(
    user_id: str = "user_1",
    features_path: Optional[str] = None,
    save_model: bool = True,
    model_params: Optional[dict] = None,
    val_ratio: float = 0.2,
    random_state: int = 42,
    max_rows: Optional[int] = None,
):
    """
    Train an XGBoost Learning-to-Rank model (XGBRanker) on cold-start generated data.

    Expected input CSV (from cold_start.py):
      - qid: query id (one round of pantry sampling = one query)
      - relevance: graded relevance label (e.g., 3/2/1/0)
      - features: numeric columns produced by build_features (and any extra numeric signals)

    Steps:
      1) Read the CSV (optionally subsampled to max_rows)
      2) Select numeric feature columns robustly (FEATURE_COLS, missing ones filled with 0)
      3) Split train/val by qid to avoid leakage across queries
      4) Sort each split by qid and build group sizes aligned to sample order
      5) Train XGBRanker and report NDCG@5/10
      6) Append metrics to user_data/<user_id>/training_log.txt and, when
         save_model is True, save the model to user_data/<user_id>/ranker.pkl

    Args:
        user_id: User folder under recipe_recommendation/user_data/.
        features_path: CSV path; defaults to <base_dir>/user_features_rank.csv.
        save_model: Persist the trained model to ranker.pkl when True.
        model_params: Overrides merged on top of the default XGBRanker params.
        val_ratio: Fraction of qids held out for validation.
        random_state: Seed for sampling/splitting/model.
        max_rows: If set, subsample the dataframe down to this many rows.

    Returns:
        (model, metrics, feature_cols) — the fitted XGBRanker, a dict of
        mean NDCG@k values, and the feature column list actually used.

    Raises:
        FileNotFoundError: When the cold-start features CSV is missing.
        ValueError: When the CSV lacks 'qid' or 'relevance' columns.
    """
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    os.makedirs(base_dir, exist_ok=True)

    # Resolve features path
    if features_path is None:
        features_path = os.path.join(base_dir, "user_features_rank.csv")
    if not os.path.exists(features_path):
        raise FileNotFoundError(
            f"[train_model_ranker] Cold-start features not found at: {features_path}\n"
            f"Please run cold_start_ranker(user_id='{user_id}') first."
        )

    # Load data (optionally capped for quick iterations)
    df = pd.read_csv(features_path)
    if max_rows is not None and len(df) > max_rows:
        df = df.sample(max_rows, random_state=random_state).reset_index(drop=True)

    # Basic validation
    if "qid" not in df.columns or "relevance" not in df.columns:
        raise ValueError("Input CSV must contain 'qid' and 'relevance' columns.")

    # Fill NaNs in label/qid (should not happen, but defensive)
    df["qid"] = pd.to_numeric(df["qid"], errors="coerce").fillna(-1).astype(int)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(float)

    # Pick numeric feature columns robustly; missing columns are created as 0
    feature_cols = FEATURE_COLS.copy()
    df = df.reindex(columns=["qid", "relevance"] + feature_cols, fill_value=0)

    # Ensure numeric + finite values only (replace inf/nan with 0)
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Split by qid to avoid leakage across queries
    unique_qids = df["qid"].unique()
    if len(unique_qids) < 2:
        warnings.warn("Only one unique qid found — ranking training may be ineffective.")
        train_mask = np.ones(len(df), dtype=bool)
        val_mask = np.zeros(len(df), dtype=bool)
    else:
        train_qids, val_qids = train_test_split(
            unique_qids, test_size=val_ratio, random_state=random_state
        )
        train_mask = df["qid"].isin(train_qids)
        val_mask = df["qid"].isin(val_qids)

    # Split dataframes AFTER defining masks
    X_train_raw = df.loc[train_mask, feature_cols]
    y_train_raw = df.loc[train_mask, "relevance"]
    qid_train = df.loc[train_mask, "qid"]

    X_val_raw = df.loc[val_mask, feature_cols]
    y_val_raw = df.loc[val_mask, "relevance"]
    qid_val = df.loc[val_mask, "qid"]

    # Sort by qid and build group sizes aligned with sample order (CRITICAL for XGBRanker)
    X_train, y_train, group_train, _ = _sort_and_pack_by_qid(
        X_train_raw, y_train_raw, qid_train, feature_cols
    )
    X_val, y_val, group_val, qid_val_sorted = _sort_and_pack_by_qid(
        X_val_raw, y_val_raw, qid_val, feature_cols
    )

    print(f"[ranker] #Train groups: {len(group_train)} | #Val groups: {len(group_val)}")
    print(f"[ranker] Train rows: {len(X_train)} | Val rows: {len(X_val)} | #Features: {len(feature_cols)}")

    # Default model params (override via model_params)
    default_params = dict(
        objective="rank:ndcg",
        eval_metric="ndcg",
        n_estimators=400,
        learning_rate=0.08,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        tree_method="hist",
        reg_lambda=1.0,
        reg_alpha=0.0,
    )
    if model_params:
        default_params.update(model_params)

    model = XGBRanker(**default_params)

    # Fit model (XGBRanker requires group sizes for eval_set as well).
    # When the validation split is empty (single-qid fallback above), skip the
    # eval_set entirely — an empty group list would break xgboost's fit.
    fit_kwargs = dict(X=X_train, y=y_train, group=group_train, verbose=False)
    has_val = len(X_val) > 0
    if has_val:
        fit_kwargs["eval_set"] = [(X_val, y_val)]
        fit_kwargs["eval_group"] = [group_val]

    try:
        # Newer xgboost versions (some builds) support early_stopping_rounds on Ranker
        model.fit(early_stopping_rounds=50, **fit_kwargs)  # maximize is inferred by 'ndcg'
    except TypeError:
        # Fallback to callback API (older versions)
        try:
            from xgboost.callback import EarlyStopping
            model.fit(callbacks=[EarlyStopping(rounds=50, save_best=True, maximize=True)], **fit_kwargs)
        except Exception:
            # Last resort: train without early stopping
            model.fit(**fit_kwargs)

    # Evaluate mean NDCG@5/10 (once) — all-zero metrics when there is no val split
    if has_val:
        metrics = _eval_mean_ndcg(model, X_val, y_val, qid_val_sorted, ks=(5, 10))
    else:
        metrics = {"NDCG@5": 0.0, "NDCG@10": 0.0}
    print("[ranker] Validation metrics:", " ".join(f"{k}={v:.4f}" for k, v in metrics.items()))

    # Append NDCG metrics to the per-user training log
    from datetime import datetime
    log_path = os.path.join(base_dir, "training_log.txt")
    with open(log_path, "a", encoding="utf-8") as f:
        ndcg5 = metrics.get("NDCG@5", 0.0)
        ndcg10 = metrics.get("NDCG@10", 0.0)
        f.write(f"{datetime.now().isoformat()} | NDCG@5={ndcg5:.4f}, NDCG@10={ndcg10:.4f}\n")
    print(f"[ranker] Logged metrics to {log_path}")

    # Save model only when requested (previously it was saved unconditionally
    # AND a second time under the flag — save_model=False was ignored)
    if save_model:
        model_path = os.path.join(base_dir, "ranker.pkl")
        joblib.dump(model, model_path)
        print(f"[ranker] Model saved to {model_path}")

    return model, metrics, feature_cols
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
    # Smoke-run the trainer for the demo user with default settings.
    demo_config = dict(
        user_id="user_1",
        save_model=True,
        val_ratio=0.2,
        random_state=42,
        max_rows=None,      # set an upper bound (e.g. 200_000) for quick iterations
        model_params=None,  # pass a dict here to override the default hyperparameters
    )
    train_model_ranker(**demo_config)
|