Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- recipe_recommendation/src/__init__.py +0 -0
- recipe_recommendation/src/candidate.py +346 -0
- recipe_recommendation/src/coldstart.py +387 -0
- recipe_recommendation/src/embedding.py +100 -0
- recipe_recommendation/src/feature.py +257 -0
- recipe_recommendation/src/highlight.py +91 -0
- recipe_recommendation/src/io.py +37 -0
- recipe_recommendation/src/trainmodel.py +262 -0
recipe_recommendation/src/__init__.py
ADDED
|
File without changes
|
recipe_recommendation/src/candidate.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from .feature import extract_features
|
| 4 |
+
from .io import load_ingredient_map
|
| 5 |
+
import joblib
|
| 6 |
+
|
| 7 |
+
# Load ingredient map globally to avoid repeated I/O
# The map exposes two sub-dicts:
#   "children" - child ingredient name -> record with a "parent" key
#                (used below as CHILDREN[name]["parent"])
#   "parents"  - parent-category lookup (structure not shown here;
#                presumably name-keyed -- verify in io.load_ingredient_map)
INGREDIENT_MAP = load_ingredient_map()
PARENTS = INGREDIENT_MAP["parents"]
CHILDREN = INGREDIENT_MAP["children"]
|
| 11 |
+
|
| 12 |
+
def extract_user_parents(user_ingredients):
    """Map the user's ingredient names to their parent categories.

    Each name is lower-cased and stripped, then resolved through the
    global CHILDREN mapping (child -> parent); names that are already
    parent categories are kept as-is. Names found in neither mapping
    are silently dropped.
    """
    def _resolve(raw_name):
        key = raw_name.lower().strip()
        if key in CHILDREN:
            return CHILDREN[key]["parent"]
        if key in PARENTS:
            return key
        return None

    return {parent for parent in map(_resolve, user_ingredients) if parent is not None}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# def hard_filter(recipe, user_profile):
|
| 26 |
+
# diet = user_profile.get("diet", {}).get("vegetarian_type", "").lower()
|
| 27 |
+
# if diet == "vegan" and not recipe.get("is_vegan_safe", True):
|
| 28 |
+
# return False
|
| 29 |
+
# if diet in ["vegetarian", "flexible_vegetarian"] and not recipe.get("is_vegetarian_safe", True):
|
| 30 |
+
# return False
|
| 31 |
+
# return True
|
| 32 |
+
|
| 33 |
+
def hard_filter(recipe: dict, user_profile: dict, debug=False) -> bool:
    """
    Minimal hard filter: only vegan/vegetarian safety & disliked mains.

    A recipe is rejected when:
      * the user is vegan and the recipe is not vegan-safe,
      * the user is (flexible) vegetarian and the recipe is not
        vegetarian-safe,
      * the recipe's main parent ingredients intersect the user's
        disliked mains.

    Missing safety flags default to True, so unknown recipes pass.
    """
    recipe_name = recipe.get("name", "Unknown")
    diet = user_profile.get("diet", {}).get("vegetarian_type", "").lower()

    # --- Dietary gate ---
    if diet == "vegan" and not recipe.get("is_vegan_safe", True):
        if debug:
            print(f"❌ {recipe_name}: Not vegan-safe")
        return False
    if diet in ("vegetarian", "flexible_vegetarian") and not recipe.get("is_vegetarian_safe", True):
        if debug:
            print(f"❌ {recipe_name}: Not vegetarian-safe")
        return False

    # --- Disliked main ingredients gate ---
    disliked_main = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
    if disliked_main:
        raw_main = recipe.get("main_parent", set())
        # Normalize to a set; any other type (e.g. a stray string) is
        # treated as "no mains", matching the original behavior.
        main_parents = set(raw_main) if isinstance(raw_main, (list, set)) else set()
        overlap = main_parents & disliked_main
        if overlap:
            if debug:
                print(f"❌ {recipe_name}: Contains disliked {overlap}")
            return False

    if debug:
        print(f"✅ {recipe_name}: PASS hard filter")

    return True
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Weights for the coarse (Stage 2) linear scoring model.
COARSE_WEIGHTS = {
    "main_match_ratio": 1.0,
    "staple_match_ratio": 0.3,
    "other_match_ratio": 0.6,
    "low_calorie_penalty": 0.2,
    "preferred_course_overlap": 0.1,
    "region_match": 0.8,
    "preferred_main_overlap": 1
}


def coarse_score(features, weights=COARSE_WEIGHTS):
    """Return the weighted linear combination of the features present in
    ``weights``; weight keys absent from ``features`` contribute nothing."""
    return sum(
        (weight * features[name] for name, weight in weights.items() if name in features),
        0.0,
    )
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def coarse_rank_candidates(recipes, user_parents, user_profile, top_n=30000, weights=COARSE_WEIGHTS):
    """
    Stage 2: Coarse Ranking (NumPy vectorized implementation)
    ---------------------------------------------------------
    Quickly retrieves a subset of candidate recipes by computing
    ingredient coverage ratios (main / staple / other) between
    the user's pantry and the recipes using vectorized operations.

    This function replaces the original Python loop version
    for significant speedup during cold start and real-time ranking.

    Args:
        recipes: list of recipe dicts with *_parent iterables plus
            "calories", "cuisine_attr", "region" fields.
        user_parents: iterable of the user's pantry parent ingredients.
        user_profile: profile dict; reads "calorie_threshold",
            "preferred_course_types", "region_preference" and
            other_preferences.preferred_main.
        top_n: maximum number of candidates returned.
        weights: linear scoring weights (defaults to COARSE_WEIGHTS).

    Returns:
        List of the original recipe dicts, best-scoring first; empty list
        when no recipe scores above zero.

    NOTE(review): despite the name, "low_calorie_penalty" is a *bonus*
    (1.0 when calories <= threshold), consistent with its positive weight.
    """
    if not recipes:
        return []

    # === 1. Build parent vocabulary ===
    # Extract all unique parent ingredients across main/staple/other fields.
    all_parents = sorted({
        p for r in recipes
        for k in ["main_parent", "staple_parent", "other_parent"]
        for p in (r.get(k) or [])
    })
    parent_index = {p: i for i, p in enumerate(all_parents)}
    num_recipes = len(recipes)
    num_parents = len(all_parents)

    # === 2. Construct multi-hot matrices for main, staple, other ===
    # Each row corresponds to a recipe; each column to a parent ingredient.
    main_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)
    staple_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)
    other_mat = np.zeros((num_recipes, num_parents), dtype=np.uint8)

    for i, r in enumerate(recipes):
        for p in r.get("main_parent", []):
            if p in parent_index:
                main_mat[i, parent_index[p]] = 1
        for p in r.get("staple_parent", []):
            if p in parent_index:
                staple_mat[i, parent_index[p]] = 1
        for p in r.get("other_parent", []):
            if p in parent_index:
                other_mat[i, parent_index[p]] = 1

    # === 3. Encode user pantry as a binary mask ===
    user_mask = np.zeros(num_parents, dtype=np.uint8)
    for p in user_parents:
        if p in parent_index:
            user_mask[parent_index[p]] = 1

    # === 4. Compute ingredient match ratios in batch ===
    # main_ratio = (# of matched main ingredients) / (# of total main ingredients)
    main_total = main_mat.sum(axis=1)
    staple_total = staple_mat.sum(axis=1)
    other_total = other_mat.sum(axis=1)

    # Matrix-vector product counts matched parents per recipe.
    main_match = (main_mat @ user_mask)
    staple_match = (staple_mat @ user_mask)
    other_match = (other_mat @ user_mask)

    # np.maximum(..., 1) guards against division by zero for recipes
    # with no ingredients in a category (their ratio becomes 0).
    main_ratio = main_match / np.maximum(main_total, 1)
    staple_ratio = staple_match / np.maximum(staple_total, 1)
    other_ratio = other_match / np.maximum(other_total, 1)

    # === 5. Additional coarse ranking signals ===
    # Low-calorie preference & preferred cuisine overlap
    calories = np.array([r.get("calories", 0) for r in recipes], dtype=float)
    calorie_threshold = user_profile.get("calorie_threshold", 9999)
    # 1.0 when within the calorie budget, else 0.0 (a bonus, see NOTE above).
    low_calorie_penalty = (calories <= calorie_threshold).astype(float)

    preferred_course_types = set(user_profile.get("preferred_course_types", []))
    preferred_overlap = np.array([
        len(set(r.get("cuisine_attr", [])) & preferred_course_types)
        for r in recipes
    ], dtype=float)

    # Region preference matching
    # "region" may be a scalar or a list/set; both are handled.
    preferred_regions = set(user_profile.get("region_preference", []))
    region_match = np.array([
        1.0 if any(region in preferred_regions for region in
                   (r.get("region", []) if isinstance(r.get("region"), (list, set))
                    else [r.get("region", "")]))
        else 0.0
        for r in recipes
    ], dtype=float)

    # === Preferred main ingredients ===
    preferred_main = set(user_profile.get("other_preferences", {}).get("preferred_main", []))

    if preferred_main:
        preferred_main_overlap = np.array([
            len(set(r.get("main_parent", [])) & preferred_main)
            for r in recipes
        ], dtype=float)
        # print(f"[coarse_rank] Preferred main: {preferred_main}, matches: {np.sum(preferred_main_overlap > 0)}")
    else:
        preferred_main_overlap = np.zeros(len(recipes))

    # === 6. Compute coarse ranking scores ===
    scores = (
        weights["main_match_ratio"] * main_ratio +
        weights["staple_match_ratio"] * staple_ratio +
        weights["other_match_ratio"] * other_ratio +
        weights["low_calorie_penalty"] * low_calorie_penalty +
        weights["preferred_course_overlap"] * preferred_overlap +
        weights.get("region_match", 0) * region_match +
        weights.get("preferred_main_overlap", 0) * preferred_main_overlap
    )

    # === 7. Select top-N candidates ===
    valid_idx = np.where(scores > 0)[0]
    if valid_idx.size == 0:
        return []

    scores_valid = scores[valid_idx]
    # NOTE: topk is computed before the dynamic threshold below, so it is
    # an upper bound on the returned count, not an exact size.
    topk = min(top_n, valid_idx.size)

    # Optional dynamic thresholding: keep candidates with score >= 50% of max
    max_score = scores_valid.max()
    keep_mask = scores_valid >= max_score * 0.5
    keep_idx = valid_idx[keep_mask]

    if keep_idx.size == 0:
        return []

    # Sort the surviving candidates by score, best first.
    order = np.argsort(scores[keep_idx])[::-1]
    top_idx = keep_idx[order[:topk]]

    # Return the original recipe dicts corresponding to the top candidates
    return [recipes[i] for i in top_idx]
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def rule_generate_candidates(df, user_parents, user_profile):
    """
    Step 3: Rule-based reranking of coarse candidates (vectorized).
    This replaces the slow df.apply(score) loop with one-shot feature extraction.

    Args:
        df: DataFrame of coarse candidates; *_parent columns hold sets.
        user_parents: iterable of the user's pantry parent ingredients.
        user_profile: profile dict; "low_calorie" / "high_protein" /
            "low_fat" flags toggle the corresponding score adjustments.

    Returns:
        Copy of ``df`` with a "match_score" column, filtered to strictly
        positive scores and sorted descending; may be empty.
    """

    if df.empty:
        return df

    # Hoisted loop invariant: set(user_parents) used to be rebuilt three
    # times for every row.
    user_set = set(user_parents)

    # Build per-recipe dicts in the schema extract_features expects.
    recipes_for_inference = []
    for _, row in df.iterrows():
        main = row.get("main_parent", set())
        staple = row.get("staple_parent", set())
        other = row.get("other_parent", set())
        recipes_for_inference.append({
            "main": main,
            "staple": staple,
            "other": other,
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(main & user_set),
            "matched_staple": len(staple & user_set),
            "matched_other": len(other & user_set),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        })

    feats_df = extract_features(recipes_for_inference, user_profile)

    # Base score: weighted pantry coverage.
    scores = (
        2.0 * feats_df["main_match_ratio"] +
        1.0 * feats_df["staple_match_ratio"] +
        1.0 * feats_df["other_match_ratio"]
    )

    # Optional nutrition adjustments driven by profile flags.
    if user_profile.get("low_calorie", False):
        scores += 0.5 * feats_df["low_calorie_penalty"]

    if user_profile.get("high_protein", False):
        scores += 0.3 * (feats_df["protein_ratio"] > 0.25)

    if user_profile.get("low_fat", False):
        scores -= 0.3 * (feats_df["fat_ratio"] > 0.35)

    # Always-on preference signals.
    scores += 0.5 * feats_df["region_match"]
    scores += 0.4 * feats_df["preferred_course_overlap"]
    scores += 0.3 * feats_df["preferred_main_overlap"]
    scores += 0.3 * feats_df["within_cooking_time"]
    scores -= 0.2 * feats_df["missing_main_count"]

    df = df.copy()
    # Clamp at zero so the positivity filter below also drops net-negative rows.
    df["match_score"] = np.maximum(scores, 0.0)

    df = df[df["match_score"] > 0]
    if df.empty:
        return df

    df = df.sort_values("match_score", ascending=False).reset_index(drop=True)
    return df
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def ml_generate_candidates(coarse_candidates, user_parents, user_profile, model_path, topk=5):
    """
    Step 3: ML-based reranking (directly after Step 2).
    Instead of rule-based prefiltering, use the coarse-ranked candidates (Step 2 output),
    build features in the same format as training, and apply the trained ML model to rerank.

    Args:
        coarse_candidates: list of recipe dicts (coarse_rank_candidates
            output) or a DataFrame with the same columns.
        user_parents: iterable of the user's pantry parent ingredients.
        user_profile: profile dict forwarded to extract_features.
        model_path: path to a joblib-serialized sklearn-style model.
        topk: number of top candidates to return.

    Returns:
        DataFrame of top-k candidates sorted by "ml_score" (min-max
        normalized to [0, 1] unless all scores are equal); empty
        DataFrame when there is nothing to rank.
    """

    # Handle empty input
    if coarse_candidates is None or len(coarse_candidates) == 0:
        print("No candidates provided for ML reranking.")
        return pd.DataFrame()

    # If input is a list of dicts (from coarse_rank_candidates), convert to DataFrame
    if isinstance(coarse_candidates, list):
        df = pd.DataFrame(coarse_candidates)
    else:
        df = coarse_candidates.copy()

    if df.empty:
        print("Coarse candidates DataFrame is empty.")
        return df

    # Load trained model.
    # NOTE(review): the model is re-loaded from disk on every call; cache it
    # at module level if this function sits on a hot path.
    model = joblib.load(model_path)

    # Hoisted loop invariant: set(user_parents) used to be rebuilt three
    # times for every row.
    user_set = set(user_parents)

    # Build feature rows in the same schema used at training time.
    recipes_for_inference = []
    for _, row in df.iterrows():
        main = row.get("main_parent", set())
        staple = row.get("staple_parent", set())
        other = row.get("other_parent", set())
        recipes_for_inference.append({
            "main": main,
            "staple": staple,
            "other": other,
            "seasoning": row.get("seasoning_parent", set()),
            "matched_main": len(main & user_set),
            "matched_staple": len(staple & user_set),
            "matched_other": len(other & user_set),
            "calories": row.get("calories", 0),
            "protein": row.get("protein", 0),
            "fat": row.get("fat", 0),
            "region": row.get("region", ""),
            "cuisine_attr": row.get("cuisine_attr", []),
            "ingredients": row.get("ingredients", []),
            "minutes": row.get("minutes", None),
        })

    feature_df = extract_features(recipes_for_inference, user_profile)

    # Predict ML scores (positive-class probability when available).
    if hasattr(model, "predict_proba"):
        df["ml_score"] = model.predict_proba(feature_df)[:, 1]
    else:
        df["ml_score"] = model.predict(feature_df)

    # Min-max normalize to [0, 1]; skipped when all scores are equal to
    # avoid division by zero.
    if len(df) > 0 and df["ml_score"].max() > df["ml_score"].min():
        df["ml_score"] = (df["ml_score"] - df["ml_score"].min()) / (df["ml_score"].max() - df["ml_score"].min())

    # Sort by ML score and return top-k candidates
    return df.sort_values("ml_score", ascending=False).head(topk).reset_index(drop=True)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
|
recipe_recommendation/src/coldstart.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import ast
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
from .candidate import coarse_rank_candidates, hard_filter, rule_generate_candidates
|
| 11 |
+
from .feature import build_features
|
| 12 |
+
from .io import load_recipes_csv, load_ingredient_map
|
| 13 |
+
|
| 14 |
+
# Module-level data loaded once at import time.
# NOTE(review): load_recipes_csv() appears to return a *path*, not a
# DataFrame (RECIPES_PATH is passed to pd.read_csv inside
# cold_start_ranker) -- confirm against io.load_recipes_csv.
RECIPES_PATH = load_recipes_csv()
INGREDIENT_MAP = load_ingredient_map()
# Parent-category and child-ingredient lookups; not referenced by the
# code visible in this module, kept for parity with candidate.py.
PARENTS = INGREDIENT_MAP["parents"]
CHILDREN = INGREDIENT_MAP["children"]
|
| 18 |
+
|
| 19 |
+
def parse_list(x):
    """Convert a stringified list into a Python list safely.

    Handles values that are already lists (returned unchanged), NaN/None
    and empty strings (-> []), and stringified literals such as
    "['a', 'b']" (parsed with ast.literal_eval). Anything unparsable
    yields [].
    """
    # FIX: the "already a list" check must come first. pd.isna() on a
    # multi-element list returns an element-wise boolean array, and using
    # that in `if` raises "truth value of an array is ambiguous" -- the
    # previous check order crashed on real list input.
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == "":
        return []
    try:
        return ast.literal_eval(x)
    except Exception:
        return []
|
| 29 |
+
|
| 30 |
+
def parse_set(x):
    """Convert a stringified collection into a Python set safely.

    Sets pass through; lists/tuples are converted; strings are parsed
    with ast.literal_eval (falling back to a singleton of the stripped
    string); NaN/None and "" become the empty set; any other scalar
    becomes a singleton set.
    """
    # FIX: type checks must precede pd.isna(). On a list/tuple with more
    # than one element pd.isna() returns an element-wise boolean array
    # whose truth value is ambiguous, so the previous check order raised
    # ValueError on real collection input.
    if isinstance(x, set):
        return x
    if isinstance(x, (list, tuple)):
        return set(x)
    if isinstance(x, str):
        if x == "":
            return set()
        try:
            v = ast.literal_eval(x)
        except Exception:
            return {x.strip()}
        if isinstance(v, (list, tuple, set)):
            return set(v)
        return {v}
    if pd.isna(x):
        return set()
    return {x}
|
| 47 |
+
|
| 48 |
+
def _parents_pool_from_df(df: pd.DataFrame):
|
| 49 |
+
cols = ["main_parent", "staple_parent", "other_parent", "seasoning_parent"]
|
| 50 |
+
pool = set()
|
| 51 |
+
for c in cols:
|
| 52 |
+
if c in df.columns:
|
| 53 |
+
for s in df[c]:
|
| 54 |
+
pool |= set(s) if isinstance(s, (set, list, tuple)) else set()
|
| 55 |
+
return sorted(pool)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def sample_user_parents(parents_pool,
                        user_profile=None,
                        prev_inventory=None,
                        min_items=3, max_items=10,
                        keep_ratio=0.6, reset_interval=20, round_idx=0):
    """Simulate a user's pantry for one cold-start round.

    Draws a weighted random sample of parent ingredients from
    ``parents_pool``: preferred mains are weighted 3x, disliked and
    forbidden parents are excluded, and (unless this round is a forced
    reset) roughly ``keep_ratio`` of the previous inventory is carried
    over so consecutive rounds look like an evolving pantry.

    Args:
        parents_pool: list of all candidate parent ingredients.
        user_profile: profile dict; reads other_preferences.preferred_main,
            other_preferences.disliked_main and forbidden_parents.
        prev_inventory: pantry returned by the previous round, or None.
        min_items / max_items: bounds for the target pantry size.
        keep_ratio: fraction of the previous inventory carried over.
        reset_interval: every this many rounds the pantry is resampled
            from scratch (round_idx % reset_interval == 0).
        round_idx: index of the current round.

    Returns:
        List of sampled parent ingredient names (order unspecified; the
        final size may be below the target because draws are with
        replacement and duplicates collapse in the set).
    """
    liked = set((user_profile or {}).get("other_preferences", {}).get("preferred_main", []))
    disliked = set((user_profile or {}).get("other_preferences", {}).get("disliked_main", []))
    forbidden = set((user_profile or {}).get("forbidden_parents", [])) | disliked

    # Build the sampling pool with per-item weights (liked mains upweighted).
    pool, weights = [], []
    for p in parents_pool:
        if p in forbidden:
            continue
        w = 3.0 if p in liked else 1.0
        pool.append(p); weights.append(w)
    if not pool:
        # Everything was forbidden -- fall back to the unfiltered pool.
        pool, weights = parents_pool[:], [1.0] * len(parents_pool)

    inventory = set()
    # NOTE: round 0 always counts as a forced reset (0 % reset_interval == 0).
    force_reset = (round_idx % reset_interval == 0)
    if prev_inventory and not force_reset:
        prev_list = list(prev_inventory); random.shuffle(prev_list)
        keep_k = max(0, int(len(prev_list) * keep_ratio))
        inventory |= set(prev_list[:keep_k])

    # Top up with weighted draws until the target size (or pool exhaustion).
    k = random.randint(min_items, max_items)
    remain = max(0, k - len(inventory))
    for _ in range(min(remain, len(pool))):
        idx = random.choices(range(len(pool)), weights=weights, k=1)[0]
        inventory.add(pool[idx])
    return list(inventory)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _weighted_pick3(indexes, scores, temperature=1.0):
|
| 92 |
+
idxs = list(indexes)
|
| 93 |
+
scs = np.array(scores, dtype=float)
|
| 94 |
+
if np.any(scs < 0):
|
| 95 |
+
scs = scs - scs.min()
|
| 96 |
+
if scs.sum() == 0:
|
| 97 |
+
scs = np.ones_like(scs)
|
| 98 |
+
picks = []
|
| 99 |
+
for _ in range(min(3, len(idxs))):
|
| 100 |
+
probs = np.exp(scs / max(temperature, 1e-6))
|
| 101 |
+
probs = probs / probs.sum()
|
| 102 |
+
choice = np.random.choice(len(idxs), p=probs)
|
| 103 |
+
picks.append(idxs[choice])
|
| 104 |
+
idxs.pop(choice)
|
| 105 |
+
scs = np.delete(scs, choice)
|
| 106 |
+
if len(idxs) == 0:
|
| 107 |
+
break
|
| 108 |
+
return picks
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ---------- Main cold-start ----------
|
| 113 |
+
def cold_start_ranker(user_id: str,
                      n_rounds: int = 2000,
                      topn_coarse: int = 5000,
                      topk_rule: int = 3,
                      batch_size: int = 5000,
                      switch_interval: int = 100):
    """
    Cold-start data generation for learning-to-rank.
    Top-5 selection prioritizes user pantry coverage deterministically:
        1. Fully covered recipes first (missing_count == 0)
        2. Then few missing (esp. staple/other)
        3. Heavy penalty for missing main ingredients.

    Args:
        user_id: subdirectory name under user_data / input_user_data
            containing user_profile.json.
        n_rounds: number of simulated pantry rounds (one qid each).
        topn_coarse: cap on Stage-2 coarse recall per round.
        topk_rule: number of rule-ranked candidates labeled per round.
        batch_size: approximate recipes per chunk; rounds rotate through
            chunks every ``switch_interval`` rounds.
        switch_interval: rounds between chunk switches.

    Returns:
        Path of the written user_features_rank.csv, or None when no
        valid training rows were produced.

    NOTE(review): the docstring says "Top-5" but the actual group size is
    ``topk_rule`` (default 3) -- the top5 variable below is misleadingly named.
    """

    # Resolve the user directory, preferring user_data over input_user_data.
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    if not os.path.exists(base_dir):
        base_dir = os.path.join("recipe_recommendation", "input_user_data", user_id)

    if not os.path.exists(base_dir):
        raise FileNotFoundError(
            f"❌ User profile not found for '{user_id}' in either 'recipe_recommendation/user_data' or 'recipe_recommendation/input_user_data'."
        )

    print(f"[cold_start_ranker] Using base_dir = {base_dir}")

    profile_path = os.path.join(base_dir, "user_profile.json")
    features_path = os.path.join(base_dir, "user_features_rank.csv")

    # Idempotency: skip regeneration when the output already exists.
    if os.path.exists(features_path):
        print(f"[cold_start] Features already exist at {features_path}")
        return features_path

    with open(profile_path, "r", encoding="utf-8") as f:
        user_profile = json.load(f)

    # Load and parse recipes
    df_all = pd.read_csv(RECIPES_PATH)
    to_set = ["main_parent", "staple_parent", "other_parent", "seasoning_parent", "cuisine_attr"]
    to_list = ["ingredients"]
    for c in to_set:
        if c in df_all.columns:
            df_all[c] = df_all[c].apply(parse_set)
    for c in to_list:
        if c in df_all.columns:
            df_all[c] = df_all[c].apply(parse_list)

    # Step 1 hard filter
    # NOTE(review): `hard_filter is not None` is always true for an
    # imported name; the guard is effectively dead.
    if hard_filter is not None:
        try:
            before = len(df_all)
            mask = df_all.apply(lambda r: hard_filter(r.to_dict(), user_profile), axis=1)
            df_all = df_all[mask]
            after = len(df_all)
            print(f"[cold_start] Step1 hard filter applied: {before} -> {after}")
        except Exception as e:
            # Best-effort: a failing filter should not abort data generation.
            warnings.warn(f"[cold_start] hard_filter failed, skip. err={e}")

    # Split the catalog into chunks; rounds rotate through them so each
    # round only scores a slice of the full catalog.
    n_chunks = (len(df_all) // batch_size) + 1
    chunks = np.array_split(df_all, n_chunks)
    parents_pool = _parents_pool_from_df(df_all)
    rows = []
    prev_inventory = None

    for i in tqdm(range(n_rounds), desc="Cold-start rounds"):
        chunk_id = (i // switch_interval) % n_chunks
        df_chunk = chunks[chunk_id].copy()

        # pantry sampling
        user_parents = sample_user_parents(
            parents_pool,
            user_profile=user_profile,
            prev_inventory=prev_inventory,
            round_idx=i
        )
        prev_inventory = user_parents

        # Step 2: coarse recall
        coarse_list = coarse_rank_candidates(
            recipes=df_chunk.to_dict(orient="records"),
            user_parents=user_parents,
            user_profile=user_profile,
            top_n=min(topn_coarse, len(df_chunk))
        )
        if not coarse_list:
            continue

        coarse_df = pd.DataFrame(coarse_list)

        # Step 3: rule rerank → Top-5 candidates (just for selecting the 5)
        rule_df = rule_generate_candidates(
            coarse_df,
            user_parents=user_parents,
            user_profile=user_profile
        )
        if rule_df.empty or len(rule_df) < topk_rule:
            continue

        # Despite the name, this holds topk_rule rows (see NOTE in docstring).
        top5 = rule_df.head(topk_rule).copy()

        # ===== Deterministic scoring with feasibility + region + soft constraints =====
        user_set = set(user_parents)
        scored_candidates = []

        # Nutrition goals (from profile)
        ng = user_profile.get("nutritional_goals", {})
        cal_min = ng.get("calories", {}).get("min", 0)
        cal_max = ng.get("calories", {}).get("max", 1e9)
        pro_min = ng.get("protein", {}).get("min", 0)
        pro_max = ng.get("protein", {}).get("max", 1e9)

        # Preferences
        liked = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
        disliked = set(user_profile.get("other_preferences", {}).get("disliked_main", []))
        max_cooking_time = user_profile.get("other_preferences", {}).get("cooking_time_max", None)

        for idx, row in top5.iterrows():
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            # NOTE(review): other_set is computed but never used in the
            # scoring below.
            other_set = set(row.get("other_parent", set()))

            main_total = len(main_set)
            staple_total = len(staple_set)
            main_match = len(main_set & user_set)
            staple_match = len(staple_set & user_set)

            # === 1) Feasibility check ===
            total_needed = max(1, main_total + staple_total)
            total_have = main_match + staple_match
            coverage_ratio = total_have / total_needed

            # Candidates covering less than half of main+staple are unlabeled.
            if coverage_ratio < 0.5:
                continue

            # === 2) Region preference ===
            region_score = 1.0 if row.get("region_match", 0) else 0.0

            # === 3) Cooking time soft constraint ===
            # 1.0 within ±20% of the user's max; linear decay with relative
            # deviation otherwise; 1.0 when no constraint applies.
            time_val = row.get("minutes", None)
            time_score = 0.0
            if max_cooking_time and time_val is not None:
                try:
                    t_val = float(time_val)
                    t_max = float(max_cooking_time)
                    lower_bound = 0.8 * t_max
                    upper_bound = 1.2 * t_max
                    if lower_bound <= t_val <= upper_bound:
                        time_score = 1.0
                    else:
                        deviation = abs(t_val - t_max) / t_max
                        time_score = max(0.0, 1.0 - deviation)
                except (TypeError, ValueError):
                    time_score = 0.0
            else:
                time_score = 1.0

            # === 4) Calories soft constraint ===
            # 1.0 within ±30% of the goal-range midpoint; linear decay otherwise.
            cal_val = row.get("calories", None)
            cal_score = 1.0
            if cal_val is not None and cal_min < cal_max:
                try:
                    c_val = float(cal_val)
                    cal_center = 0.5 * (cal_min + cal_max)
                    tol = 0.3 * cal_center
                    lower_bound = cal_center - tol
                    upper_bound = cal_center + tol
                    if lower_bound <= c_val <= upper_bound:
                        cal_score = 1.0
                    else:
                        deviation = abs(c_val - cal_center) / cal_center
                        cal_score = max(0.0, 1.0 - deviation)
                except (TypeError, ValueError):
                    cal_score = 0.0

            # === 4b) Protein soft constraint ===
            # Same shape as calories but with a tighter ±20% tolerance.
            protein_val = row.get("protein", None)
            protein_score = 1.0
            if protein_val is not None and pro_min < pro_max:
                try:
                    p_val = float(protein_val)
                    pro_center = 0.5 * (pro_min + pro_max)
                    tol = 0.2 * pro_center
                    lower_bound = pro_center - tol
                    upper_bound = pro_center + tol
                    if lower_bound <= p_val <= upper_bound:
                        protein_score = 1.0
                    else:
                        deviation = abs(p_val - pro_center) / pro_center
                        protein_score = max(0.0, 1.0 - deviation)
                except (TypeError, ValueError):
                    protein_score = 0.0

            # === 5) Liked / Disliked main ===
            like_bonus = 1.0 if main_set & liked else 0.0
            dislike_penalty = 1.0 if main_set & disliked else 0.0

            # === 6) Final scoring ===
            score = (
                0.5 * coverage_ratio +
                0.15 * region_score +
                0.1 * time_score +
                0.1 * cal_score +
                0.05 * protein_score +
                0.05 * like_bonus -
                0.05 * dislike_penalty
            )

            scored_candidates.append((idx, score))

        # Sort and pick top3 for relevance
        scored_candidates.sort(key=lambda x: x[1], reverse=True)
        picked_idxs = [idx for idx, _ in scored_candidates[:3]]

        # relevance labels 3 / 2 / 1 (everything else in the group gets 0)
        labels = {idx: 0 for idx in top5.index}
        if len(picked_idxs) > 0:
            labels[picked_idxs[0]] = 3
        if len(picked_idxs) > 1:
            labels[picked_idxs[1]] = 2
        if len(picked_idxs) > 2:
            labels[picked_idxs[2]] = 1

        # build features for all 5 candidates
        for idx, row in top5.iterrows():
            up = set(user_parents)
            main_set = set(row.get("main_parent", set()))
            staple_set = set(row.get("staple_parent", set()))
            other_set = set(row.get("other_parent", set()))

            recipe_dict = {
                "main": main_set,
                "staple": staple_set,
                "other": other_set,
                "seasoning": set(row.get("seasoning_parent", set())),
                "matched_main": len(main_set & up),
                "matched_staple": len(staple_set & up),
                "matched_other": len(other_set & up),
                "calories": row.get("calories", 0),
                "protein": row.get("protein", 0),
                "fat": row.get("fat", 0),
                "region": row.get("region", ""),
                "cuisine_attr": row.get("cuisine_attr", []),
                "ingredients": row.get("ingredients", []),
                "minutes": row.get("minutes", None),
            }

            feats = build_features(recipe_dict, user_profile)
            feats["relevance"] = float(labels[idx])
            # The round index serves as the learning-to-rank query id.
            feats["qid"] = int(i)
            rows.append(feats)

    out = pd.DataFrame(rows)
    if "qid" not in out.columns or out.empty:
        print(f"[cold_start] No valid training data generated for {user_id}, skipping save.")
        return None

    # Drop degenerate groups: a qid with a single row cannot be ranked.
    valid_qids = out.groupby("qid").size()
    keep_qids = valid_qids[valid_qids > 1].index
    out = out[out["qid"].isin(keep_qids)].reset_index(drop=True)

    os.makedirs(base_dir, exist_ok=True)
    out_path = os.path.join(base_dir, "user_features_rank.csv")
    out.to_csv(out_path, index=False)
    print(f"[cold_start] Saved {len(out)} rows to {out_path}")
    return out_path
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
if __name__ == "__main__":
    # Manual smoke-run entry point.
    # FIX: the previous invocation passed coverage_penalty=0.15 and
    # temperature=0.5, which are not parameters of cold_start_ranker and
    # raised "TypeError: got an unexpected keyword argument" before any
    # work started. They have been removed.
    cold_start_ranker(
        user_id="user_1",
        n_rounds=10000,
        topn_coarse=20000,
        topk_rule=5,
    )
|
recipe_recommendation/src/embedding.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
|
| 6 |
+
def profile_to_embedding(profile):
    """
    Convert a normalized user profile into a fixed-length numeric embedding.

    Embedding layout (concatenated in this order):
        [diet one-hot (3)] + [allergies multi-hot (6)] + [regions multi-hot (6)] +
        [nutritional goals, normalized (4)] + [preferred_main multi-hot (8)] +
        [cooking_time_max, normalized (1)]
    Total dim = 28.
    """
    parts = []

    # 1. Diet (one-hot); unknown values produce an all-zero slot.
    diet_types = ["vegetarian", "flexible", "non_vegetarian"]
    chosen_diet = profile.get("diet", {}).get("vegetarian_type", "flexible")
    parts.append(np.array([1.0 if d == chosen_diet else 0.0 for d in diet_types]))

    # 2. Allergies (multi-hot over a fixed vocabulary).
    allergy_vocab = ["milk", "gluten", "peanut", "shrimp", "egg", "soy"]
    user_allergies = set(profile.get("allergies", []))
    parts.append(np.array([int(a in user_allergies) for a in allergy_vocab]))

    # 3. Region preferences (multi-hot over a fixed vocabulary).
    region_vocab = ["North America", "Latin America", "Europe", "Asia", "Middle East", "Africa"]
    user_regions = set(profile.get("region_preference", []))
    parts.append(np.array([int(r in user_regions) for r in region_vocab]))

    # 4. Nutritional goals, scaled to roughly [0, 1]
    #    (calories capped at 4000 kcal, protein at 300 g).
    goals = profile.get("nutritional_goals", {})
    cal = goals.get("calories", {})
    pro = goals.get("protein", {})
    parts.append(np.array([
        cal.get("min", 0) / 4000,
        min(cal.get("max", 9999), 4000) / 4000,
        pro.get("min", 0) / 300,
        min(pro.get("max", 999), 300) / 300,
    ]))

    # 5. Preferred main ingredients (multi-hot over a fixed vocabulary).
    main_vocab = ["chicken", "tofu", "beef", "salmon", "eggs", "pork", "beans", "mushroom"]
    liked_mains = set(profile.get("other_preferences", {}).get("preferred_main", []))
    parts.append(np.array([int(m in liked_mains) for m in main_vocab]))

    # 6. Max cooking time, normalized to [0, 1] with a 120-minute upper bound;
    #    an unset preference maps to 0.
    max_minutes = profile.get("other_preferences", {}).get("cooking_time_max")
    if max_minutes is None:
        parts.append(np.array([0]))
    else:
        parts.append(np.array([min(max_minutes / 120, 1)]))

    return np.concatenate(parts)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def profile_similarity(profile_a, profile_b):
    """Compute cosine similarity between two user profiles."""
    vec_a = profile_to_embedding(profile_a)
    vec_b = profile_to_embedding(profile_b)
    # cosine_similarity expects 2-D inputs; return the single scalar cell.
    return cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))[0, 0]
|
| 67 |
+
|
| 68 |
+
def find_most_similar_user(target_user_id, user_data_dir="recipe_recommendation/user_data", threshold=0.85):
    """
    Find the most similar existing user based on profile embeddings.
    Returns (best_match_user_id, similarity_score) or (None, -1) if no match.

    Args:
        target_user_id: directory name of the user to match.
        user_data_dir: root directory containing one sub-directory per user,
            each expected to hold a ``user_profile.json`` file.
        threshold: minimum cosine similarity required to report a match.

    Raises:
        FileNotFoundError: if the target user has no stored profile.
    """
    target_profile_path = os.path.join(user_data_dir, target_user_id, "user_profile.json")
    if not os.path.exists(target_profile_path):
        raise FileNotFoundError(f"[embedding] No profile found for user {target_user_id}")

    with open(target_profile_path, "r", encoding="utf-8") as f:
        target_profile = json.load(f)
    target_emb = profile_to_embedding(target_profile).reshape(1, -1)

    best_match, best_score = None, -1

    # Linear scan over every other user directory; entries without a
    # profile file (or the target itself) are skipped.
    for uid in os.listdir(user_data_dir):
        if uid == target_user_id:
            continue
        profile_path = os.path.join(user_data_dir, uid, "user_profile.json")
        if not os.path.exists(profile_path):
            continue
        with open(profile_path, "r", encoding="utf-8") as f:
            other_profile = json.load(f)
        other_emb = profile_to_embedding(other_profile).reshape(1, -1)
        sim = cosine_similarity(target_emb, other_emb)[0, 0]
        if sim > best_score:
            best_match, best_score = uid, sim

    # Only report a match when it clears the threshold; otherwise signal
    # "no usable neighbour" with (None, -1), even if some best match exists.
    if best_match and best_score >= threshold:
        print(f"[embedding] Found similar user: {best_match} (similarity={best_score:.3f})")
        return best_match, best_score

    return None, -1
|
recipe_recommendation/src/feature.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from .io import load_ingredient_map
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
# Load ingredient map globally to avoid repeated I/O
# (fetched once from the Hugging Face dataset repo via src.io at import time).
INGREDIENT_MAP = load_ingredient_map()
PARENTS = INGREDIENT_MAP["parents"]      # parent-level ingredient metadata
CHILDREN = INGREDIENT_MAP["children"]    # child-level ingredient metadata

# Canonical, ordered list of feature columns produced by build_features().
# extract_features() reindexes to this exact list, and the ranking model
# is trained against these columns in this order — do not reorder.
FEATURE_COLS = [
    "main_match_ratio", "other_match_ratio", "staple_match_ratio",
    "missing_main_count", "missing_other_count", "missing_staple_count",
    "calories", "protein", "fat", "protein_ratio", "fat_ratio",
    "region_match",
    "is_vegan_safe", "is_vegetarian_safe_absolute", "is_flexible_safe_absolute", "is_user_diet_safe",
    "preferred_main_overlap", "disliked_main_overlap",
    "preferred_course_overlap",
    "within_cooking_time", "cooking_time_over",
    "calories_value", "calories_deviation",
    "protein_value", "protein_deviation",
]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def extract_features(recipes: list[dict], user_profile: dict):
    """
    Convert a list of recipes into a feature matrix for the ML model.

    Args:
        recipes: recipe dicts as produced by the candidate pipeline.
        user_profile: normalized user profile dict.

    Returns:
        pd.DataFrame with columns exactly FEATURE_COLS (missing features
        filled with 0), one row per recipe, in input order.
    """
    rows = [build_features(r, user_profile) for r in recipes]
    df = pd.DataFrame(rows)
    # reindex guarantees both the names AND the order of the columns match
    # FEATURE_COLS, so the downstream ranker always sees a stable layout.
    # (The former `assert` re-checking this was dead code by construction
    # and would be stripped under `python -O` anyway.)
    return df.reindex(columns=FEATURE_COLS, fill_value=0)
|
| 35 |
+
|
| 36 |
+
def is_recipe_vegetarian_safe(ingredients: list[str], veg_type: str) -> bool:
    """
    Check if the recipe is safe for a given dietary type.
    Supported veg_type: "vegan", "vegetarian", "flexible_vegetarian", "" (none).
    Any other value (including the empty string) imposes no restriction.
    """
    for raw in ingredients:
        name = raw.strip().lower()
        # Child entries take precedence over parent entries; an ingredient
        # missing from both maps is treated as safe by default.
        info = CHILDREN.get(name)
        if info is None:
            info = PARENTS.get(name)
        if info is None:
            continue

        if veg_type == "vegan":
            if not info.get("vegan_safe", True):
                return False
        elif veg_type in ("vegetarian", "flexible_vegetarian"):
            # Flexible vegetarians allow most ingredients except explicit
            # meat, so vegetarian_safe doubles as the flexibility proxy.
            if not info.get("vegetarian_safe", True):
                return False
    return True
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def build_features(recipe: dict, user_profile: dict) -> dict:
    """
    Build a feature dictionary for ML ranker and rule-based scoring.
    All features are numeric scalars or counts.

    Args:
        recipe: candidate recipe dict. Keys read: main/staple/other
            (ingredient collections), matched_main/matched_staple/matched_other
            (pantry-overlap counts), calories/protein/fat, region,
            cuisine_attr, ingredients, minutes.
        user_profile: normalized user profile dict. Keys read:
            region_preference, diet.vegetarian_type,
            other_preferences.{preferred_main,disliked_main,cooking_time_max},
            preferred_course_types, nutritional_goals.{calories,protein}.

    Returns:
        dict of numeric features; keys are a superset of FEATURE_COLS.
    """
    features = {}

    # ======================================================
    # 1. Ingredient matching ratios
    # ======================================================
    # Ratios use max(total, 1) so an empty ingredient group yields 0,
    # not a ZeroDivisionError.
    total_main = len(recipe.get("main", []))
    total_other = len(recipe.get("other", []))
    total_staple = len(recipe.get("staple", []))

    features["main_match_ratio"] = recipe.get("matched_main", 0) / max(total_main, 1)
    features["other_match_ratio"] = recipe.get("matched_other", 0) / max(total_other, 1)
    features["staple_match_ratio"] = recipe.get("matched_staple", 0) / max(total_staple, 1)

    features["missing_main_count"] = total_main - recipe.get("matched_main", 0)
    features["missing_other_count"] = total_other - recipe.get("matched_other", 0)
    features["missing_staple_count"] = total_staple - recipe.get("matched_staple", 0)

    # ======================================================
    # 2. Basic nutrition info
    # ======================================================
    # `or 0.0` coerces None (and other falsy values) to 0.0.
    calories = recipe.get("calories", 0.0) or 0.0
    protein = recipe.get("protein", 0.0) or 0.0
    fat = recipe.get("fat", 0.0) or 0.0

    features["calories"] = calories
    features["protein"] = protein
    features["fat"] = fat
    # Per-calorie densities; max(calories, 1) guards the zero-calorie case.
    features["protein_ratio"] = protein / max(calories, 1)
    features["fat_ratio"] = fat / max(calories, 1)

    # ======================================================
    # 3. Region preference
    # ======================================================
    # Region may arrive as None, "", NaN (from pandas), a scalar string,
    # or a collection — normalize all of these to a list first.
    recipe_region = recipe.get("region", [])
    if recipe_region is None or recipe_region == "" or (isinstance(recipe_region, float) and np.isnan(recipe_region)):
        recipe_regions = []
    elif isinstance(recipe_region, (set, list, tuple)):
        recipe_regions = list(recipe_region)
    else:
        recipe_regions = [recipe_region]

    user_regions = user_profile.get("region_preference", [])
    if isinstance(user_regions, str):
        user_regions = [user_regions]

    # Case/whitespace-insensitive comparison on both sides.
    recipe_regions_norm = {str(r).strip().lower() for r in recipe_regions if r}
    user_regions_norm = {str(r).strip().lower() for r in user_regions if r}

    features["region_match"] = int(len(recipe_regions_norm & user_regions_norm) > 0)

    # ======================================================
    # 4. Diet constraints
    # ======================================================
    ingredients_all = recipe.get("ingredients", [])

    # Absolute diet-safety flags are computed for every recipe so the
    # ranker can learn diet preferences independent of the current user.
    features["is_vegan_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegan"))
    features["is_vegetarian_safe_absolute"] = int(is_recipe_vegetarian_safe(ingredients_all, "vegetarian"))
    features["is_flexible_safe_absolute"] = int(is_recipe_vegetarian_safe(ingredients_all, "flexible_vegetarian"))

    veg_type = (user_profile.get("diet", {}).get("vegetarian_type", "") or "").lower()
    features["is_user_diet_safe"] = int(is_recipe_vegetarian_safe(ingredients_all, veg_type))

    # ======================================================
    # 5. Preferred & disliked main
    # ======================================================
    recipe_main = set(recipe.get("main", []))
    preferred_main = set(user_profile.get("other_preferences", {}).get("preferred_main", []))
    disliked_main = set(user_profile.get("other_preferences", {}).get("disliked_main", []))

    # Binary overlap indicators (not counts).
    features["preferred_main_overlap"] = 1.0 if recipe_main & preferred_main else 0.0
    features["disliked_main_overlap"] = 1.0 if recipe_main & disliked_main else 0.0

    # ======================================================
    # 6. Course type preference
    # ======================================================
    recipe_types = set(recipe.get("cuisine_attr", []))
    preferred_types = set(user_profile.get("preferred_course_types", []))
    # Count of overlapping course types (unlike the binary flags above).
    features["preferred_course_overlap"] = len(recipe_types & preferred_types)

    # ======================================================
    # 7. Cooking time features
    # ======================================================
    max_time = user_profile.get("other_preferences", {}).get("cooking_time_max", None)
    recipe_time = recipe.get("minutes", None)

    if max_time is not None and recipe_time is not None:
        try:
            recipe_time_val = float(recipe_time)
            max_time_val = float(max_time)
            features["within_cooking_time"] = 1.0 if recipe_time_val <= max_time_val else 0.0
            features["cooking_time_over"] = max(0.0, recipe_time_val - max_time_val)
        except (TypeError, ValueError):
            # Unparseable time values are treated as over-limit (0.0 flag).
            features["within_cooking_time"] = 0.0
            features["cooking_time_over"] = 0.0
    else:
        # No limit or unknown duration -> assume the recipe fits.
        features["within_cooking_time"] = 1.0
        features["cooking_time_over"] = 0.0

    # ======================================================
    # 8. Calories / Protein deviation features
    # ======================================================
    # Deviation is measured relative to the midpoint of the user's goal
    # range; only computed when a non-degenerate range (min < max) exists.
    ng = user_profile.get("nutritional_goals", {})
    cal_min = ng.get("calories", {}).get("min", 0)
    cal_max = ng.get("calories", {}).get("max", 1e9)
    pro_min = ng.get("protein", {}).get("min", 0)
    pro_max = ng.get("protein", {}).get("max", 1e9)

    # --- Calories deviation ---
    if calories is not None and cal_min < cal_max:
        try:
            cal_center = 0.5 * (cal_min + cal_max)
            features["calories_value"] = float(calories)
            features["calories_deviation"] = (float(calories) - cal_center) / cal_center
        except (TypeError, ValueError):
            features["calories_value"] = 0.0
            features["calories_deviation"] = 0.0
    else:
        features["calories_value"] = 0.0
        features["calories_deviation"] = 0.0

    # --- Protein deviation ---
    if protein is not None and pro_min < pro_max:
        try:
            pro_center = 0.5 * (pro_min + pro_max)
            features["protein_value"] = float(protein)
            features["protein_deviation"] = (float(protein) - pro_center) / pro_center
        except (TypeError, ValueError):
            features["protein_value"] = 0.0
            features["protein_deviation"] = 0.0
    else:
        features["protein_value"] = 0.0
        features["protein_deviation"] = 0.0

    return features
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def build_cluster_features(candidates):
    """
    Build simple ingredient + cuisine based feature vectors for KMeans clustering.
    This is separate from model training features.

    Args:
        candidates (list[dict]): list of recipe dicts.

    Returns:
        np.ndarray: multi-hot feature matrix (num_candidates, num_features)
    """
    field_names = ("main_parent", "staple_parent", "other_parent", "cuisine_attr")

    # 1. Collect a sorted vocabulary per field.
    vocabs = []
    for field in field_names:
        values = set()
        for rec in candidates:
            values.update(rec.get(field, []) or [])
        vocabs.append(sorted(values))

    # 2. Build per-field token -> global-column index maps, each field's
    #    columns offset past the previous fields' vocabularies.
    index_maps = []
    offset = 0
    for vocab in vocabs:
        index_maps.append({token: offset + i for i, token in enumerate(vocab)})
        offset += len(vocab)

    # 3. Fill the multi-hot feature matrix.
    X = np.zeros((len(candidates), offset), dtype=np.uint8)
    for row, rec in enumerate(candidates):
        for field, idx_map in zip(field_names, index_maps):
            for token in rec.get(field, []) or []:
                if token in idx_map:
                    X[row, idx_map[token]] = 1
    return X
|
recipe_recommendation/src/highlight.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.cluster import KMeans
|
| 3 |
+
from sklearn.preprocessing import StandardScaler
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def print_candidates(candidates, user_parents, topk=10):
    """
    Pretty-print the top-k candidate recipes with pantry-match markers.

    Args:
        candidates: DataFrame sorted by 'match_score' (descending), with
            recipe columns such as name/region/cuisine_attr/calories and
            the *_parent ingredient lists.
        user_parents: collection of parent-level ingredients the user has;
            each ingredient is marked available (✅) or missing (❌).
        topk: number of rows to display.
    """
    # Scores are displayed relative to the best candidate (100% = top score);
    # the epsilon guards against a zero max score.
    # (Removed dead code: `min_score` and the `shown` counter were never used.)
    max_score = candidates['match_score'].max()

    for _, row in candidates.head(topk).iterrows():
        scaled_score = 100 * row['match_score'] / (max_score + 1e-9)
        print(f"{row['name']} (score {scaled_score:.1f}%)")

        # ----- Region -----
        region = row.get("region", None)
        if pd.notna(region) and isinstance(region, str) and region.strip() and region.lower() != "unavailable":
            print(f" region: {region}")

        # ----- Cuisine Attributes -----
        cuisine = row.get("cuisine_attr", None)
        if cuisine is not None and not (isinstance(cuisine, float) and pd.isna(cuisine)):
            # Normalize set/str to a list for printing
            if isinstance(cuisine, set):
                cuisine = list(cuisine)
            elif isinstance(cuisine, str):
                cuisine = [cuisine]

            if isinstance(cuisine, list) and len(cuisine) > 0:
                print(f" cuisine: {', '.join(cuisine)}")

        # ----- Nutrition -----
        print(f" calories: {row.get('calories', 'N/A')}")

        # ----- Ingredient Marking -----
        def mark_list(lst):
            return [("✅ " + ing) if ing in user_parents else ("❌ " + ing) for ing in lst]

        print(f" staple: {mark_list(row.get('staple_parent', []))}")
        print(f" main: {mark_list(row.get('main_parent', []))}")
        print(f" seasoning: {row.get('seasoning_parent', [])}")
        print(f" other: {mark_list(row.get('other_parent', []))}")
        print("-" * 40)
|
| 47 |
+
|
| 48 |
+
def diversify_topk_with_min_clusters(
    ranked_candidates,
    feature_matrix,
    top_k=5,
    n_clusters=20,
    min_clusters=3,
    random_state=42
):
    """
    Diversify top-k displayed recipes using KMeans clustering.
    Ensures that the final top_k contains at least `min_clusters` distinct clusters.

    Args:
        ranked_candidates: candidates ordered best-first; rank order is preserved.
        feature_matrix: array-like (len(ranked_candidates), n_features), used
            only for clustering.
        top_k: number of candidates to return.
        n_clusters: clusters to fit (capped at the number of candidates).
        min_clusters: minimum number of distinct clusters represented in the result.
        random_state: KMeans seed for reproducibility.

    Returns:
        list of selected candidates (at most top_k).
    """
    if len(ranked_candidates) == 0:
        return []

    n_clusters = min(n_clusters, len(ranked_candidates))
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(feature_matrix)

    # KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    cluster_ids = kmeans.fit_predict(X_scaled)

    # Step 1: walk down the ranking, taking the first candidate of each new
    # cluster until min_clusters are covered (or top_k is already full).
    picked = []
    picked_idx = set()       # positions already taken
    picked_clusters = set()
    for i, c in enumerate(cluster_ids):
        if c not in picked_clusters:
            picked.append(ranked_candidates[i])
            picked_idx.add(i)
            picked_clusters.add(c)
            if len(picked_clusters) >= min_clusters or len(picked) >= top_k:
                break

    # Step 2: fill the rest purely by rank order.
    # Bug fix: the previous `ranked_candidates[i] not in picked` dedup used
    # object equality — O(k) per check, and it could silently drop a distinct
    # recipe that happened to compare equal to one already picked. Tracking
    # picked *positions* is O(1) and identity-safe.
    if len(picked) < top_k:
        for i in range(len(ranked_candidates)):
            if i not in picked_idx:
                picked.append(ranked_candidates[i])
                picked_idx.add(i)
                if len(picked) >= top_k:
                    break

    return picked
|
| 90 |
+
|
| 91 |
+
|
recipe_recommendation/src/io.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from huggingface_hub import hf_hub_download
|
| 4 |
+
|
| 5 |
+
# Hugging Face dataset repo ID hosting the cleaned recipe data.
REPO_ID = "Iris314/recipe-cleaned"

# Local cache directory: <package root>/data, created eagerly on import.
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
DATA_DIR = os.path.join(ROOT_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def download_file(filename: str) -> str:
    """
    Ensure `filename` from the REPO_ID dataset exists locally; return its path.

    Downloads into DATA_DIR on first use; later calls reuse the local copy.

    Args:
        filename: file name inside the Hugging Face dataset repo.

    Returns:
        Path to the local copy under DATA_DIR.
    """
    local_path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(local_path):
        # Bug fix: both messages were f-strings without a placeholder and
        # printed a literal "(unknown)" instead of the actual file name.
        print(f"Downloading {filename} from Hugging Face Hub...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=filename,
            repo_type="dataset",
            local_dir=DATA_DIR,
            local_dir_use_symlinks=False  # NOTE: deprecated/ignored in recent huggingface_hub
        )
    else:
        print(f"{filename} already exists locally.")
    return local_path
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def load_recipes_csv() -> str:
    """Return the local path of the recipes CSV, downloading it if needed."""
    return download_file("recipes.csv")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_ingredient_map() -> dict:
    """Download (if needed) and parse the JSON-encoded ingredient map."""
    local_path = download_file("ingredient_map.data")
    # The .data extension is historical; the file content is plain JSON.
    with open(local_path, "r", encoding="utf-8") as fh:
        return json.load(fh)
|
recipe_recommendation/src/trainmodel.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import joblib
|
| 3 |
+
import warnings
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from typing import List, Tuple, Sequence, Optional
|
| 7 |
+
from xgboost import XGBRanker
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.metrics import ndcg_score
|
| 10 |
+
from pandas.api.types import is_numeric_dtype
|
| 11 |
+
from .feature import FEATURE_COLS
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ----------------------------- Helpers -----------------------------
|
| 17 |
+
def _pick_feature_cols(df: pd.DataFrame, drop_cols: Sequence[str]) -> List[str]:
|
| 18 |
+
"""
|
| 19 |
+
Pick numeric feature columns robustly, excluding drop_cols.
|
| 20 |
+
Uses pandas is_numeric_dtype to correctly include nullable ints/floats/bools.
|
| 21 |
+
"""
|
| 22 |
+
cols = []
|
| 23 |
+
for c in df.columns:
|
| 24 |
+
if c in drop_cols:
|
| 25 |
+
continue
|
| 26 |
+
if is_numeric_dtype(df[c]):
|
| 27 |
+
cols.append(c)
|
| 28 |
+
return cols
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _sort_and_pack_by_qid(
|
| 32 |
+
X: pd.DataFrame, y: pd.Series, qid: pd.Series, feature_cols: List[str]
|
| 33 |
+
) -> Tuple[pd.DataFrame, np.ndarray, List[int], np.ndarray]:
|
| 34 |
+
"""
|
| 35 |
+
Sort rows by qid so that group sizes match the sample order.
|
| 36 |
+
Returns:
|
| 37 |
+
X_sorted, y_sorted, groups, qid_sorted (aligned with X_sorted/y_sorted)
|
| 38 |
+
"""
|
| 39 |
+
packed = X.copy()
|
| 40 |
+
packed["_label"] = y.values
|
| 41 |
+
packed["_qid"] = qid.values
|
| 42 |
+
packed = packed.sort_values("_qid").reset_index(drop=True)
|
| 43 |
+
|
| 44 |
+
groups = packed.groupby("_qid").size().tolist()
|
| 45 |
+
X_sorted = packed[feature_cols].copy()
|
| 46 |
+
y_sorted = packed["_label"].astype(float).values
|
| 47 |
+
qid_sorted = packed["_qid"].values
|
| 48 |
+
return X_sorted, y_sorted, groups, qid_sorted
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _eval_mean_ndcg(
    model: XGBRanker,
    X_val: pd.DataFrame,
    y_val,  # can be np.ndarray or pd.Series
    qid_val,  # aligned with X_val/y_val
    ks: Sequence[int] = (5, 10),
) -> dict:
    """
    Compute mean NDCG@k for each k in ks over validation queries.
    Accepts numpy arrays or pandas Series.
    """
    # Respect the early-stopping best iteration when available (xgboost>=2.0);
    # otherwise fall back to a plain predict over all trees.
    try:
        preds = model.predict(X_val, iteration_range=(0, model.best_iteration + 1))
    except Exception:
        preds = model.predict(X_val)

    labels = np.asarray(y_val)
    queries = np.asarray(qid_val)

    scores = {}
    for k in ks:
        per_query = []
        for q in np.unique(queries):
            mask = (queries == q)
            # NDCG is undefined for single-document groups; skip them.
            if mask.sum() >= 2:
                per_query.append(ndcg_score([labels[mask]], [preds[mask]], k=k))
        scores[f"NDCG@{k}"] = float(np.mean(per_query)) if per_query else 0.0
    return scores
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ----------------------------- Main Trainer -----------------------------
|
| 85 |
+
def train_model_ranker(
    user_id: str = "user_1",
    features_path: Optional[str] = None,
    save_model: bool = True,
    model_params: Optional[dict] = None,
    val_ratio: float = 0.2,
    random_state: int = 42,
    max_rows: Optional[int] = None,
):
    """
    Train an XGBoost Learning-to-Rank model (XGBRanker) on cold-start generated data.

    Expected input CSV (from cold_start.py):
      - qid: query id (one round of pantry sampling = one query)
      - relevance: graded relevance label (e.g., 3/2/1/0)
      - features: numeric columns produced by build_features (and any extra numeric signals)

    Steps:
      1) Read the CSV (optionally subsampled to max_rows)
      2) Select numeric feature columns robustly (FEATURE_COLS, missing ones filled with 0)
      3) Split train/val by qid to avoid leakage across queries
      4) Sort each split by qid and build group sizes aligned to sample order
      5) Train XGBRanker and report NDCG@5/10
      6) Append metrics to user_data/<user_id>/training_log.txt and, when
         save_model is True, save the model to user_data/<user_id>/ranker.pkl

    Args:
        user_id: User folder under recipe_recommendation/user_data/.
        features_path: CSV path; defaults to <base_dir>/user_features_rank.csv.
        save_model: Persist the trained model to ranker.pkl when True.
        model_params: Overrides merged on top of the default XGBRanker params.
        val_ratio: Fraction of qids held out for validation.
        random_state: Seed for sampling/splitting/model.
        max_rows: If set, subsample the dataframe down to this many rows.

    Returns:
        (model, metrics, feature_cols) — the fitted XGBRanker, a dict of
        mean NDCG@k values, and the feature column list actually used.

    Raises:
        FileNotFoundError: When the cold-start features CSV is missing.
        ValueError: When the CSV lacks 'qid' or 'relevance' columns.
    """
    base_dir = os.path.join("recipe_recommendation", "user_data", user_id)
    os.makedirs(base_dir, exist_ok=True)

    # Resolve features path
    if features_path is None:
        features_path = os.path.join(base_dir, "user_features_rank.csv")
    if not os.path.exists(features_path):
        raise FileNotFoundError(
            f"[train_model_ranker] Cold-start features not found at: {features_path}\n"
            f"Please run cold_start_ranker(user_id='{user_id}') first."
        )

    # Load data (optionally capped for quick iterations)
    df = pd.read_csv(features_path)
    if max_rows is not None and len(df) > max_rows:
        df = df.sample(max_rows, random_state=random_state).reset_index(drop=True)

    # Basic validation
    if "qid" not in df.columns or "relevance" not in df.columns:
        raise ValueError("Input CSV must contain 'qid' and 'relevance' columns.")

    # Fill NaNs in label/qid (should not happen, but defensive)
    df["qid"] = pd.to_numeric(df["qid"], errors="coerce").fillna(-1).astype(int)
    df["relevance"] = pd.to_numeric(df["relevance"], errors="coerce").fillna(0).astype(float)

    # Pick numeric feature columns robustly; missing columns are created as 0
    feature_cols = FEATURE_COLS.copy()
    df = df.reindex(columns=["qid", "relevance"] + feature_cols, fill_value=0)

    # Ensure numeric + finite values only (replace inf/nan with 0)
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)

    # Split by qid to avoid leakage across queries
    unique_qids = df["qid"].unique()
    if len(unique_qids) < 2:
        warnings.warn("Only one unique qid found — ranking training may be ineffective.")
        train_mask = np.ones(len(df), dtype=bool)
        val_mask = np.zeros(len(df), dtype=bool)
    else:
        train_qids, val_qids = train_test_split(
            unique_qids, test_size=val_ratio, random_state=random_state
        )
        train_mask = df["qid"].isin(train_qids)
        val_mask = df["qid"].isin(val_qids)

    # Split dataframes AFTER defining masks
    X_train_raw = df.loc[train_mask, feature_cols]
    y_train_raw = df.loc[train_mask, "relevance"]
    qid_train = df.loc[train_mask, "qid"]

    X_val_raw = df.loc[val_mask, feature_cols]
    y_val_raw = df.loc[val_mask, "relevance"]
    qid_val = df.loc[val_mask, "qid"]

    # Sort by qid and build group sizes aligned with sample order (CRITICAL for XGBRanker)
    X_train, y_train, group_train, _ = _sort_and_pack_by_qid(
        X_train_raw, y_train_raw, qid_train, feature_cols
    )
    X_val, y_val, group_val, qid_val_sorted = _sort_and_pack_by_qid(
        X_val_raw, y_val_raw, qid_val, feature_cols
    )

    print(f"[ranker] #Train groups: {len(group_train)} | #Val groups: {len(group_val)}")
    print(f"[ranker] Train rows: {len(X_train)} | Val rows: {len(X_val)} | #Features: {len(feature_cols)}")

    # Default model params (override via model_params)
    default_params = dict(
        objective="rank:ndcg",
        eval_metric="ndcg",
        n_estimators=400,
        learning_rate=0.08,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=random_state,
        tree_method="hist",
        reg_lambda=1.0,
        reg_alpha=0.0,
    )
    if model_params:
        default_params.update(model_params)

    model = XGBRanker(**default_params)

    # Fit model (XGBRanker requires group sizes for eval_set as well).
    # When the validation split is empty (single-qid fallback above), skip the
    # eval_set entirely — an empty group list would break xgboost's fit.
    fit_kwargs = dict(X=X_train, y=y_train, group=group_train, verbose=False)
    has_val = len(X_val) > 0
    if has_val:
        fit_kwargs["eval_set"] = [(X_val, y_val)]
        fit_kwargs["eval_group"] = [group_val]

    try:
        # Newer xgboost versions (some builds) support early_stopping_rounds on Ranker
        model.fit(early_stopping_rounds=50, **fit_kwargs)  # maximize is inferred by 'ndcg'
    except TypeError:
        # Fallback to callback API (older versions)
        try:
            from xgboost.callback import EarlyStopping
            model.fit(callbacks=[EarlyStopping(rounds=50, save_best=True, maximize=True)], **fit_kwargs)
        except Exception:
            # Last resort: train without early stopping
            model.fit(**fit_kwargs)

    # Evaluate mean NDCG@5/10 (once) — all-zero metrics when there is no val split
    if has_val:
        metrics = _eval_mean_ndcg(model, X_val, y_val, qid_val_sorted, ks=(5, 10))
    else:
        metrics = {"NDCG@5": 0.0, "NDCG@10": 0.0}
    print("[ranker] Validation metrics:", " ".join(f"{k}={v:.4f}" for k, v in metrics.items()))

    # Append NDCG metrics to the per-user training log
    from datetime import datetime
    log_path = os.path.join(base_dir, "training_log.txt")
    with open(log_path, "a", encoding="utf-8") as f:
        ndcg5 = metrics.get("NDCG@5", 0.0)
        ndcg10 = metrics.get("NDCG@10", 0.0)
        f.write(f"{datetime.now().isoformat()} | NDCG@5={ndcg5:.4f}, NDCG@10={ndcg10:.4f}\n")
    print(f"[ranker] Logged metrics to {log_path}")

    # Save model only when requested (previously it was saved unconditionally
    # AND a second time under the flag — save_model=False was ignored)
    if save_model:
        model_path = os.path.join(base_dir, "ranker.pkl")
        joblib.dump(model, model_path)
        print(f"[ranker] Model saved to {model_path}")

    return model, metrics, feature_cols
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
    # Smoke-run the trainer for the demo user with default settings.
    demo_config = dict(
        user_id="user_1",
        save_model=True,
        val_ratio=0.2,
        random_state=42,
        max_rows=None,      # set an upper bound (e.g. 200_000) for quick iterations
        model_params=None,  # pass a dict here to override the default hyperparameters
    )
    train_model_ranker(**demo_config)
|