Iris314's picture
Upload 8 files
b9a4372 verified
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
def print_candidates(candidates, user_parents, topk=10):
    """Print a human-readable summary of the first ``topk`` candidate rows.

    For each row this prints the name, a percentage score, the optional
    region and cuisine attributes, the calories and the ingredient lists.
    Staple, main and other ingredients are prefixed with a check mark when
    they are found in ``user_parents`` and with a cross otherwise; seasoning
    ingredients are printed unmarked.

    Args:
        candidates: pandas DataFrame with at least the ``match_score`` and
            ``name`` columns. Rows are printed in their existing order
            (presumably already sorted by score -- verify against caller).
        user_parents: container of ingredient names the user has; only
            membership tests (``in``) are performed on it.
        topk: maximum number of rows to print.

    Returns:
        None. All output goes to stdout.
    """

    def mark_list(ingredients):
        # Defined once (not per row). A bare string is one ingredient, not a
        # sequence of characters; missing cells (None / NaN / pd.NA) and any
        # other non-iterable value are treated as "no ingredients".
        if isinstance(ingredients, str):
            ingredients = [ingredients]
        try:
            items = list(ingredients)
        except TypeError:
            return []
        return [f"{'✅' if ing in user_parents else '❌'} {ing}" for ing in items]

    # Scores are shown relative to the best score among *all* candidates;
    # the small epsilon avoids a division by zero when that maximum is 0.
    max_score = candidates['match_score'].max()

    for _, row in candidates.head(topk).iterrows():
        scaled_score = 100 * row['match_score'] / (max_score + 1e-9)
        print(f"{row['name']} (score {scaled_score:.1f}%)")

        # ----- Region -----
        # The type check must come first: calling pd.notna() on a list-like
        # value yields an array whose truth value is ambiguous. A str is
        # never NaN/None, so no separate missing-value test is needed.
        region = row.get("region", None)
        if isinstance(region, str) and region.strip() and region.lower() != "unavailable":
            print(f" region: {region}")

        # ----- Cuisine Attributes -----
        cuisine = row.get("cuisine_attr", None)
        if isinstance(cuisine, set):
            # Sets have no stable order; sort so the output is reproducible.
            cuisine = sorted(cuisine, key=str)
        elif isinstance(cuisine, str):
            cuisine = [cuisine]
        # Anything that is not a non-empty list by now (None, NaN, ...) is skipped.
        if isinstance(cuisine, list) and cuisine:
            print(f" cuisine: {', '.join(str(item) for item in cuisine)}")

        # ----- Nutrition -----
        print(f" calories: {row.get('calories', 'N/A')}")

        # ----- Ingredient Marking -----
        print(f" staple: {mark_list(row.get('staple_parent', []))}")
        print(f" main: {mark_list(row.get('main_parent', []))}")
        # Seasonings are printed as stored, without availability marks.
        print(f" seasoning: {row.get('seasoning_parent', [])}")
        print(f" other: {mark_list(row.get('other_parent', []))}")
        print("-" * 40)
def diversify_topk_with_min_clusters(
    ranked_candidates,
    feature_matrix,
    top_k=5,
    n_clusters=20,
    min_clusters=3,
    random_state=42
):
    """
    Diversify the top-k displayed recipes using KMeans clustering.

    The feature rows are standardized and clustered. Walking the candidates
    in their given order, the first member of each not-yet-seen cluster is
    picked until ``min_clusters`` distinct clusters are represented (or
    ``top_k`` items are picked). Remaining slots are then filled with the
    earliest candidates that were not picked yet.

    The ``min_clusters`` target is best-effort: it cannot be met when fewer
    clusters exist or when ``top_k`` is smaller than ``min_clusters``.

    Args:
        ranked_candidates: indexable sequence of candidates, best first.
            Row ``i`` of ``feature_matrix`` is taken to describe
            ``ranked_candidates[i]``.
        feature_matrix: 2-D numeric features accepted by ``StandardScaler``.
            If its row count differs from the number of candidates, only the
            leading positions present in both are considered.
        top_k: maximum number of candidates to return.
        n_clusters: requested number of KMeans clusters; capped at the
            number of usable rows.
        min_clusters: number of distinct clusters to aim for in the result.
        random_state: seed forwarded to ``KMeans``.

    Returns:
        A list of at most ``top_k`` candidates: the cluster-diverse picks
        first, followed by the rank-order fill. Empty when there are no
        candidates or ``top_k`` is not positive.
    """
    n_candidates = len(ranked_candidates)
    # Nothing to select: return before doing any clustering work.
    if n_candidates == 0 or top_k <= 0:
        return []

    X_scaled = StandardScaler().fit_transform(feature_matrix)

    # Only positions that have both a candidate and a feature row are usable;
    # KMeans also cannot build more clusters than there are samples.
    n_usable = min(n_candidates, X_scaled.shape[0])
    n_clusters = min(n_clusters, n_usable)

    # KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    cluster_ids = kmeans.fit_predict(X_scaled)

    # Selection is tracked by position rather than by candidate equality so
    # that unhashable / array-valued candidates work and equal-looking
    # candidates at different ranks are not silently merged.
    picked_positions = []
    picked_clusters = set()

    # Step 1: pick candidates from distinct clusters until min_clusters reached
    for pos in range(n_usable):
        label = int(cluster_ids[pos])
        if label not in picked_clusters:
            picked_positions.append(pos)
            picked_clusters.add(label)
        if len(picked_clusters) >= min_clusters or len(picked_positions) >= top_k:
            break

    # Step 2: fill the rest purely by rank order
    if len(picked_positions) < top_k:
        already_picked = set(picked_positions)
        for pos in range(n_usable):
            if pos not in already_picked:
                picked_positions.append(pos)
                already_picked.add(pos)
            if len(picked_positions) >= top_k:
                break

    return [ranked_candidates[pos] for pos in picked_positions]