# Trigger rebuild
import os
import pathlib

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

# --- CONFIG ---
HF_DATASET_REPO = "miazaitman/CheatClean"
HF_DATASET_FILE = "CheatClean Data set.csv"  # keep spaces
DATA_DIR = pathlib.Path("./data")
DATA_DIR.mkdir(exist_ok=True)
DATA_LOCAL = DATA_DIR / HF_DATASET_FILE
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Per-alternative column suffixes; the CSV stores them as Alt<rank>_<suffix>.
_ALT_RANKS = (1, 2, 3)
_ALT_FIELDS = (
    "Name",
    "Description",
    "Estimated_Calorie_Delta_kcal",
    "Macro_Delta",
    "Tip",
)

# Column order of the result table; shared by the DataFrame built in
# search_ui and the gr.Dataframe headers so the two cannot drift apart.
TABLE_COLUMNS = [
    "Rank",
    "Healthier Alternative",
    "Description",
    "Calorie/Nutrient Difference",
    "Tip",
]


# --- Load dataset ---
def load_dataset():
    """Load the food-alternatives CSV, downloading it on first use.

    The file is fetched from the Hugging Face dataset repo into DATA_DIR only
    when DATA_LOCAL does not exist yet.

    Returns:
        A DataFrame with rows lacking ``Unhealthy_Food`` dropped and the index
        reset.

    Raises:
        ValueError: If any required column is absent from the CSV.
    """
    if not DATA_LOCAL.exists():
        # NOTE(review): local_dir_use_symlinks appears to be deprecated in
        # recent huggingface_hub releases -- confirm against the pinned
        # version before removing it.
        hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=HF_DATASET_FILE,
            repo_type="dataset",
            local_dir=str(DATA_DIR),
            local_dir_use_symlinks=False,
        )
    df = pd.read_csv(DATA_LOCAL)

    needed = ["Unhealthy_Food"] + [
        f"Alt{rank}_{field}" for rank in _ALT_RANKS for field in _ALT_FIELDS
    ]
    missing = [c for c in needed if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    return df.dropna(subset=["Unhealthy_Food"]).reset_index(drop=True)


# --- Embeddings (no FAISS) ---
def build_embeddings(texts):
    """Encode *texts* with the sentence-transformer and L2-normalize the rows.

    Returns:
        A ``(model, embs)`` tuple: the loaded SentenceTransformer and the
        row-normalized embedding array for *texts*.
    """
    model = SentenceTransformer(EMBED_MODEL_NAME)
    embs = model.encode(list(texts), convert_to_numpy=True, show_progress_bar=True)
    # The small epsilon avoids division by zero for an all-zero embedding.
    norms = np.linalg.norm(embs, axis=1, keepdims=True) + 1e-12
    embs = embs / norms  # L2-normalize for cosine similarity
    return model, embs


def cosine_top_row(query, model, embs):
    """Return the index of the embedding row most similar to *query*.

    Args:
        query: Free-text query; ``None``, empty, or whitespace-only input
            yields no match.
        model: Encoder exposing ``encode(list_of_str, convert_to_numpy=True)``.
        embs: Row-normalized corpus embeddings.

    Returns:
        The integer row index with the highest cosine score, or ``None`` when
        the query is blank or the corpus is empty.
    """
    if not query or not query.strip():
        return None
    # np.argmax raises on an empty array, so report "no match" instead.
    if len(embs) == 0:
        return None
    q = query.strip()
    q_emb = model.encode([q], convert_to_numpy=True)
    q_emb = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-12)
    scores = embs @ q_emb.T  # shape (N,1)
    return int(np.argmax(scores[:, 0]))


def to_three_alternatives(row):
    """Convert one dataset row into three display records, ranked 1 to 3.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding the
            ``Alt<rank>_*`` columns.

    Returns:
        A list of three dicts keyed by the result-table column names.
    """
    alternatives = []
    for rank in _ALT_RANKS:
        prefix = f"Alt{rank}_"
        calories = row[prefix + "Estimated_Calorie_Delta_kcal"]
        macros = row[prefix + "Macro_Delta"]
        alternatives.append(
            {
                "Rank": rank,
                "Healthier Alternative": row[prefix + "Name"],
                "Description": row[prefix + "Description"],
                "Calorie/Nutrient Difference": f"{calories} kcal; {macros}",
                "Tip": row[prefix + "Tip"],
            }
        )
    return alternatives


# --- UI ---
def search_ui(user_food):
    """Gradio callback: look up healthier alternatives for *user_food*.

    Reads the module-level ``model``, ``embs`` and ``df`` created at boot.

    Returns:
        A ``(markdown, table)`` pair; ``table`` is ``None`` when no row
        matches.
    """
    idx = cosine_top_row(user_food, model, embs)
    if idx is None:
        return f"**You entered:** _{user_food}_\n\nNo matches found.", None
    row = df.iloc[idx]
    echoed = f"**You entered:** _{user_food}_"
    table = pd.DataFrame(to_three_alternatives(row), columns=TABLE_COLUMNS)
    return echoed, table


def build_interface():
    """Assemble and return the Gradio Blocks app (not yet launched)."""
    examples = [
        ["Hamburger"],
        ["Cheeseburger"],
        ["Pepperoni Pizza"],
        ["Fried Chicken Sandwich"],
        ["Nachos"],
        ["Mac and Cheese"],
    ]
    with gr.Blocks(title="Healthy Food Alternatives") as demo:
        gr.Markdown("# 🥗 Healthy Food Alternatives\nType a food you like to see healthier options.")
        with gr.Row():
            with gr.Column(scale=1):
                inp = gr.Textbox(label="Enter a food you like", placeholder="e.g., Hamburger")
                btn = gr.Button("Find Healthier Alternatives", variant="primary")
                gr.Examples(examples=examples, inputs=inp, label="Try one")
            with gr.Column(scale=2):
                echoed = gr.Markdown()
                table = gr.Dataframe(
                    headers=list(TABLE_COLUMNS),
                    row_count=(3, "fixed"),
                    wrap=True,
                )
        # Both the button and pressing Enter in the textbox run the search.
        btn.click(search_ui, inputs=inp, outputs=[echoed, table])
        inp.submit(search_ui, inputs=inp, outputs=[echoed, table])
    return demo


# --- Boot ---
df = load_dataset()
model, embs = build_embeddings(df["Unhealthy_Food"].astype(str).tolist())


def quick_eval(samples=("Hamburger", "Nachos", "Pepperoni Pizza")):
    """Print the top match and its three alternatives for each sample query.

    Samples that produce no match (blank query or empty corpus) are reported
    and skipped instead of crashing on ``df.iloc[None]``.
    """
    print("=== Quick Eval (cosine top-1 row -> 3 alts) ===")
    for s in samples:
        idx = cosine_top_row(s, model, embs)
        if idx is None:
            print(f"\nQuery: {s} -> no match")
            continue
        r = df.iloc[idx]
        print(f"\nQuery: {s} -> Row match: {r['Unhealthy_Food']}")
        for x in to_three_alternatives(r):
            print(f" {x['Rank']}. {x['Healthier Alternative']} | {x['Calorie/Nutrient Difference']}")


# call it once at startup
quick_eval()

app = build_interface()

if __name__ == "__main__":
    app.launch()