cheat-clean / app.py
miazaitman's picture
Update app.py
e860f71 verified
# Trigger rebuild
import os, pathlib, numpy as np, pandas as pd, gradio as gr
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
# --- CONFIG ---
# Hugging Face dataset repo and the CSV file inside it.
HF_DATASET_REPO = "miazaitman/CheatClean"
HF_DATASET_FILE = "CheatClean Data set.csv"  # keep spaces

# Local cache directory for the downloaded CSV; created on import if absent.
DATA_DIR = pathlib.Path("./data")
DATA_DIR.mkdir(exist_ok=True)
DATA_LOCAL = DATA_DIR / HF_DATASET_FILE

# Sentence-transformers checkpoint used to embed food names and queries.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# --- Load dataset ---
def load_dataset():
    """Return the CheatClean CSV as a DataFrame, downloading it on first use.

    The file is fetched from the Hugging Face dataset repo into ``DATA_DIR``
    only when ``DATA_LOCAL`` does not exist yet; afterwards the local copy is
    read directly.

    Returns:
        The CSV contents with rows lacking an ``Unhealthy_Food`` value dropped
        and the index renumbered from zero.

    Raises:
        ValueError: if any expected column is absent from the CSV.
    """
    already_cached = DATA_LOCAL.exists()
    if not already_cached:
        hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=HF_DATASET_FILE,
            repo_type="dataset",
            local_dir=str(DATA_DIR),
            local_dir_use_symlinks=False,
        )

    frame = pd.read_csv(DATA_LOCAL)

    # One key column plus the same five fields for each of the three alternatives.
    per_alt_fields = (
        "Name",
        "Description",
        "Estimated_Calorie_Delta_kcal",
        "Macro_Delta",
        "Tip",
    )
    required = ["Unhealthy_Food"]
    for rank in (1, 2, 3):
        required.extend(f"Alt{rank}_{field}" for field in per_alt_fields)

    present = frame.columns
    missing = [name for name in required if name not in present]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    with_food_name = frame.dropna(subset=["Unhealthy_Food"])
    return with_food_name.reset_index(drop=True)
# --- Embeddings (no FAISS) ---
def build_embeddings(texts):
    """Embed ``texts`` and scale every row to unit length.

    Loads the sentence-transformer named by ``EMBED_MODEL_NAME``, encodes the
    texts into a NumPy array, and L2-normalizes each row so that a dot product
    between rows equals their cosine similarity.

    Args:
        texts: iterable of strings to embed.

    Returns:
        A ``(model, embeddings)`` tuple: the loaded model (reused later to
        encode queries) and the row-normalized embedding array.
    """
    encoder = SentenceTransformer(EMBED_MODEL_NAME)
    raw_vectors = encoder.encode(
        list(texts),
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    # The small epsilon keeps the division finite for an all-zero vector.
    row_lengths = np.linalg.norm(raw_vectors, axis=1, keepdims=True) + 1e-12
    unit_vectors = raw_vectors / row_lengths
    return encoder, unit_vectors
def cosine_top_row(query, model, embs):
    """Return the index of the row in ``embs`` most similar to ``query``.

    Args:
        query: free-text search string. ``None``, empty, or whitespace-only
            input yields ``None``.
        model: object whose ``encode(list_of_str, convert_to_numpy=True)``
            returns a 2-D array with one row per input string.
        embs: 2-D array of row embeddings. The dot product below is a cosine
            similarity only if these rows are already L2-normalized.

    Returns:
        The integer position of the best-scoring row, or ``None`` when the
        query is blank or ``embs`` contains no rows.
    """
    if not query or not query.strip():
        return None
    # np.argmax raises ValueError on an empty array, so an empty corpus is
    # reported the same way as a blank query. len() is used because the truth
    # value of a NumPy array is ambiguous.
    if len(embs) == 0:
        return None

    q_emb = model.encode([query.strip()], convert_to_numpy=True)
    q_emb = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-12)
    scores = embs @ q_emb[0]  # shape (N,)
    return int(np.argmax(scores))
def to_three_alternatives(row):
    """Reshape one dataset row into three display records, ranked 1 to 3.

    Args:
        row: mapping (e.g. a pandas Series or dict) holding the
            ``Alt{1,2,3}_Name``, ``_Description``,
            ``_Estimated_Calorie_Delta_kcal``, ``_Macro_Delta`` and ``_Tip``
            entries.

    Returns:
        A list of three dicts, each with the keys ``Rank``,
        ``Healthier Alternative``, ``Description``,
        ``Calorie/Nutrient Difference`` and ``Tip``.
    """

    def _record(rank):
        # Look up one "Alt<rank>_<suffix>" entry of the row.
        def col(suffix):
            return row[f"Alt{rank}_{suffix}"]

        return {
            "Rank": rank,
            "Healthier Alternative": col("Name"),
            "Description": col("Description"),
            "Calorie/Nutrient Difference": f'{col("Estimated_Calorie_Delta_kcal")} kcal; {col("Macro_Delta")}',
            "Tip": col("Tip"),
        }

    return [_record(rank) for rank in (1, 2, 3)]
# --- UI ---
def search_ui(user_food):
    """Gradio callback: look up the closest dataset row and return its alternatives.

    Reads the module-level ``model``, ``embs`` and ``df``.

    Args:
        user_food: text typed into the input box.

    Returns:
        A ``(markdown, table)`` pair. ``markdown`` echoes the input; ``table``
        is a three-row DataFrame of alternatives, or ``None`` when
        ``cosine_top_row`` reports no match.
    """
    match_idx = cosine_top_row(user_food, model, embs)
    if match_idx is not None:
        matched_row = df.iloc[match_idx]
        column_order = [
            "Rank",
            "Healthier Alternative",
            "Description",
            "Calorie/Nutrient Difference",
            "Tip",
        ]
        alternatives = pd.DataFrame(
            to_three_alternatives(matched_row),
            columns=column_order,
        )
        return f"**You entered:** _{user_food}_", alternatives

    # No usable query: echo the input and leave the table empty.
    return f"**You entered:** _{user_food}_\n\nNo matches found.", None
def build_interface():
    """Assemble and return the Gradio Blocks app.

    Layout: a narrow left column with the text input, the search button and
    clickable examples; a wider right column with the echoed query and a
    fixed three-row results table. Clicking the button and submitting the
    textbox both invoke ``search_ui``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    result_headers = [
        "Rank",
        "Healthier Alternative",
        "Description",
        "Calorie/Nutrient Difference",
        "Tip",
    ]
    sample_foods = (
        "Hamburger",
        "Cheeseburger",
        "Pepperoni Pizza",
        "Fried Chicken Sandwich",
        "Nachos",
        "Mac and Cheese",
    )
    # gr.Examples expects one inner list per example row.
    examples = [[food] for food in sample_foods]

    with gr.Blocks(title="Healthy Food Alternatives") as demo:
        gr.Markdown("# 🥗 Healthy Food Alternatives\nType a food you like to see healthier options.")
        with gr.Row():
            with gr.Column(scale=1):
                food_box = gr.Textbox(
                    label="Enter a food you like",
                    placeholder="e.g., Hamburger",
                )
                find_btn = gr.Button("Find Healthier Alternatives", variant="primary")
                gr.Examples(examples=examples, inputs=food_box, label="Try one")
            with gr.Column(scale=2):
                echo_md = gr.Markdown()
                results = gr.Dataframe(
                    headers=result_headers,
                    row_count=(3, "fixed"),
                    wrap=True,
                )
        # The button click and pressing Enter in the textbox run the same search.
        for register in (find_btn.click, food_box.submit):
            register(search_ui, inputs=food_box, outputs=[echo_md, results])
    return demo
# --- Boot ---
# Load the dataset and embed its "Unhealthy_Food" column once, at import time.
# df, model and embs are module-level globals read by search_ui and quick_eval.
df = load_dataset()
# astype(str) makes sure every entry handed to the encoder is a string.
model, embs = build_embeddings(df["Unhealthy_Food"].astype(str).tolist())
def quick_eval(samples=("Hamburger","Nachos","Pepperoni Pizza")):
    """Print the top-1 matched row and its three alternatives for each sample.

    A smoke check that reads the module-level ``model``, ``embs`` and ``df``
    and writes to stdout; it returns nothing.

    Args:
        samples: iterable of query strings to look up.
    """
    print("=== Quick Eval (cosine top-1 row -> 3 alts) ===")
    for s in samples:
        idx = cosine_top_row(s, model, embs)
        if idx is None:
            # cosine_top_row signals "no match" with None (e.g. a blank
            # sample); report it instead of passing None to df.iloc.
            print(f"\nQuery: {s} -> No match")
            continue
        r = df.iloc[idx]
        print(f"\nQuery: {s} -> Row match: {r['Unhealthy_Food']}")
        for x in to_three_alternatives(r):
            print(f" {x['Rank']}. {x['Healthier Alternative']} | {x['Calorie/Nutrient Difference']}")
# Run the smoke check once at import so the sample matches are printed to stdout.
quick_eval()
# Build the UI at module level so `app` exists whether or not this file is
# executed as a script.
app = build_interface()
# Start the Gradio server only when this file is run directly.
if __name__ == "__main__":
    app.launch()