"""DataDrive cars recommender: a small Gradio app.

Two tabs are exposed:

* "Search + Recommend" finds the first dataset row whose name, model or make
  contains the query and lists the rows whose text embeddings are most
  similar to it (cosine similarity).
* "Brand Compare" shows per-brand averages for the selected makes.

File locations can be overridden with the DATASET_CSV, EMB_PATH and ID_PATH
environment variables.
"""
import os
from functools import lru_cache

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")
ID_PATH = os.getenv("ID_PATH", "ids.csv")

# Loaded once at import time; every function below reads this frame.
DF = pd.read_csv(DATASET_CSV)

# Brands pre-selected in the "Brand Compare" tab (when present in the data).
DEFAULT_BRANDS = ["Toyota", "Honda", "BMW"]

# Columns searched by the free-text query.
_SEARCH_COLUMNS = ("name", "model", "make")

# Columns shown in the recommendations table.
_RECOMMENDATION_COLUMNS = [
    "name", "make", "model", "trim", "year", "origin_country", "body_type",
    "fuel", "engine_type", "horsepower", "popularity_score", "comfort_score",
]


@lru_cache(maxsize=1)
def ensure_embeddings():
    """Return the embedding matrix for ``DF``, building it on first use.

    If either ``EMB_PATH`` or ``ID_PATH`` is missing, the ``text_record``
    column is encoded with ``sentence-transformers/all-MiniLM-L6-v2`` and the
    result is written to ``EMB_PATH`` (plus an id table at ``ID_PATH``).

    The loaded array is cached for the life of the process so that each
    search does not re-read the file from disk.

    Returns:
        The array stored at ``EMB_PATH``, one row per row of ``DF``.

    Raises:
        RuntimeError: if the embeddings are missing and cannot be built, or
            if the stored matrix does not have one row per dataset row.
    """
    if not (os.path.exists(EMB_PATH) and os.path.exists(ID_PATH)):
        try:
            # Imported lazily: only needed when the embeddings must be built.
            from sentence_transformers import SentenceTransformer

            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            embs = model.encode(
                DF["text_record"].astype(str).tolist(),
                batch_size=256,
                show_progress_bar=True,
                normalize_embeddings=True,
            )
            embs = np.asarray(embs, dtype="float32")
            np.save(EMB_PATH, embs)
            DF[["name", "make", "model", "trim", "year"]].to_csv(ID_PATH, index=False)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(
                f"Embeddings not found and auto-build failed: {e}"
            ) from e

    embs = np.load(EMB_PATH)
    # Rows of the matrix are addressed by dataset position, so a stale file
    # would silently pair cars with the wrong vectors (or index out of range).
    if embs.shape[0] != len(DF):
        raise RuntimeError(
            f"Embeddings at {EMB_PATH} have {embs.shape[0]} rows but the "
            f"dataset has {len(DF)}; delete the file to rebuild it."
        )
    return embs


def _format_row(row):
    """Render one dataset row as a Markdown summary card."""
    return (
        f"**{row['name']}** \n"
        f"- Origin: {row['origin_country']} \n"
        f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']} \n"
        f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']} \n"
        f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']} \n"
        f"- MSRP (USD): ${int(row['msrp_usd']):,} \n"
        f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10 \n"
    )


def _column_contains(column, needle):
    """Return a boolean Series: does ``DF[column]`` contain ``needle``?

    The match is a case-insensitive *literal* substring test. Missing cells
    are treated as empty strings so they never match and never yield NaN.
    """
    haystack = DF[column].fillna("").astype(str).str.lower()
    # regex=False: the query is user input, not a pattern ("c++", "(" ...).
    return haystack.str.contains(needle, regex=False)


def search_and_recommend(query, k=3):
    """Find the first car matching ``query`` and its ``k`` nearest neighbours.

    Args:
        query: Free text matched against the name, model and make columns.
        k: Number of recommendations to return.

    Returns:
        A 3-tuple ``(anchor_markdown, recommendations, note)``. When the
        query is blank or nothing matches, the first element is a message
        and the other two are ``None``.
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None

    k = int(k)  # UI sliders may deliver a float; slicing needs an int.
    needle = query.strip().lower()

    mask = _column_contains(_SEARCH_COLUMNS[0], needle)
    for column in _SEARCH_COLUMNS[1:]:
        mask = mask | _column_contains(column, needle)
    if not mask.any():
        return f"No match found for: {query}", None, None

    # Use the row *position* throughout: the embedding matrix and DF.iloc
    # are both positional, whereas DF.index holds labels.
    pos = int(np.flatnonzero(mask.to_numpy())[0])
    anchor = DF.iloc[pos]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[pos:pos + 1], embs)[0]
    sims[pos] = -1  # never recommend the anchor itself
    top_idx = sims.argsort()[::-1][:k]

    recs_df = DF.iloc[top_idx][_RECOMMENDATION_COLUMNS].copy()
    recs_df["similarity"] = (sims[top_idx] * 100).round(1)

    note = f"Top {k} similar results shown (by cosine similarity on text embeddings)."
    return _format_row(anchor), recs_df, note


def brand_compare(brands):
    """Aggregate per-brand statistics for the selected makes.

    Args:
        brands: Iterable of make names to include.

    Returns:
        A DataFrame with one row per (make, origin_country) pair holding the
        sample count and rounded averages, or ``None`` when ``brands`` is
        empty.
    """
    if not brands:
        return None

    selected = DF[DF["make"].isin(brands)]
    out = (
        selected.groupby(["make", "origin_country"])
        .agg(
            samples=("name", "count"),
            avg_popularity=("popularity_score", "mean"),
            avg_comfort=("comfort_score", "mean"),
            avg_hp=("horsepower", "mean"),
            avg_msrp=("msrp_usd", "mean"),
        )
        .reset_index()
    )
    for column in ("avg_popularity", "avg_comfort"):
        out[column] = out[column].round(1)
    for column in ("avg_hp", "avg_msrp"):
        out[column] = out[column].round(0).astype(int)
    return out


with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")

    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")
        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()
        # The button and pressing Enter in the textbox trigger the same search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        all_brands = sorted(DF["make"].unique().tolist())
        # Only pre-select brands that actually exist in the loaded dataset.
        default_brands = [b for b in DEFAULT_BRANDS if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # Seed the table through the constructor rather than by assigning
        # to ``.value`` after the component has been created.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

if __name__ == "__main__":
    demo.launch()