Spaces:
Sleeping
Sleeping
import functools
import os

import gradio as gr
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# File locations; each one can be overridden through the environment variable
# of the same name.
DATASET_CSV = os.environ.get("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.environ.get("EMB_PATH", "embeddings.npy")
ID_PATH = os.environ.get("ID_PATH", "ids.csv")

# The dataset is read once, at import time, into a module-level frame.
DF = pd.read_csv(DATASET_CSV)
@functools.lru_cache(maxsize=1)
def ensure_embeddings():
    """Return the embedding matrix for ``DF``, building it when necessary.

    The matrix is read from ``EMB_PATH`` when both ``EMB_PATH`` and
    ``ID_PATH`` exist and the stored matrix has one row per row of ``DF``.
    Otherwise the ``text_record`` column is encoded with the
    ``all-MiniLM-L6-v2`` sentence-transformer, the result is saved to
    ``EMB_PATH`` and the identifying columns are written to ``ID_PATH``.

    The result is cached for the lifetime of the process so that repeated
    calls do not re-read the file; callers must treat the returned array as
    read-only.

    Returns:
        A NumPy array whose i-th row is the embedding of the i-th row of
        ``DF``.

    Raises:
        RuntimeError: if no usable embeddings exist and building them fails
            (the original error is attached as ``__cause__``).
    """
    if os.path.exists(EMB_PATH) and os.path.exists(ID_PATH):
        cached = np.load(EMB_PATH)
        # Rows are matched to DF by position, so a matrix built from a
        # different dataset size cannot be used and is rebuilt below.
        if cached.shape[0] == len(DF):
            return cached

    try:
        # Imported lazily: the model library is only needed for a rebuild.
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        embs = model.encode(
            DF["text_record"].astype(str).tolist(),
            batch_size=256,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        embs = np.asarray(embs, dtype="float32")
        np.save(EMB_PATH, embs)
        DF[["name", "make", "model", "trim", "year"]].to_csv(ID_PATH, index=False)
    except Exception as e:
        raise RuntimeError(f"Embeddings not found and auto-build failed: {e}") from e
    return embs
def _format_row(row):
    """Render one dataset row as a Markdown summary card.

    ``row`` is any mapping-like object (for example a DataFrame row) that
    provides the keys read below. Horsepower, seats, MSRP and both scores
    are truncated to integers for display.
    """
    pieces = []

    # Headline and origin.
    pieces.append(f"**{row['name']}** \n")
    pieces.append(f"- Origin: {row['origin_country']} \n")

    # Mechanical description.
    pieces.append(
        f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']} \n"
    )
    pieces.append(
        f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']} \n"
    )

    # Numeric facts.
    pieces.append(
        f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']} \n"
    )
    pieces.append(f"- MSRP (USD): ${int(row['msrp_usd']):,} \n")
    pieces.append(
        f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10 \n"
    )

    return "".join(pieces)
def search_and_recommend(query, k=3):
    """Find the first car matching ``query`` and recommend similar cars.

    The query is matched as a case-insensitive literal substring against the
    ``name``, ``model`` and ``make`` columns. The first matching row becomes
    the anchor; every other row is ranked by cosine similarity between its
    embedding and the anchor's embedding.

    Args:
        query: Free text typed by the user.
        k: Number of recommendations to return. Coerced to a non-negative
            int because it may arrive as a float (e.g. from a UI slider).

    Returns:
        A 3-tuple ``(anchor_markdown, recommendations, note)``. When the
        query is blank or nothing matches, the first element is a message
        and the other two are ``None``.
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None

    k = max(int(k), 0)
    q = query.strip().lower()

    def _matches(column):
        # regex=False: user text is matched literally, so input such as "("
        # or "c++" cannot raise a regex error. na=False: missing cells count
        # as "no match" instead of putting NaN into the boolean mask.
        return DF[column].str.lower().str.contains(q, regex=False, na=False)

    mask = _matches("name") | _matches("model") | _matches("make")
    if not mask.any():
        return f"No match found for: {query}", None, None

    # Embedding rows are aligned with DF by position, so use a positional
    # index throughout rather than an index label.
    pos = int(np.flatnonzero(mask.to_numpy())[0])
    anchor = DF.iloc[pos]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[pos:pos + 1], embs)[0]

    # Rank best-first and drop the anchor itself explicitly; a sentinel score
    # would not be safe because -1 is a legitimate cosine similarity.
    ranked = sims.argsort()[::-1]
    top_idx = ranked[ranked != pos][:k]

    recs_df = DF.iloc[top_idx][[
        "name", "make", "model", "trim", "year", "origin_country", "body_type",
        "fuel", "engine_type", "horsepower", "popularity_score", "comfort_score",
    ]].copy()
    # Similarity is reported as a percentage with one decimal place.
    recs_df["similarity"] = (sims[top_idx] * 100).round(1)

    return (
        _format_row(anchor),
        recs_df,
        f"Top {k} similar results shown (by cosine similarity on text embeddings).",
    )
def brand_compare(brands):
    """Summarise the selected brands, one row per (make, origin country).

    Returns ``None`` when ``brands`` is empty. Otherwise returns a DataFrame
    with the sample count and the mean popularity, comfort, horsepower and
    MSRP of each group. Scores are rounded to one decimal; horsepower and
    MSRP are rounded to whole integers.
    """
    if not brands:
        return None

    chosen = DF.loc[DF["make"].isin(brands)].copy()
    per_brand = chosen.groupby(["make", "origin_country"])
    summary = per_brand.agg(
        samples=("name", "count"),
        avg_popularity=("popularity_score", "mean"),
        avg_comfort=("comfort_score", "mean"),
        avg_hp=("horsepower", "mean"),
        avg_msrp=("msrp_usd", "mean"),
    ).reset_index()

    # Scores keep one decimal place.
    for score_col in ("avg_popularity", "avg_comfort"):
        summary[score_col] = summary[score_col].round(1)

    # Horsepower and price are shown as whole numbers.
    for whole_col in ("avg_hp", "avg_msrp"):
        summary[whole_col] = summary[whole_col].round(0).astype(int)

    return summary
# Two-tab UI: free-text search with recommendations, and a per-brand summary.
with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")

    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")
        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()
        # The button and pressing Enter in the textbox run the same search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        # dropna(): a missing make would otherwise make sorted() compare a
        # float NaN with strings and fail.
        all_brands = sorted(DF["make"].dropna().unique().tolist())
        # Pre-select only brands that actually exist in the dataset, so the
        # initial value is always a subset of the offered choices.
        default_brands = [b for b in ("Toyota", "Honda", "BMW") if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # The initial table goes through the constructor's `value` argument
        # rather than being assigned to the attribute after creation.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

demo.launch()