Spaces:
Sleeping
Sleeping
File size: 4,521 Bytes
2615cde |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
# File locations. Each one falls back to a default file name when the
# environment variable of the same name is not set.
DATASET_CSV = os.environ.get("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.environ.get("EMB_PATH", "embeddings.npy")
ID_PATH = os.environ.get("ID_PATH", "ids.csv")

# The dataset is read once, at import time, and shared by every function below.
DF = pd.read_csv(DATASET_CSV)
def ensure_embeddings():
    """Return the embedding matrix stored at ``EMB_PATH``, building it if needed.

    When either ``EMB_PATH`` or ``ID_PATH`` is missing, the ``text_record``
    column of ``DF`` is encoded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model (normalized embeddings),
    the result is saved as float32 to ``EMB_PATH``, and the identifying
    columns (name, make, model, trim, year) are written to ``ID_PATH``.

    Returns:
        The array loaded from ``EMB_PATH``.

    Raises:
        RuntimeError: If the files are missing and rebuilding them fails.
            The underlying exception is chained as ``__cause__``.
    """
    if not os.path.exists(EMB_PATH) or not os.path.exists(ID_PATH):
        try:
            # Imported here so sentence_transformers is only required when
            # the embeddings actually have to be rebuilt.
            from sentence_transformers import SentenceTransformer

            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            embs = model.encode(
                DF["text_record"].astype(str).tolist(),
                batch_size=256,
                show_progress_bar=True,
                normalize_embeddings=True,
            )
            embs = np.asarray(embs, dtype="float32")
            np.save(EMB_PATH, embs)
            DF[["name", "make", "model", "trim", "year"]].to_csv(ID_PATH, index=False)
        except Exception as e:
            # Any build failure is reported as one error type; "from e" keeps
            # the original traceback attached for debugging.
            raise RuntimeError(
                f"Embeddings not found and auto-build failed: {e}"
            ) from e
    return np.load(EMB_PATH)
def _format_row(row):
return (
f"**{row['name']}** \n"
f"- Origin: {row['origin_country']} \n"
f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']} \n"
f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']} \n"
f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']} \n"
f"- MSRP (USD): ${int(row['msrp_usd']):,} \n"
f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10 \n"
)
def search_and_recommend(query, k=3):
    """Find the first car matching ``query`` and recommend similar cars.

    The query is matched case-insensitively, as a literal substring, against
    the ``name``, ``model`` and ``make`` columns of ``DF``. The first matching
    row becomes the anchor; all other rows are ranked by the cosine
    similarity of their embedding to the anchor's embedding.

    Args:
        query: Free-text search string.
        k: Number of recommendations to return. Coerced with ``int()`` so a
            float coming from a UI slider can be used as a slice bound.

    Returns:
        A ``(anchor_markdown, recommendations, note)`` tuple.
        ``recommendations`` is a DataFrame of the top ``k`` rows with a
        ``similarity`` column holding the cosine similarity times 100,
        rounded to one decimal. For an empty query or when nothing matches,
        the first element is a message and the other two are ``None``.
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None

    k = int(k)
    needle = query.strip().lower()

    def _column_matches(column):
        # regex=False: the query is user text, so characters such as "+" or
        # "(" must match literally instead of being parsed as a pattern.
        # na=False: missing cells count as "no match" rather than NaN.
        return DF[column].str.lower().str.contains(needle, regex=False, na=False)

    mask = _column_matches("name") | _column_matches("model") | _column_matches("make")
    if not mask.any():
        return f"No match found for: {query}", None, None

    # Use the row *position*: the embedding matrix is indexed positionally,
    # which only coincides with DF's index labels for a default RangeIndex.
    pos = int(np.flatnonzero(mask.to_numpy())[0])
    anchor = DF.iloc[pos]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[pos:pos + 1], embs)[0]
    sims[pos] = -1  # keep the anchor itself out of its own recommendations
    top_idx = sims.argsort()[::-1][:k]

    anchor_md = _format_row(anchor)
    recs_df = DF.iloc[top_idx][[
        "name", "make", "model", "trim", "year", "origin_country", "body_type",
        "fuel", "engine_type", "horsepower", "popularity_score", "comfort_score",
    ]].copy()
    recs_df["similarity"] = (sims[top_idx] * 100).round(1)
    return (
        anchor_md,
        recs_df,
        f"Top {k} similar results shown (by cosine similarity on text embeddings).",
    )
def brand_compare(brands):
    """Summarise the selected makes, one row per (make, origin country) pair.

    Args:
        brands: Collection of ``make`` values to include.

    Returns:
        ``None`` when ``brands`` is empty or falsy. Otherwise a DataFrame
        with the row count (``samples``) and the mean popularity, comfort,
        horsepower and MSRP of each group. Popularity and comfort are rounded
        to one decimal; horsepower and MSRP are rounded and cast to ``int``.
    """
    if not brands:
        return None

    aggregations = {
        "samples": ("name", "count"),
        "avg_popularity": ("popularity_score", "mean"),
        "avg_comfort": ("comfort_score", "mean"),
        "avg_hp": ("horsepower", "mean"),
        "avg_msrp": ("msrp_usd", "mean"),
    }
    chosen = DF[DF["make"].isin(brands)]
    summary = (
        chosen.groupby(["make", "origin_country"])
        .agg(**aggregations)
        .reset_index()
    )

    for column in ("avg_popularity", "avg_comfort"):
        summary[column] = summary[column].round(1)
    for column in ("avg_hp", "avg_msrp"):
        summary[column] = summary[column].round(0).astype(int)
    return summary
# Gradio UI: one tab for search + recommendations, one for brand comparison.
with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")

    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")
        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()
        # The button and pressing Enter in the textbox run the same search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        # dropna(): sorted() cannot order missing values against strings.
        all_brands = sorted(DF["make"].dropna().unique().tolist())
        # Only pre-select brands that exist in the loaded dataset, so the
        # initial selection is always a subset of the offered choices.
        default_brands = [b for b in ("Toyota", "Honda", "BMW") if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # Pass the initial table through the constructor instead of assigning
        # to .value afterwards, so the component processes it like any other
        # initial value.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

demo.launch()
|