File size: 4,521 Bytes
2615cde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr

# File locations. Each one can be overridden through an environment variable
# of the same name; otherwise the default file name is used as given.
DATASET_CSV = os.environ.get("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.environ.get("EMB_PATH", "embeddings.npy")
ID_PATH = os.environ.get("ID_PATH", "ids.csv")

# The dataset is read once at import time and shared by every function below.
DF = pd.read_csv(DATASET_CSV)

def ensure_embeddings():
    """Return the embedding matrix, building the on-disk files first if needed.

    When either ``EMB_PATH`` or ``ID_PATH`` does not exist, every
    ``text_record`` value in ``DF`` is encoded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model (normalized embeddings),
    the result is stored as float32 at ``EMB_PATH`` and the identifying
    columns are written to ``ID_PATH``. The matrix is then loaded from
    ``EMB_PATH`` and returned.

    Returns:
        The array loaded from ``EMB_PATH``.

    Raises:
        RuntimeError: If the files are missing and rebuilding them fails. The
            original exception is attached as ``__cause__``.
    """
    if not (os.path.exists(EMB_PATH) and os.path.exists(ID_PATH)):
        try:
            # Imported lazily so the dependency is only required for a rebuild.
            from sentence_transformers import SentenceTransformer

            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            embs = model.encode(
                DF["text_record"].astype(str).tolist(),
                batch_size=256,
                show_progress_bar=True,
                normalize_embeddings=True,
            )
            embs = np.asarray(embs, dtype="float32")
            # Save through a file handle: np.save(<str path>, ...) silently
            # appends ".npy" when the path lacks that suffix, which would make
            # the np.load(EMB_PATH) below (and the existence check above) miss
            # the file for a custom EMB_PATH such as "embeddings.bin".
            with open(EMB_PATH, "wb") as fh:
                np.save(fh, embs)
            DF[["name", "make", "model", "trim", "year"]].to_csv(ID_PATH, index=False)
        except Exception as e:
            raise RuntimeError(f"Embeddings not found and auto-build failed: {e}") from e
    return np.load(EMB_PATH)

def _format_row(row):
    return (
        f"**{row['name']}**  \n"
        f"- Origin: {row['origin_country']}  \n"
        f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']}  \n"
        f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']}  \n"
        f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']}  \n"
        f"- MSRP (USD): ${int(row['msrp_usd']):,}  \n"
        f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10  \n"
    )

def search_and_recommend(query, k=3):
    """Find the first car matching ``query`` and recommend the most similar ones.

    The query is matched case-insensitively as a literal substring against the
    ``name``, ``model`` and ``make`` columns of ``DF``. The first matching row
    is the anchor; the other rows are ranked by cosine similarity between
    their embedding and the anchor's embedding.

    Args:
        query: Free-text search string.
        k: Number of recommendations to return. Coerced with ``int`` because
            a UI slider may deliver it as a float.

    Returns:
        A 3-tuple ``(anchor_markdown, recommendations, note)``. For an empty
        query or when nothing matches, the first element is a message and the
        other two are ``None``. Otherwise ``recommendations`` is a DataFrame
        with a ``similarity`` column expressed in percent (one decimal).
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None
    k = int(k)
    q = query.strip().lower()

    # regex=False: user input such as "c++" or "(" must be matched literally
    # instead of being compiled as a regular expression.
    # na=False: missing cells count as "no match" rather than propagating NaN.
    mask = np.zeros(len(DF), dtype=bool)
    for column in ("name", "model", "make"):
        hits = DF[column].str.lower().str.contains(q, regex=False, na=False)
        mask |= hits.to_numpy(dtype=bool)
    if not mask.any():
        return f"No match found for: {query}", None, None

    # Work with the row *position*: the embedding matrix is addressed by
    # position, which only coincides with the index label for a default index.
    pos = int(np.flatnonzero(mask)[0])
    anchor = DF.iloc[pos]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[pos:pos + 1], embs)[0]
    sims[pos] = -1  # push the anchor itself to the bottom of the ranking
    top_idx = sims.argsort()[::-1][:k]

    anchor_md = _format_row(anchor)
    recs_df = DF.iloc[top_idx][[
        "name", "make", "model", "trim", "year", "origin_country", "body_type",
        "fuel", "engine_type", "horsepower", "popularity_score", "comfort_score",
    ]].copy()
    recs_df["similarity"] = (sims[top_idx] * 100).round(1)
    return anchor_md, recs_df, f"Top {k} similar results shown (by cosine similarity on text embeddings)."

def brand_compare(brands):
    """Build a per-brand summary table for the selected makes.

    Rows of ``DF`` whose ``make`` is in ``brands`` are grouped by make and
    origin country. For each group the table holds the row count and the mean
    popularity, comfort, horsepower and MSRP. The two score averages are
    rounded to one decimal; horsepower and MSRP averages are rounded and cast
    to integers.

    Returns ``None`` when ``brands`` is empty or ``None``.
    """
    if not brands:
        return None

    aggregations = {
        "samples": ("name", "count"),
        "avg_popularity": ("popularity_score", "mean"),
        "avg_comfort": ("comfort_score", "mean"),
        "avg_hp": ("horsepower", "mean"),
        "avg_msrp": ("msrp_usd", "mean"),
    }
    chosen = DF[DF["make"].isin(brands)].copy()
    grouped = chosen.groupby(["make", "origin_country"])
    summary = grouped.agg(**aggregations).reset_index()

    for column in ("avg_popularity", "avg_comfort"):
        summary[column] = summary[column].round(1)
    for column in ("avg_hp", "avg_msrp"):
        summary[column] = summary[column].round(0).astype(int)
    return summary

# Two-tab Gradio UI: free-text search with recommendations, and a per-brand
# comparison table.
with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")
    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")

        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()

        # The button and pressing Enter in the textbox trigger the same search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        # dropna() keeps sorted() from failing on a mix of strings and NaN.
        all_brands = sorted(DF["make"].dropna().unique().tolist())
        # Preselect only brands that are actually offered as choices.
        default_brands = [b for b in ("Toyota", "Honda", "BMW") if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # The initial table goes in through the constructor so the component
        # processes it like any other value, instead of having the attribute
        # overwritten with a raw DataFrame after construction.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

demo.launch()