Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- DataDrive_dataset.csv +0 -0
- app-2.py +108 -0
- create_embeddings.py +35 -0
- requirements.txt +6 -0
DataDrive_dataset.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app-2.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
import gradio as gr
|
| 7 |
+
|
| 8 |
+
# Configuration: every path can be overridden through the environment so the
# same code runs locally and on the Space without edits.
DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")  # source records
EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")  # cached embedding matrix
ID_PATH = os.getenv("ID_PATH", "ids.csv")  # identifier columns saved with the matrix

# Loaded once at import time; all Gradio handlers read from this frame.
DF = pd.read_csv(DATASET_CSV)
|
| 13 |
+
|
| 14 |
+
def ensure_embeddings():
    """Return the dataset's embedding matrix, building it on first need.

    Loads ``EMB_PATH`` if present; otherwise encodes ``DF["text_record"]``
    with a MiniLM sentence-transformer, saves the matrix plus the id columns,
    and then loads the saved file. The loaded array is cached on the function
    object so repeated searches do not re-read the file from disk.

    Returns:
        np.ndarray: float32 matrix, one L2-normalized row per dataset record.

    Raises:
        RuntimeError: if the embeddings are missing and cannot be built
            (e.g. sentence-transformers is not installed).
    """
    cached = getattr(ensure_embeddings, "_cache", None)
    if cached is not None:
        return cached
    if not os.path.exists(EMB_PATH) or not os.path.exists(ID_PATH):
        try:
            # Imported lazily so the app can still start from precomputed
            # files even when sentence-transformers is unavailable.
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            embs = model.encode(DF["text_record"].astype(str).tolist(), batch_size=256, show_progress_bar=True, normalize_embeddings=True)
            embs = np.asarray(embs, dtype="float32")
            np.save(EMB_PATH, embs)
            DF[["name","make","model","trim","year"]].to_csv(ID_PATH, index=False)
        except Exception as e:
            # Chain the original failure so the traceback stays debuggable.
            raise RuntimeError(f"Embeddings not found and auto-build failed: {e}") from e
    embs = np.load(EMB_PATH)
    ensure_embeddings._cache = embs
    return embs
|
| 26 |
+
|
| 27 |
+
def _format_row(row):
|
| 28 |
+
return (
|
| 29 |
+
f"**{row['name']}** \n"
|
| 30 |
+
f"- Origin: {row['origin_country']} \n"
|
| 31 |
+
f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']} \n"
|
| 32 |
+
f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']} \n"
|
| 33 |
+
f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']} \n"
|
| 34 |
+
f"- MSRP (USD): ${int(row['msrp_usd']):,} \n"
|
| 35 |
+
f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10 \n"
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
def search_and_recommend(query, k=3):
    """Find the first car matching *query* and return its nearest neighbours.

    Args:
        query: Free-text search matched (case-insensitively, as a literal
            substring) against the name, model, and make columns.
        k: Number of recommendations to return.

    Returns:
        Tuple of (anchor_markdown, recommendations_dataframe, note_markdown);
        the last two are ``None`` when the query is empty or has no match.
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None
    q = query.strip().lower()
    # regex=False: treat user input literally (queries such as "c++" or "("
    # must not be interpreted as regex and crash); na=False: missing cells
    # never match, and the mask stays a clean boolean Series.
    mask = (
        DF["name"].str.lower().str.contains(q, regex=False, na=False)
        | DF["model"].str.lower().str.contains(q, regex=False, na=False)
        | DF["make"].str.lower().str.contains(q, regex=False, na=False)
    )
    if not mask.any():
        return f"No match found for: {query}", None, None
    # NOTE(review): assumes DF keeps its default RangeIndex so the label idx
    # equals the positional row in the embedding matrix — confirm if the
    # frame is ever filtered/re-indexed upstream.
    idx = DF.index[mask][0]
    anchor = DF.loc[idx]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[idx:idx + 1], embs)[0]
    sims[idx] = -1  # never recommend the anchor itself
    # Gradio sliders may deliver floats; clamp so slicing is valid and we
    # never ask for more neighbours than exist.
    k = max(1, min(int(k), len(DF) - 1))
    top_idx = sims.argsort()[::-1][:k]
    top_rows = DF.iloc[top_idx].copy()
    top_rows["similarity"] = sims[top_idx]

    anchor_md = _format_row(anchor)
    recs_df = top_rows[[
        "name","make","model","trim","year","origin_country","body_type",
        "fuel","engine_type","horsepower","popularity_score","comfort_score"
    ]].copy()
    recs_df["similarity"] = (top_rows["similarity"] * 100).round(1)
    return anchor_md, recs_df, f"Top {k} similar results shown (by cosine similarity on text embeddings)."
|
| 66 |
+
|
| 67 |
+
def brand_compare(brands):
    """Summarise the selected makes: sample count and average scores/specs.

    Returns a DataFrame grouped by (make, origin_country), or ``None`` when
    no brands are selected.
    """
    if not brands:
        return None
    subset = DF[DF["make"].isin(brands)].copy()
    summary = (
        subset.groupby(["make","origin_country"])
        .agg(
            samples=("name","count"),
            avg_popularity=("popularity_score","mean"),
            avg_comfort=("comfort_score","mean"),
            avg_hp=("horsepower","mean"),
            avg_msrp=("msrp_usd","mean"),
        )
        .reset_index()
    )
    # One decimal for the 0-10 scores; whole numbers for HP and price.
    for col in ("avg_popularity", "avg_comfort"):
        summary[col] = summary[col].round(1)
    for col in ("avg_hp", "avg_msrp"):
        summary[col] = summary[col].round(0).astype(int)
    return summary
|
| 85 |
+
|
| 86 |
+
# UI wiring: two tabs sharing the module-level DF and embedding cache.
with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")
    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")

        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()

        # Both the button and pressing Enter in the textbox trigger a search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        all_brands = sorted(DF["make"].unique().tolist())
        # Only preselect brands that actually exist in this dataset, so the
        # CheckboxGroup default is always valid.
        default_brands = [b for b in ("Toyota", "Honda", "BMW") if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # Pass the initial table at construction time: assigning table.value
        # after the component is built is not rendered by Gradio Blocks.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

demo.launch()
|
create_embeddings.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
import numpy as np
import pandas as pd

# NOTE: this import is NOT lazy — it pulls in torch at startup, so the
# script requires sentence-transformers installed even to report errors.
from sentence_transformers import SentenceTransformer

# Default matches the dataset shipped with this Space (app-2.py reads the
# same file); the previous default "cars1200_text_dataset.csv" does not
# exist in the repository and made the script fail out of the box.
DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")
ID_PATH = os.getenv("ID_PATH", "ids.csv")
MODEL_NAME = os.getenv("MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
TEXT_COL = os.getenv("TEXT_COL", "text_record")
|
| 14 |
+
|
| 15 |
+
def main():
    """Embed every text record in the dataset and persist the results.

    Writes the float32 embedding matrix to EMB_PATH and the identifying
    columns (name/make/model/trim/year) to ID_PATH.
    """
    if not os.path.exists(DATASET_CSV):
        raise FileNotFoundError(f"Dataset not found: {DATASET_CSV}")
    df = pd.read_csv(DATASET_CSV)
    if TEXT_COL not in df.columns:
        raise KeyError(f"Column '{TEXT_COL}' not found in {DATASET_CSV}.")

    print(f"Loading model: {MODEL_NAME}")
    encoder = SentenceTransformer(MODEL_NAME)

    records = df[TEXT_COL].astype(str).tolist()
    print(f"Encoding {len(records)} records...")
    vectors = encoder.encode(records, batch_size=256, show_progress_bar=True, normalize_embeddings=True)
    vectors = np.asarray(vectors, dtype="float32")

    np.save(EMB_PATH, vectors)
    df[["name","make","model","trim","year"]].to_csv(ID_PATH, index=False)
    print(f"Saved embeddings to {EMB_PATH} and ids to {ID_PATH}")

if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
gradio
|
| 5 |
+
sentence-transformers
|
| 6 |
+
torch
|