Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- DataDrive_dataset.csv +0 -0
- app-2.py +108 -0
- create_embeddings.py +35 -0
- requirements.txt +6 -0
DataDrive_dataset.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app-2.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
import gradio as gr
|
| 7 |
+
|
| 8 |
+
# Configuration: every path can be overridden through the environment so the
# same code runs locally and on the Space without edits.
DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")  # source records
EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")  # cached embedding matrix
ID_PATH = os.getenv("ID_PATH", "ids.csv")  # identifier columns saved with the matrix

# Loaded once at import time; all Gradio handlers read from this frame.
DF = pd.read_csv(DATASET_CSV)
|
| 13 |
+
|
| 14 |
+
def ensure_embeddings():
    """Return the dataset's embedding matrix, building it on first need.

    Loads ``EMB_PATH`` if present; otherwise encodes ``DF["text_record"]``
    with a MiniLM sentence-transformer, saves the matrix plus the id columns,
    and then loads the saved file. The loaded array is cached on the function
    object so repeated searches do not re-read the file from disk.

    Returns:
        np.ndarray: float32 matrix, one L2-normalized row per dataset record.

    Raises:
        RuntimeError: if the embeddings are missing and cannot be built
            (e.g. sentence-transformers is not installed).
    """
    cached = getattr(ensure_embeddings, "_cache", None)
    if cached is not None:
        return cached
    if not os.path.exists(EMB_PATH) or not os.path.exists(ID_PATH):
        try:
            # Imported lazily so the app can still start from precomputed
            # files even when sentence-transformers is unavailable.
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            embs = model.encode(DF["text_record"].astype(str).tolist(), batch_size=256, show_progress_bar=True, normalize_embeddings=True)
            embs = np.asarray(embs, dtype="float32")
            np.save(EMB_PATH, embs)
            DF[["name","make","model","trim","year"]].to_csv(ID_PATH, index=False)
        except Exception as e:
            # Chain the original failure so the traceback stays debuggable.
            raise RuntimeError(f"Embeddings not found and auto-build failed: {e}") from e
    embs = np.load(EMB_PATH)
    ensure_embeddings._cache = embs
    return embs
|
| 26 |
+
|
| 27 |
+
def _format_row(row):
|
| 28 |
+
return (
|
| 29 |
+
f"**{row['name']}** \n"
|
| 30 |
+
f"- Origin: {row['origin_country']} \n"
|
| 31 |
+
f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']} \n"
|
| 32 |
+
f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']} \n"
|
| 33 |
+
f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']} \n"
|
| 34 |
+
f"- MSRP (USD): ${int(row['msrp_usd']):,} \n"
|
| 35 |
+
f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10 \n"
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
def search_and_recommend(query, k=3):
    """Find the first car matching *query* and return its nearest neighbours.

    Args:
        query: Free-text search matched (case-insensitively, as a literal
            substring) against the name, model, and make columns.
        k: Number of recommendations to return.

    Returns:
        Tuple of (anchor_markdown, recommendations_dataframe, note_markdown);
        the last two are ``None`` when the query is empty or has no match.
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None
    q = query.strip().lower()
    # regex=False: treat user input literally (queries such as "c++" or "("
    # must not be interpreted as regex and crash); na=False: missing cells
    # never match, and the mask stays a clean boolean Series.
    mask = (
        DF["name"].str.lower().str.contains(q, regex=False, na=False)
        | DF["model"].str.lower().str.contains(q, regex=False, na=False)
        | DF["make"].str.lower().str.contains(q, regex=False, na=False)
    )
    if not mask.any():
        return f"No match found for: {query}", None, None
    # NOTE(review): assumes DF keeps its default RangeIndex so the label idx
    # equals the positional row in the embedding matrix — confirm if the
    # frame is ever filtered/re-indexed upstream.
    idx = DF.index[mask][0]
    anchor = DF.loc[idx]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[idx:idx + 1], embs)[0]
    sims[idx] = -1  # never recommend the anchor itself
    # Gradio sliders may deliver floats; clamp so slicing is valid and we
    # never ask for more neighbours than exist.
    k = max(1, min(int(k), len(DF) - 1))
    top_idx = sims.argsort()[::-1][:k]
    top_rows = DF.iloc[top_idx].copy()
    top_rows["similarity"] = sims[top_idx]

    anchor_md = _format_row(anchor)
    recs_df = top_rows[[
        "name","make","model","trim","year","origin_country","body_type",
        "fuel","engine_type","horsepower","popularity_score","comfort_score"
    ]].copy()
    recs_df["similarity"] = (top_rows["similarity"] * 100).round(1)
    return anchor_md, recs_df, f"Top {k} similar results shown (by cosine similarity on text embeddings)."
|
| 66 |
+
|
| 67 |
+
def brand_compare(brands):
    """Summarise the selected makes: sample count and average scores/specs.

    Returns a DataFrame grouped by (make, origin_country), or ``None`` when
    no brands are selected.
    """
    if not brands:
        return None
    subset = DF[DF["make"].isin(brands)].copy()
    summary = (
        subset.groupby(["make","origin_country"])
        .agg(
            samples=("name","count"),
            avg_popularity=("popularity_score","mean"),
            avg_comfort=("comfort_score","mean"),
            avg_hp=("horsepower","mean"),
            avg_msrp=("msrp_usd","mean"),
        )
        .reset_index()
    )
    # One decimal for the 0-10 scores; whole numbers for HP and price.
    for col in ("avg_popularity", "avg_comfort"):
        summary[col] = summary[col].round(1)
    for col in ("avg_hp", "avg_msrp"):
        summary[col] = summary[col].round(0).astype(int)
    return summary
|
| 85 |
+
|
| 86 |
+
# UI wiring: two tabs sharing the module-level DF and embedding cache.
with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")
    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")

        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()

        # Both the button and pressing Enter in the textbox trigger a search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        all_brands = sorted(DF["make"].unique().tolist())
        # Only preselect brands that actually exist in this dataset, so the
        # CheckboxGroup default is always valid.
        default_brands = [b for b in ("Toyota", "Honda", "BMW") if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # Pass the initial table at construction time: assigning table.value
        # after the component is built is not rendered by Gradio Blocks.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

demo.launch()
|
create_embeddings.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
import numpy as np
import pandas as pd

# NOTE: this import is NOT lazy — it pulls in torch at startup, so the
# script requires sentence-transformers installed even to report errors.
from sentence_transformers import SentenceTransformer

# Default matches the dataset shipped with this Space (app-2.py reads the
# same file); the previous default "cars1200_text_dataset.csv" does not
# exist in the repository and made the script fail out of the box.
DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")
ID_PATH = os.getenv("ID_PATH", "ids.csv")
MODEL_NAME = os.getenv("MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
TEXT_COL = os.getenv("TEXT_COL", "text_record")
|
| 14 |
+
|
| 15 |
+
def main():
    """Embed every text record in the dataset and persist the results.

    Writes the float32 embedding matrix to EMB_PATH and the identifying
    columns (name/make/model/trim/year) to ID_PATH.
    """
    if not os.path.exists(DATASET_CSV):
        raise FileNotFoundError(f"Dataset not found: {DATASET_CSV}")
    df = pd.read_csv(DATASET_CSV)
    if TEXT_COL not in df.columns:
        raise KeyError(f"Column '{TEXT_COL}' not found in {DATASET_CSV}.")

    print(f"Loading model: {MODEL_NAME}")
    encoder = SentenceTransformer(MODEL_NAME)

    records = df[TEXT_COL].astype(str).tolist()
    print(f"Encoding {len(records)} records...")
    vectors = encoder.encode(records, batch_size=256, show_progress_bar=True, normalize_embeddings=True)
    vectors = np.asarray(vectors, dtype="float32")

    np.save(EMB_PATH, vectors)
    df[["name","make","model","trim","year"]].to_csv(ID_PATH, index=False)
    print(f"Saved embeddings to {EMB_PATH} and ids to {ID_PATH}")

if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
gradio
|
| 5 |
+
sentence-transformers
|
| 6 |
+
torch
|