Levimichael4 committed on
Commit
2615cde
·
verified ·
1 Parent(s): 50eb4e3

Upload 4 files

Browse files
Files changed (4) hide show
  1. DataDrive_dataset.csv +0 -0
  2. app-2.py +108 -0
  3. create_embeddings.py +35 -0
  4. requirements.txt +6 -0
DataDrive_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
app-2.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import numpy as np
4
+ import pandas as pd
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import gradio as gr
7
+
8
+ DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")
9
+ EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")
10
+ ID_PATH = os.getenv("ID_PATH", "ids.csv")
11
+
12
+ DF = pd.read_csv(DATASET_CSV)
13
+
14
def ensure_embeddings():
    """Return the (num_rows, dim) embedding matrix aligned row-for-row with DF.

    Loads the cached matrix from EMB_PATH when present; otherwise encodes
    DF["text_record"] with a sentence-transformers model and caches the
    result (plus an id table at ID_PATH) for subsequent runs.

    Returns:
        numpy.ndarray: float32 matrix with one embedding per DF row.

    Raises:
        RuntimeError: if the cache is absent and rebuilding fails, or if a
            cached matrix no longer matches the dataset size (stale cache).
    """
    if not os.path.exists(EMB_PATH) or not os.path.exists(ID_PATH):
        try:
            # Lazy import: sentence-transformers is only needed for a rebuild.
            from sentence_transformers import SentenceTransformer

            model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
            embs = model.encode(
                DF["text_record"].astype(str).tolist(),
                batch_size=256,
                show_progress_bar=True,
                normalize_embeddings=True,
            )
            embs = np.asarray(embs, dtype="float32")
            np.save(EMB_PATH, embs)
            DF[["name", "make", "model", "trim", "year"]].to_csv(ID_PATH, index=False)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"Embeddings not found and auto-build failed: {e}") from e
    embs = np.load(EMB_PATH)
    # Guard against a stale cache (dataset edited after embeddings were built):
    # a row-count mismatch would silently mis-align recommendations.
    if embs.shape[0] != len(DF):
        raise RuntimeError(
            f"Cached embeddings ({embs.shape[0]} rows) do not match the dataset "
            f"({len(DF)} rows); delete {EMB_PATH} and {ID_PATH} to rebuild."
        )
    return embs
26
+
27
+ def _format_row(row):
28
+ return (
29
+ f"**{row['name']}** \n"
30
+ f"- Origin: {row['origin_country']} \n"
31
+ f"- Body: {row['body_type']} | Fuel: {row['fuel']} | Engine: {row['engine_type']} \n"
32
+ f"- Drivetrain: {row['drivetrain']} | Transmission: {row['transmission']} \n"
33
+ f"- HP: {int(row['horsepower'])} | Seats: {int(row['seats'])} | Efficiency: {row['efficiency']} \n"
34
+ f"- MSRP (USD): ${int(row['msrp_usd']):,} \n"
35
+ f"- Popularity: {int(row['popularity_score'])}/10 | Comfort: {int(row['comfort_score'])}/10 \n"
36
+ )
37
+
38
def search_and_recommend(query, k=3):
    """Find the first car matching `query` and return its `k` nearest neighbours.

    Matches case-insensitively against the name, model, and make columns as a
    literal substring (not a regex), then ranks all other rows by cosine
    similarity of their text embeddings.

    Args:
        query: free-text search string from the UI.
        k: number of recommendations to return (defaults to 3).

    Returns:
        (anchor_md, recs_df, note): Markdown for the matched car, a DataFrame
        of recommendations with a 0-100 similarity column, and a status
        string. On empty input or no match, recs_df and note are None.
    """
    if not query or not query.strip():
        return "Type a car name, brand, or model.", None, None
    q = query.strip().lower()
    # regex=False: treat user input literally — a query like "C(300" must not
    # raise re.error. na=False: missing cells count as non-matches instead of
    # propagating NaN into the boolean mask.
    mask = (
        DF["name"].str.lower().str.contains(q, regex=False, na=False)
        | DF["model"].str.lower().str.contains(q, regex=False, na=False)
        | DF["make"].str.lower().str.contains(q, regex=False, na=False)
    )
    if not mask.any():
        return f"No match found for: {query}", None, None
    # Use the positional index throughout: embeddings are row-aligned with DF,
    # so mixing index labels with positions would break on a non-RangeIndex.
    pos = int(np.flatnonzero(mask.to_numpy())[0])
    anchor = DF.iloc[pos]

    embs = ensure_embeddings()
    sims = cosine_similarity(embs[pos:pos + 1], embs)[0]
    sims[pos] = -1  # never recommend the anchor itself
    k = max(1, int(k))
    top_idx = sims.argsort()[::-1][:k]
    top_rows = DF.iloc[top_idx].copy()
    top_rows["similarity"] = sims[top_idx]

    anchor_md = _format_row(anchor)
    recs_df = top_rows[[
        "name", "make", "model", "trim", "year", "origin_country", "body_type",
        "fuel", "engine_type", "horsepower", "popularity_score", "comfort_score",
    ]].copy()
    recs_df["similarity"] = (top_rows["similarity"] * 100).round(1)
    return anchor_md, recs_df, f"Top {k} similar results shown (by cosine similarity on text embeddings)."
66
+
67
def brand_compare(brands):
    """Aggregate per-brand stats (counts and averages) for the chosen makes.

    Returns None when no brands are selected; otherwise a DataFrame grouped
    by make and origin country with rounded average scores and prices.
    """
    if not brands:
        return None
    subset = DF[DF["make"].isin(brands)].copy()
    summary = (
        subset.groupby(["make", "origin_country"])
        .agg(
            samples=("name", "count"),
            avg_popularity=("popularity_score", "mean"),
            avg_comfort=("comfort_score", "mean"),
            avg_hp=("horsepower", "mean"),
            avg_msrp=("msrp_usd", "mean"),
        )
        .reset_index()
    )
    # Scores keep one decimal; horsepower and price are shown as whole numbers.
    for col in ("avg_popularity", "avg_comfort"):
        summary[col] = summary[col].round(1)
    for col in ("avg_hp", "avg_msrp"):
        summary[col] = summary[col].round(0).astype(int)
    return summary
85
+
86
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# DataDrive — Cars Recommender (Text, 1,200 records)")
    with gr.Tab("Search + Recommend"):
        gr.Markdown("Enter a car (brand/model/name). We show the match and the **Top-3 similar cars**.")
        inp = gr.Textbox(label="Search", placeholder="e.g., Toyota Corolla, Model 3, Golf, BMW 3 Series")
        topk = gr.Slider(1, 5, value=3, step=1, label="Number of recommendations")
        btn = gr.Button("Find & Recommend")

        anchor_md = gr.Markdown()
        recs_df = gr.Dataframe(interactive=False)
        note = gr.Markdown()

        # Both the button and pressing Enter in the textbox trigger a search.
        btn.click(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])
        inp.submit(search_and_recommend, [inp, topk], [anchor_md, recs_df, note])

    with gr.Tab("Brand Compare"):
        all_brands = sorted(DF["make"].unique().tolist())
        # Only preselect brands that actually exist in the dataset, so the
        # CheckboxGroup default is always a valid value.
        default_brands = [b for b in ("Toyota", "Honda", "BMW") if b in all_brands]
        brands_inp = gr.CheckboxGroup(choices=all_brands, label="Pick brands", value=default_brands)
        # Seed the table through the constructor: assigning `table.value`
        # after creation does not update an already-built Gradio component.
        table = gr.Dataframe(value=brand_compare(default_brands), interactive=False)
        brands_inp.change(brand_compare, brands_inp, table)

demo.launch()
create_embeddings.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import os

import numpy as np
import pandas as pd

# NOTE: imported eagerly at module load; the heavy model download itself only
# happens inside main() when SentenceTransformer(MODEL_NAME) is constructed.
from sentence_transformers import SentenceTransformer

# Default must match the CSV shipped with this repo — the same file app-2.py
# reads — so the script works out of the box; override via env vars as needed.
DATASET_CSV = os.getenv("DATASET_CSV", "DataDrive_dataset.csv")
EMB_PATH = os.getenv("EMB_PATH", "embeddings.npy")
ID_PATH = os.getenv("ID_PATH", "ids.csv")
MODEL_NAME = os.getenv("MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2")
TEXT_COL = os.getenv("TEXT_COL", "text_record")
14
+
15
def main():
    """Encode TEXT_COL of DATASET_CSV and cache embeddings plus an id table.

    Writes a float32 matrix to EMB_PATH (one L2-normalized embedding per
    record) and the identifying columns to ID_PATH, row-aligned with the CSV.

    Raises:
        FileNotFoundError: if DATASET_CSV does not exist.
        KeyError: if TEXT_COL or any id column is missing from the CSV.
    """
    if not os.path.exists(DATASET_CSV):
        raise FileNotFoundError(f"Dataset not found: {DATASET_CSV}")
    df = pd.read_csv(DATASET_CSV)
    if TEXT_COL not in df.columns:
        raise KeyError(f"Column '{TEXT_COL}' not found in {DATASET_CSV}.")
    # Fail early — before the slow encode step — if the id table can't be built.
    id_cols = ["name", "make", "model", "trim", "year"]
    missing = [c for c in id_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Id columns missing from {DATASET_CSV}: {missing}")

    print(f"Loading model: {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)

    texts = df[TEXT_COL].astype(str).tolist()
    print(f"Encoding {len(texts)} records...")
    embs = model.encode(texts, batch_size=256, show_progress_bar=True, normalize_embeddings=True)
    embs = np.asarray(embs, dtype="float32")

    np.save(EMB_PATH, embs)
    df[id_cols].to_csv(ID_PATH, index=False)
    print(f"Saved embeddings to {EMB_PATH} and ids to {ID_PATH}")

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ scikit-learn
4
+ gradio
5
+ sentence-transformers
6
+ torch