Spaces:

MatanYehudaDataAnalyst
/

Finalproject_VEN

Sleeping

App Files Files Community

MatanYehudaDataAnalyst committed 24 days ago

Commit

4f6f03e

verified ·

1 Parent(s): 25cf791

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -33

app.py CHANGED Viewed

@@ -3,79 +3,76 @@ import pandas as pd
 import numpy as np
 import pickle
 import os
-from sentence_transformers import SentenceTransformer, util
-import torch
 # ==========================================
 # 1. SETUP & DATA LOADING
 # ==========================================
-# NOTE: Check your file names exactly!
 csv_path = "cleaned_dataset_10k.csv"
 pkl_path = "final_embeddings_10k.pkl"
 if not os.path.exists(csv_path) or not os.path.exists(pkl_path):
-    # This error usually means the file names in the 'Files' tab are different
-    raise FileNotFoundError(f"❌ FILES NOT FOUND. I see these files: {os.listdir('.')}")
-# Load Data & Normalize Columns
 df = pd.read_csv(csv_path)
 df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
-# Helper to find columns even if names vary slightly
 def get_col(candidates, default):
     """Pick the column name the app should use.

     Scans `candidates` in order and returns the first one that is present
     in `df.columns`; returns `default` when none of them is present.
     """
     known = df.columns
     return next((name for name in candidates if name in known), default)
-col_name = get_col(['restaurant_name', 'name'], 'restaurant_name')
 col_rating = get_col(['rating', 'rating_score', 'stars'], 'rating')
 col_review = get_col(['review', 'review_content', 'review_content_clean'], 'review')
-col_persona = get_col(['reviewer_persona', 'persona'], 'reviewer_persona')
 # Load Embeddings
 with open(pkl_path, 'rb') as f:
     embedding_data = pickle.load(f)
-    dataset_embeddings = embedding_data['embeddings']
 # Load Model
 model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
-# Calculate Persona Profiles (Average Vectors)
 persona_profiles = {}
 if col_persona in df.columns:
     for persona in df[col_persona].unique():
         if pd.isna(persona): continue
         indices = df[df[col_persona] == persona].index
-        # valid_indices ensures we don't crash if indices mismatch
         valid_indices = [i for i in indices if i < len(dataset_embeddings)]
         if valid_indices:
             persona_vectors = dataset_embeddings[valid_indices]
-            # Use torch/numpy to calculate mean
             persona_profiles[persona] = np.mean(persona_vectors, axis=0)
 else:
     persona_profiles['Default'] = np.mean(dataset_embeddings, axis=0)
 # ==========================================
-# 2. LOGIC ENGINE (Replaced Scikit-Learn with Util)
 # ==========================================
 def run_ven_engine(budget, dietary, company, purpose, noise):
     # 1. Create a search query
     user_context = f"Searching for a {budget} experience, {dietary} friendly. Group: {company}. Occasion: {purpose}. Atmosphere: {noise}."
     # 2. Encode query
-    query_vec = model.encode(user_context, convert_to_tensor=True)
-    # 3. Find closest Persona using Sentence-Transformers Utility (No Sklearn needed)
-    best_score = -1
-    closest_persona = list(persona_profiles.keys())[0]
-    for persona, profile_vec in persona_profiles.items():
-        # Convert profile to tensor for comparison
-        profile_tensor = torch.tensor(profile_vec)
-        score = util.cos_sim(query_vec, profile_tensor).item()
-        if score > best_score:
-            best_score = score
-            closest_persona = persona
     # 4. Filter data
     if col_persona in df.columns:
@@ -88,18 +85,18 @@ def run_ven_engine(budget, dietary, company, purpose, noise):
     top_match = persona_df.sort_values(by=col_rating, ascending=False).iloc[0]
     # 6. Format Output
-    review_text = str(top_match[col_review])[:180] + "..."
-    match_pct = int(best_score * 100)
     return f"""
-    <div style="background: white; border: 1px solid #e2e8f0; border-radius: 20px; padding: 24px; box-shadow: 0 10px 30px -10px rgba(0,0,0,0.1);">
         <div style="display:flex; justify-content:space-between;">
             <div>
-                <div style="font-size: 24px; font-weight: 800; color: #1e293b;">{top_match[col_name]}</div>
-                <div style="font-size: 14px; color: #64748b; font-weight: 600;">Top Match for {closest_persona}</div>
             </div>
             <div style="text-align:right;">
-                <div style="font-size: 32px; font-weight: 900; color: #2563eb;">{top_match[col_rating]}</div>
                 <div style="font-size:12px; font-weight:bold; color:#94a3b8;">RATING</div>
             </div>
         </div>
@@ -129,7 +126,8 @@ with gr.Blocks(css=ven_css, title="VEN Project") as demo:
         with gr.Column():
             output_ui = gr.HTML("<h4>Recommendation will appear here...</h4>")
-    gr.Markdown("### 🚀 One-Click Examples")
     gr.Examples(
         examples=[
             ["Budget-friendly", "Vegetarian", "Friends", "Quick bite", "Moderate/Social"],
@@ -141,6 +139,7 @@ with gr.Blocks(css=ven_css, title="VEN Project") as demo:
         fn=run_ven_engine,
         cache_examples=True,
     )
     btn.click(run_ven_engine, inputs=[in_budget, in_diet, in_company, in_purpose, in_noise], outputs=output_ui)
 if __name__ == "__main__":

 import numpy as np
 import pickle
 import os
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
 # ==========================================
 # 1. SETUP & DATA LOADING
 # ==========================================
+# We use the EXACT filenames you provided
 csv_path = "cleaned_dataset_10k.csv"
 pkl_path = "final_embeddings_10k.pkl"
+# Check if files exist to prevent crashing
 if not os.path.exists(csv_path) or not os.path.exists(pkl_path):
+    raise FileNotFoundError(f"Error: Files not found. I see: {os.listdir('.')}")
+# Load Data
 df = pd.read_csv(csv_path)
+# Normalize column names (fixes 'Restaurant Name' vs 'restaurant_name' issues)
 df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
+# Helper to find the right column names
 def get_col(candidates, default):
     """Pick the column name the app should use.

     Scans `candidates` in order and returns the first one that is present
     in `df.columns`; returns `default` when none of them is present.
     """
     known = df.columns
     return next((name for name in candidates if name in known), default)
+# Map your CSV columns to what the app needs
+col_name = get_col(['restaurant_name', 'name', 'place'], 'restaurant_name')
 col_rating = get_col(['rating', 'rating_score', 'stars'], 'rating')
 col_review = get_col(['review', 'review_content', 'review_content_clean'], 'review')
+col_persona = get_col(['reviewer_persona', 'persona', 'type'], 'reviewer_persona')
 # Load Embeddings
 with open(pkl_path, 'rb') as f:
     embedding_data = pickle.load(f)
+    # Handle if pickle is a dictionary or direct array
+    if isinstance(embedding_data, dict) and 'embeddings' in embedding_data:
+        dataset_embeddings = embedding_data['embeddings']
+    else:
+        dataset_embeddings = embedding_data
 # Load Model
 model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+# Calculate Persona Profiles
 # Maps each non-null persona label to the mean (axis 0) of the embedding
 # rows selected for that persona, i.e. one averaged vector per persona.
 persona_profiles = {}
 if col_persona in df.columns:
     for persona in df[col_persona].unique():
         # Skip missing persona values.
         if pd.isna(persona): continue
         # Index labels of the rows tagged with this persona.
         # NOTE(review): these labels are used below as positions into
         # dataset_embeddings; this presumably relies on df keeping its
         # default 0..n-1 index and rows lining up with the embeddings.
         # Confirm against how the pickle was produced.
         indices = df[df[col_persona] == persona].index
         # Drop any label that falls past the end of the embeddings.
         valid_indices = [i for i in indices if i < len(dataset_embeddings)]
         if valid_indices:
             persona_vectors = dataset_embeddings[valid_indices]
             persona_profiles[persona] = np.mean(persona_vectors, axis=0)
 else:
     # No persona column in the CSV: fall back to a single 'Default'
     # profile averaged over every embedding.
     persona_profiles['Default'] = np.mean(dataset_embeddings, axis=0)
 # ==========================================
+# 2. LOGIC ENGINE
 # ==========================================
 def run_ven_engine(budget, dietary, company, purpose, noise):
     # 1. Create a search query
     user_context = f"Searching for a {budget} experience, {dietary} friendly. Group: {company}. Occasion: {purpose}. Atmosphere: {noise}."
     # 2. Encode query
+    query_vec = model.encode([user_context])
+    # 3. Find closest Persona
+    similarities = {p: cosine_similarity(query_vec, v.reshape(1, -1))[0][0] for p, v in persona_profiles.items()}
+    closest_persona = max(similarities, key=similarities.get)
     # 4. Filter data
     if col_persona in df.columns:
     top_match = persona_df.sort_values(by=col_rating, ascending=False).iloc[0]
     # 6. Format Output
+    match_pct = int(similarities[closest_persona] * 100)
+    review_text = str(top_match[col_review])[:160] + "..."
     return f"""
+    <div style="background: white; border: 1px solid #e2e8f0; border-radius: 20px; padding: 24px;">
         <div style="display:flex; justify-content:space-between;">
             <div>
+                <div style="font-size: 22px; font-weight: 800; color: #1e293b;">{top_match[col_name]}</div>
+                <div style="font-size: 14px; color: #64748b; font-weight: 600;">Match for: {closest_persona}</div>
             </div>
             <div style="text-align:right;">
+                <div style="font-size: 28px; font-weight: 900; color: #2563eb;">{top_match[col_rating]}</div>
                 <div style="font-size:12px; font-weight:bold; color:#94a3b8;">RATING</div>
             </div>
         </div>
         with gr.Column():
             output_ui = gr.HTML("<h4>Recommendation will appear here...</h4>")
+    # --- THIS IS STEP 7: ONE-CLICK STARTERS ---
+    gr.Markdown("### 🚀 Quick Starters (One-Click)")
     gr.Examples(
         examples=[
             ["Budget-friendly", "Vegetarian", "Friends", "Quick bite", "Moderate/Social"],
         fn=run_ven_engine,
         cache_examples=True,
     )
     btn.click(run_ven_engine, inputs=[in_budget, in_diet, in_company, in_purpose, in_noise], outputs=output_ui)
 if __name__ == "__main__":