knguyen471 committed on
Commit
83dc914
·
verified ·
1 Parent(s): 9ad7210

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +26 -8
  2. main.py +22 -5
app.py CHANGED
@@ -191,17 +191,35 @@ def create_paris_map(results_df):
191
  # import traceback
192
  # return f"Error: {str(e)}\n\n{traceback.format_exc()}", None
193
 
194
- # def search_restaurants(query, data_source, search_method, num_results, use_popularity):
195
- # """Main search function that routes to appropriate search method"""
196
- # if search_method == "Semantic Search" and use_semantic:
197
- # return semantic_search(query, data_source, num_results, use_popularity)
198
- # else:
199
- # return keyword_search(query, data_source, num_results, use_popularity)
200
 
201
def search_restaurants(query_input, data_source, num_results):
    """Normalize the raw query text and return ranked recommendations.

    The stage-1 retrieval pool is fixed at 100 candidates; `num_results`
    controls how many final recommendations come back.
    """
    cleaned_query = clean_text(query_input)
    return get_recommendations(cleaned_query, 100, num_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  # Create Gradio interface
207
  with gr.Blocks(
@@ -225,7 +243,7 @@ with gr.Blocks(
225
 
226
  with gr.Column(scale=2):
227
  data_source = gr.Dropdown(
228
- choices=["Michelin", "Google", "Yelp"],
229
  value="Yelp",
230
  label="Data Source",
231
  info="Select restaurant data source"
 
191
  # import traceback
192
  # return f"Error: {str(e)}\n\n{traceback.format_exc()}", None
193
 
 
 
 
 
 
 
194
 
195
def search_restaurants(query_input, data_source, num_results):
    """Search restaurants for a free-text query and render the results.

    Parameters
    ----------
    query_input : str
        Raw user query from the Gradio UI.
    data_source : str
        Source to filter recommendations by (e.g. "Yelp", "Google",
        "Michelin Guide").
    num_results : int
        Maximum number of recommendations requested.

    Returns
    -------
    tuple
        (markdown-formatted result listing, map HTML) for the two
        Gradio output components.
    """
    n_candidates = 100  # stage-1 retrieval pool size before reranking
    query_clean = clean_text(query_input)
    restaurant_ids = get_recommendations(query_clean, n_candidates, num_results, data_source)

    # Subset data for recommendations.
    # NOTE(review): boolean .isin() masking does not preserve the ranked
    # order returned by get_recommendations — confirm display order matters.
    results = data[data["id"].isin(restaurant_ids)]
    map_html = create_paris_map(results)

    output = f"Found {len(results)} restaurants for '{query_input}'\n"
    output += f"Data Source: {data_source}\n"

    for idx, (_, row) in enumerate(results.iterrows(), 1):
        name = row.get('name', 'Unknown')
        rating = row.get('overall_rating', 'N/A')
        reviews = row.get('review_count', 'N/A')

        output += f"{idx}. **{name}**\n"
        output += f"   Rating: {rating} | Reviews: {reviews}\n"

        # Fix: emit the address directly under the rating line; the
        # original inserted the blank entry-separator before it, detaching
        # the address from its entry (and doubling blank lines otherwise).
        if 'address' in row and pd.notna(row['address']):
            addr = str(row['address'])[:100]  # truncate very long addresses
            output += f"   Address: {addr}\n"

        # Blank line separating entries.
        output += "\n"

    return output, map_html
223
 
224
  # Create Gradio interface
225
  with gr.Blocks(
 
243
 
244
  with gr.Column(scale=2):
245
  data_source = gr.Dropdown(
246
+ choices=["Michelin Guide", "Google", "Yelp"],
247
  value="Yelp",
248
  label="Data Source",
249
  info="Select restaurant data source"
main.py CHANGED
@@ -4,6 +4,7 @@ import nltk
4
  import benepar
5
  import pandas as pd
6
  import numpy as np
 
7
  from sklearn.metrics.pairwise import cosine_similarity
8
 
9
  from utils.clean_text import clean_text
@@ -30,7 +31,11 @@ with open("data/restaurant_by_source.json", "r") as f:
30
  restaurant_by_source = json.load(f)
31
 
32
  # Load precomputed TF-IDF features
33
- restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")
 
 
 
 
34
 
35
  # Extract embeddings
36
  data["embedding"] = data["embedding"].apply(
@@ -44,25 +49,28 @@ encoder = Encoder()
44
  # Initialize syntactic parser
45
  parser = Parser()
46
 
47
- # Initialize TF-IDF vectorizer
48
- tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=True)
49
-
50
  def retrieve_candidates(query: str, n_candidates: int):
 
 
51
  # Encode query
 
52
  query_emb = encoder.encode([query]).cpu().numpy()
53
 
54
  # Semantic similarities
 
55
  desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]
56
 
57
  # TF-IDF similarities
 
58
  tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)
59
 
60
  # Syntactic similarities
 
61
  parsed_query = parser.parse_text(query)
62
  parsed_query = parser.subtree_set(parsed_query)
63
 
64
  syn_sims = []
65
- for trees_list in data["syntactic_tree"]:
66
  review_sims = []
67
  for review_tree_subs in trees_list:
68
  if review_tree_subs is None:
@@ -78,10 +86,13 @@ def retrieve_candidates(query: str, n_candidates: int):
78
  # Get top N candidates for Stage 2 reranking
79
  candidates_idx = np.argsort(combined_stage1_scores)[-n_candidates:][::-1]
80
 
 
 
81
  return candidates_idx
82
 
83
 
84
  def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None) -> list:
 
85
 
86
  # Get popularity scores for stage 1 candidates
87
  rerank_scores = data.loc[candidates_idx, "pop_score"].values
@@ -93,6 +104,12 @@ def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None)
93
  # Get restaurant_id for final recommendations
94
  restaurant_ids = data.loc[topN_reranked_global_idx, "id"].tolist()
95
 
 
 
 
 
 
 
96
  return restaurant_ids
97
 
98
  def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30, data_source: str = None):
 
4
  import benepar
5
  import pandas as pd
6
  import numpy as np
7
+ from tqdm import tqdm
8
  from sklearn.metrics.pairwise import cosine_similarity
9
 
10
  from utils.clean_text import clean_text
 
31
  restaurant_by_source = json.load(f)
32
 
33
  # Load precomputed TF-IDF features
34
+ # restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")
35
+
36
+ print("Computing TFIDF")
37
+ tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=False)
38
+ restaurant_tfidf_features = tfidf_vectorizer.compute_tfidf_matrix(data["review_text_clean"])
39
 
40
  # Extract embeddings
41
  data["embedding"] = data["embedding"].apply(
 
49
  # Initialize syntactic parser
50
  parser = Parser()
51
 
 
 
 
52
  def retrieve_candidates(query: str, n_candidates: int):
53
+ print(f"Retrieving {n_candidates} candidates...")
54
+
55
  # Encode query
56
+ print("[RETRIEVAL] Encoding query")
57
  query_emb = encoder.encode([query]).cpu().numpy()
58
 
59
  # Semantic similarities
60
+ print("[RETRIEVAL] Computing semantic similarities")
61
  desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]
62
 
63
  # TF-IDF similarities
64
+ print("[RETRIEVAL] Computing TF-IDF")
65
  tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)
66
 
67
  # Syntactic similarities
68
+ print("[RETRIEVAL] Computing syntactic similarities")
69
  parsed_query = parser.parse_text(query)
70
  parsed_query = parser.subtree_set(parsed_query)
71
 
72
  syn_sims = []
73
+ for trees_list in tqdm(data["syntactic_tree"], total=len(data), desc="[RETRIEVAL] Computing syntactic similarities"):
74
  review_sims = []
75
  for review_tree_subs in trees_list:
76
  if review_tree_subs is None:
 
86
  # Get top N candidates for Stage 2 reranking
87
  candidates_idx = np.argsort(combined_stage1_scores)[-n_candidates:][::-1]
88
 
89
+ print(f"[RETRIEVAL] Results: {candidates_idx}")
90
+
91
  return candidates_idx
92
 
93
 
94
  def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None) -> list:
95
+ print("Reranking...")
96
 
97
  # Get popularity scores for stage 1 candidates
98
  rerank_scores = data.loc[candidates_idx, "pop_score"].values
 
104
  # Get restaurant_id for final recommendations
105
  restaurant_ids = data.loc[topN_reranked_global_idx, "id"].tolist()
106
 
107
+ # Filter to only data_source
108
+ print(f"[RERANK] Filtering to only source - {data_source}")
109
+ restaurant_by_source_set = set(restaurant_by_source[data_source])
110
+ restaurant_ids = [x for x in restaurant_ids if x in restaurant_by_source_set]
111
+
112
+ print(f"[RERANK] Final recommendations: {restaurant_ids}")
113
  return restaurant_ids
114
 
115
  def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30, data_source: str = None):