Spaces:
Running
Running
Upload 2 files
Browse files
app.py
CHANGED
|
@@ -191,17 +191,35 @@ def create_paris_map(results_df):
|
|
| 191 |
# import traceback
|
| 192 |
# return f"Error: {str(e)}\n\n{traceback.format_exc()}", None
|
| 193 |
|
| 194 |
-
# def search_restaurants(query, data_source, search_method, num_results, use_popularity):
|
| 195 |
-
# """Main search function that routes to appropriate search method"""
|
| 196 |
-
# if search_method == "Semantic Search" and use_semantic:
|
| 197 |
-
# return semantic_search(query, data_source, num_results, use_popularity)
|
| 198 |
-
# else:
|
| 199 |
-
# return keyword_search(query, data_source, num_results, use_popularity)
|
| 200 |
|
| 201 |
def search_restaurants(query_input, data_source, num_results):
|
| 202 |
n_candidates = 100
|
| 203 |
query_clean = clean_text(query_input)
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
# Create Gradio interface
|
| 207 |
with gr.Blocks(
|
|
@@ -225,7 +243,7 @@ with gr.Blocks(
|
|
| 225 |
|
| 226 |
with gr.Column(scale=2):
|
| 227 |
data_source = gr.Dropdown(
|
| 228 |
-
choices=["Michelin", "Google", "Yelp"],
|
| 229 |
value="Yelp",
|
| 230 |
label="Data Source",
|
| 231 |
info="Select restaurant data source"
|
|
|
|
| 191 |
# import traceback
|
| 192 |
# return f"Error: {str(e)}\n\n{traceback.format_exc()}", None
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
def search_restaurants(query_input, data_source, num_results):
    """Run the two-stage recommender for a free-text query and build the UI outputs.

    Parameters
    ----------
    query_input : str
        Raw user query from the Gradio textbox.
    data_source : str
        Review source to restrict results to (e.g. "Yelp", "Google").
    num_results : int
        Number of final recommendations requested.

    Returns
    -------
    tuple[str, str]
        (markdown-formatted result listing, Paris map HTML from create_paris_map).
    """
    n_candidates = 100  # stage-1 retrieval pool size, reranked down to num_results
    query_clean = clean_text(query_input)
    restaurant_ids = get_recommendations(query_clean, n_candidates, num_results, data_source)

    # Subset data for recommendations.
    # FIX: a plain boolean-mask subset keeps the DataFrame's storage order and
    # silently discards the ranking produced by get_recommendations — reorder
    # the rows to match the recommender's ranked id list.
    results = data[data["id"].isin(restaurant_ids)].copy()
    rank = {rid: pos for pos, rid in enumerate(restaurant_ids)}
    results = results.sort_values(by="id", key=lambda ids: ids.map(rank))

    map_html = create_paris_map(results)

    output = f"Found {len(results)} restaurants for '{query_input}'\n"
    output += f"Data Source: {data_source}\n"

    for idx, (_, row) in enumerate(results.iterrows(), 1):
        name = row.get('name', 'Unknown')
        rating = row.get('overall_rating', 'N/A')
        reviews = row.get('review_count', 'N/A')

        output += f"{idx}. **{name}**\n"
        output += f" Rating: {rating} | Reviews: {reviews}\n"

        # FIX: the address is emitted directly under its entry; the original
        # inserted a blank line first, visually detaching the address from
        # the restaurant it belongs to.
        if 'address' in row and pd.notna(row['address']):
            addr = str(row['address'])[:100]  # truncate long addresses for display
            output += f" Address: {addr}\n"

        output += "\n"  # blank line separating entries

    return output, map_html
|
| 223 |
|
| 224 |
# Create Gradio interface
|
| 225 |
with gr.Blocks(
|
|
|
|
| 243 |
|
| 244 |
with gr.Column(scale=2):
|
| 245 |
data_source = gr.Dropdown(
|
| 246 |
+
choices=["Michelin Guide", "Google", "Yelp"],
|
| 247 |
value="Yelp",
|
| 248 |
label="Data Source",
|
| 249 |
info="Select restaurant data source"
|
main.py
CHANGED
|
@@ -4,6 +4,7 @@ import nltk
|
|
| 4 |
import benepar
|
| 5 |
import pandas as pd
|
| 6 |
import numpy as np
|
|
|
|
| 7 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
|
| 9 |
from utils.clean_text import clean_text
|
|
@@ -30,7 +31,11 @@ with open("data/restaurant_by_source.json", "r") as f:
|
|
| 30 |
restaurant_by_source = json.load(f)
|
| 31 |
|
| 32 |
# Load precomputed TF-IDF features
|
| 33 |
-
restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Extract embeddings
|
| 36 |
data["embedding"] = data["embedding"].apply(
|
|
@@ -44,25 +49,28 @@ encoder = Encoder()
|
|
| 44 |
# Initialize syntactic parser
|
| 45 |
parser = Parser()
|
| 46 |
|
| 47 |
-
# Initialize TF-IDF vectorizer
|
| 48 |
-
tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=True)
|
| 49 |
-
|
| 50 |
def retrieve_candidates(query: str, n_candidates: int):
|
|
|
|
|
|
|
| 51 |
# Encode query
|
|
|
|
| 52 |
query_emb = encoder.encode([query]).cpu().numpy()
|
| 53 |
|
| 54 |
# Semantic similarities
|
|
|
|
| 55 |
desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]
|
| 56 |
|
| 57 |
# TF-IDF similarities
|
|
|
|
| 58 |
tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)
|
| 59 |
|
| 60 |
# Syntactic similarities
|
|
|
|
| 61 |
parsed_query = parser.parse_text(query)
|
| 62 |
parsed_query = parser.subtree_set(parsed_query)
|
| 63 |
|
| 64 |
syn_sims = []
|
| 65 |
-
for trees_list in data["syntactic_tree"]:
|
| 66 |
review_sims = []
|
| 67 |
for review_tree_subs in trees_list:
|
| 68 |
if review_tree_subs is None:
|
|
@@ -78,10 +86,13 @@ def retrieve_candidates(query: str, n_candidates: int):
|
|
| 78 |
# Get top N candidates for Stage 2 reranking
|
| 79 |
candidates_idx = np.argsort(combined_stage1_scores)[-n_candidates:][::-1]
|
| 80 |
|
|
|
|
|
|
|
| 81 |
return candidates_idx
|
| 82 |
|
| 83 |
|
| 84 |
def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None) -> list:
|
|
|
|
| 85 |
|
| 86 |
# Get popularity scores for stage 1 candidates
|
| 87 |
rerank_scores = data.loc[candidates_idx, "pop_score"].values
|
|
@@ -93,6 +104,12 @@ def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None)
|
|
| 93 |
# Get restaurant_id for final recommendations
|
| 94 |
restaurant_ids = data.loc[topN_reranked_global_idx, "id"].tolist()
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
return restaurant_ids
|
| 97 |
|
| 98 |
def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30, data_source: str = None):
|
|
|
|
| 4 |
import benepar
|
| 5 |
import pandas as pd
|
| 6 |
import numpy as np
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
|
| 10 |
from utils.clean_text import clean_text
|
|
|
|
| 31 |
restaurant_by_source = json.load(f)
|
| 32 |
|
| 33 |
# Load precomputed TF-IDF features
|
| 34 |
+
# restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")
|
| 35 |
+
|
| 36 |
+
print("Computing TFIDF")
|
| 37 |
+
tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=False)
|
| 38 |
+
restaurant_tfidf_features = tfidf_vectorizer.compute_tfidf_matrix(data["review_text_clean"])
|
| 39 |
|
| 40 |
# Extract embeddings
|
| 41 |
data["embedding"] = data["embedding"].apply(
|
|
|
|
| 49 |
# Initialize syntactic parser
|
| 50 |
parser = Parser()
|
| 51 |
|
|
|
|
|
|
|
|
|
|
| 52 |
def retrieve_candidates(query: str, n_candidates: int):
|
| 53 |
+
print(f"Retrieving {n_candidates} candidates...")
|
| 54 |
+
|
| 55 |
# Encode query
|
| 56 |
+
print("[RETRIEVAL] Encoding query")
|
| 57 |
query_emb = encoder.encode([query]).cpu().numpy()
|
| 58 |
|
| 59 |
# Semantic similarities
|
| 60 |
+
print("[RETRIEVAL] Computing semantic similarities")
|
| 61 |
desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]
|
| 62 |
|
| 63 |
# TF-IDF similarities
|
| 64 |
+
print("[RETRIEVAL] Computing TF-IDF")
|
| 65 |
tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)
|
| 66 |
|
| 67 |
# Syntactic similarities
|
| 68 |
+
print("[RETRIEVAL] Computing syntactic similarities")
|
| 69 |
parsed_query = parser.parse_text(query)
|
| 70 |
parsed_query = parser.subtree_set(parsed_query)
|
| 71 |
|
| 72 |
syn_sims = []
|
| 73 |
+
for trees_list in tqdm(data["syntactic_tree"], total=len(data), desc="[RETRIEVAL] Computing syntactic similarities"):
|
| 74 |
review_sims = []
|
| 75 |
for review_tree_subs in trees_list:
|
| 76 |
if review_tree_subs is None:
|
|
|
|
| 86 |
# Get top N candidates for Stage 2 reranking
|
| 87 |
candidates_idx = np.argsort(combined_stage1_scores)[-n_candidates:][::-1]
|
| 88 |
|
| 89 |
+
print(f"[RETRIEVAL] Results: {candidates_idx}")
|
| 90 |
+
|
| 91 |
return candidates_idx
|
| 92 |
|
| 93 |
|
| 94 |
def rerank(candidates_idx: np.ndarray, n_rec: int = 10, data_source: str = None) -> list:
|
| 95 |
+
print("Reranking...")
|
| 96 |
|
| 97 |
# Get popularity scores for stage 1 candidates
|
| 98 |
rerank_scores = data.loc[candidates_idx, "pop_score"].values
|
|
|
|
| 104 |
# Get restaurant_id for final recommendations
|
| 105 |
restaurant_ids = data.loc[topN_reranked_global_idx, "id"].tolist()
|
| 106 |
|
| 107 |
+
# Filter to only data_source
|
| 108 |
+
print(f"[RERANK] Filtering to only source - {data_source}")
|
| 109 |
+
restaurant_by_source_set = set(restaurant_by_source[data_source])
|
| 110 |
+
restaurant_ids = [x for x in restaurant_ids if x in restaurant_by_source_set]
|
| 111 |
+
|
| 112 |
+
print(f"[RERANK] Final recommendations: {restaurant_ids}")
|
| 113 |
return restaurant_ids
|
| 114 |
|
| 115 |
def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30, data_source: str = None):
|