import torch
import nltk
import benepar
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from utils.clean_text import clean_text
from utils.semantic_similarity import Encoder
from utils.syntactic_similarity import Parser
from utils.tfidf_similarity import TFIDF_Vectorizer

# Fixed weights used to blend the three Stage 1 similarity signals.
_SEMANTIC_WEIGHT = 0.8
_SYNTACTIC_WEIGHT = 0.1
_TFIDF_WEIGHT = 0.1

# Set default device to CUDA if available, otherwise CPU
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")

# Download models/data
nltk.download('punkt')
nltk.download('punkt_tab')
benepar.download('benepar_en3_large')

# Load dataset
# NOTE(review): read_csv yields plain strings for non-scalar cells. The code
# below assumes "embedding" rows are array-like and "syntactic_tree" rows are
# iterables of subtree sets -- confirm how these columns are deserialized.
data = pd.read_csv("data/toy_data_aggregated_embeddings.csv")

# Load precomputed TF-IDF features
# NOTE(review): np.load on a .npz returns an archive object, which is passed
# as-is to the vectorizer -- confirm that is what compute_tfidf_scores expects.
restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")

# Extract embeddings (one row per dataset row)
all_desc_embeddings = np.vstack(data["embedding"].values)

# Initialize encoder
encoder = Encoder()

# Initialize syntactic parser
parser = Parser()

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=True)


def _top_k_desc(scores: np.ndarray, k: int) -> np.ndarray:
    """Return the positions of the ``k`` highest scores, best first.

    A non-positive ``k`` yields an empty index array. Slicing with ``[-0:]``
    would otherwise return every position instead of none.
    """
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    return np.argsort(scores)[-k:][::-1]


def _mean_syntactic_similarity(query_subtrees, review_subtrees_list) -> float:
    """Average the query's syntactic similarity over one restaurant's reviews.

    A review whose subtree set is ``None`` is compared as an empty set.
    A restaurant with no reviews scores 0.0: ``np.mean([])`` is NaN, and
    ``np.argsort`` places NaN last, which would rank it as the best match.
    """
    sims = [
        parser.compute_syntactic_similarity(
            query_subtrees, set() if subtrees is None else subtrees
        )
        for subtrees in review_subtrees_list
    ]
    if not sims:
        return 0.0
    return np.mean(sims)


def retrieve_candidates(query: str, n_candidates: int) -> np.ndarray:
    """Stage 1: pick the rows of ``data`` that best match ``query``.

    The score of each row is a weighted sum of the semantic (0.8),
    syntactic (0.1) and TF-IDF (0.1) similarities to the query.

    Args:
        query: Query text (``get_recommendations`` passes it already cleaned).
        n_candidates: Maximum number of candidates to return.

    Returns:
        Row positions into ``data``, ordered from highest to lowest score.
        Empty when ``n_candidates`` is not positive.
    """
    # Encode query (the encoder output is moved to CPU and converted to NumPy)
    query_emb = encoder.encode([query]).cpu().numpy()

    # Semantic similarities: single query row against every stored embedding
    desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]

    # TF-IDF similarities
    tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(
        query, restaurant_tfidf_features
    )

    # Syntactic similarities: mean over the reviews of each restaurant
    query_subtrees = parser.subtree_set(parser.parse_text(query))
    syn_sims = np.array([
        _mean_syntactic_similarity(query_subtrees, trees_list)
        for trees_list in data["syntactic_tree"]
    ])

    # Combined Stage 1 score
    combined_stage1_scores = (
        _SEMANTIC_WEIGHT * desc_sem_sim
        + _SYNTACTIC_WEIGHT * syn_sims
        + _TFIDF_WEIGHT * tfidf_sim
    )

    # Get top N candidates for Stage 2 reranking
    return _top_k_desc(combined_stage1_scores, n_candidates)


def rerank(candidates_idx: np.ndarray, n_rec: int = 10) -> list:
    """Stage 2: reorder Stage 1 candidates by their ``pop_score``.

    Args:
        candidates_idx: Row positions into ``data`` (as produced by
            ``retrieve_candidates``).
        n_rec: Maximum number of recommendations to return.

    Returns:
        The ``id`` values of the selected rows, highest ``pop_score`` first.
        Empty when ``n_rec`` is not positive or there are no candidates.
    """
    candidates_idx = np.asarray(candidates_idx, dtype=np.intp)

    # The candidates are positions (they come from argsort), so read the rows
    # positionally rather than by index label.
    rerank_scores = data["pop_score"].iloc[candidates_idx].to_numpy()

    # Positions within the candidate list, then mapped back to rows of data
    top_local_idx = _top_k_desc(rerank_scores, n_rec)
    top_global_idx = candidates_idx[top_local_idx]

    # Get restaurant_id for final recommendations
    return data["id"].iloc[top_global_idx].tolist()


def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30) -> list:
    """Run the full pipeline: clean the query, retrieve candidates, rerank.

    Args:
        query: Raw query text; it is passed through ``clean_text`` first.
        n_candidates: Number of Stage 1 candidates handed to the reranker.
        n_rec: Number of ids to return (defaults to 30 here, whereas
            ``rerank`` on its own defaults to 10).

    Returns:
        The ``id`` values of the recommended rows, best first.
    """
    query_clean = clean_text(query)
    candidates_idx = retrieve_candidates(query_clean, n_candidates)
    return rerank(candidates_idx, n_rec)