team-149-project / main.py
knguyen471's picture
Upload 11 files
888aba6 verified
raw
history blame
3.05 kB
import torch
import nltk
import benepar
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from utils.clean_text import clean_text
from utils.semantic_similarity import Encoder
from utils.syntactic_similarity import Parser
from utils.tfidf_similarity import TFIDF_Vectorizer
# Create new tensors on the GPU when one is present; otherwise stay on the CPU.
torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the NLTK resources and the benepar parser model used by the pipeline.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
benepar.download("benepar_en3_large")

# Toy dataset table. This module reads its "embedding", "syntactic_tree",
# "pop_score" and "id" columns (presumably one row per restaurant).
data = pd.read_csv("data/toy_data_aggregated_embeddings.csv")

# Precomputed TF-IDF features, handed to compute_tfidf_scores at query time.
# NOTE(review): np.load on a .npz file returns an NpzFile archive rather than
# a matrix -- confirm compute_tfidf_scores expects that object.
restaurant_tfidf_features = np.load("data/toy_data_tfidf_features.npz")

# Stack the per-row embeddings into one matrix for cosine similarity.
# NOTE(review): read_csv does not deserialize array-valued cells on its own --
# confirm the "embedding" column holds numeric vectors at this point.
all_desc_embeddings = np.vstack(data["embedding"].values)

# Query-side components: sentence encoder, syntactic parser, and the
# previously fitted TF-IDF vectorizer.
encoder = Encoder()
parser = Parser()
tfidf_vectorizer = TFIDF_Vectorizer(load_vectorizer=True)
def retrieve_candidates(query: str, n_candidates: int) -> np.ndarray:
    """Stage 1 retrieval: score every row of ``data`` against ``query``.

    The score is a fixed weighted blend of three signals:
    ``0.8 * semantic + 0.1 * syntactic + 0.1 * tfidf``.

    Args:
        query: Query text (``get_recommendations`` cleans it with
            ``clean_text`` before calling this function).
        n_candidates: Maximum number of rows to keep for Stage 2.

    Returns:
        Row positions of the best-scoring rows, highest score first.
        Empty when ``n_candidates`` is not positive.

    NOTE(review): assumes every ``syntactic_tree`` cell is an iterable of
    subtree sets (or ``None`` entries) -- confirm how that column is
    deserialized after ``read_csv``.
    """
    if n_candidates <= 0:
        # ``scores[-0:]`` would return *every* row instead of none.
        return np.array([], dtype=np.intp)

    # Semantic similarity between the query embedding and all row embeddings.
    query_emb = encoder.encode([query]).cpu().numpy()
    desc_sem_sim = cosine_similarity(query_emb, all_desc_embeddings)[0]

    # TF-IDF similarity against the precomputed features.
    tfidf_sim = tfidf_vectorizer.compute_tfidf_scores(query, restaurant_tfidf_features)

    # Syntactic similarity: mean over the parse-subtree sets stored per row.
    parsed_query = parser.subtree_set(parser.parse_text(query))
    syn_sims = []
    for trees_list in data["syntactic_tree"]:
        review_sims = [
            parser.compute_syntactic_similarity(
                parsed_query,
                # A missing parse is compared as an empty subtree set.
                review_tree_subs if review_tree_subs is not None else set(),
            )
            for review_tree_subs in trees_list
        ]
        # np.mean([]) is NaN; NaN would poison the combined score and, because
        # argsort places NaN last, push that row to the *top* of the ranking.
        syn_sims.append(np.mean(review_sims) if review_sims else 0.0)
    syn_sims = np.array(syn_sims)

    # Combined Stage 1 score.
    combined_stage1_scores = 0.8 * desc_sem_sim + 0.1 * syn_sims + 0.1 * tfidf_sim

    # argsort is ascending: take the tail and reverse it for best-first order.
    candidates_idx = np.argsort(combined_stage1_scores)[-n_candidates:][::-1]
    return candidates_idx
def rerank(candidates_idx: np.ndarray, n_rec: int = 10) -> list:
    """Stage 2 reranking: keep the ``n_rec`` candidates with the highest ``pop_score``.

    Args:
        candidates_idx: Row positions into ``data``, as produced by
            ``retrieve_candidates``.
        n_rec: Maximum number of recommendations to return.

    Returns:
        The ``id`` values of the selected rows, highest ``pop_score`` first.
        Empty when ``n_rec`` is not positive or there are no candidates.

    NOTE(review): rows whose ``pop_score`` is NaN sort last under argsort and
    would therefore be returned first -- confirm the column has no missing
    values.
    """
    candidates_idx = np.asarray(candidates_idx)
    if n_rec <= 0 or candidates_idx.size == 0:
        # ``scores[-0:]`` would return every candidate instead of none.
        return []

    # The candidates are argsort *positions*, so index positionally (iloc)
    # rather than by label (loc); the two coincide only for a default RangeIndex.
    rerank_scores = data["pop_score"].iloc[candidates_idx].to_numpy()

    # Positions within the candidate list, highest popularity first.
    topN_reranked_local_idx = np.argsort(rerank_scores)[-n_rec:][::-1]
    topN_reranked_global_idx = candidates_idx[topN_reranked_local_idx]

    # Map the selected row positions to their ids.
    return data["id"].iloc[topN_reranked_global_idx].tolist()
def get_recommendations(query: str, n_candidates: int = 100, n_rec: int = 30):
    """Run the two-stage pipeline for ``query`` and return the recommended ids.

    The query is cleaned with ``clean_text``, ``retrieve_candidates`` shortlists
    up to ``n_candidates`` rows, and ``rerank`` reduces that shortlist to at
    most ``n_rec`` ids.
    """
    shortlist = retrieve_candidates(clean_text(query), n_candidates)
    return rerank(shortlist, n_rec)