yigitcanozdemir
Refactor recommendation engine and similarity calculator: remove unnecessary print statements and update status messages for clarity.
a7dcca0
| import torch | |
| import numpy as np | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer | |
| from typing import List, Dict, Any | |
| import time | |
| from config import QUALITY_LEVELS | |
class SimilarityCalculator:
    """Rank candidate rows against theme queries using embedding similarity.

    The calculator encodes "positive" and (optionally) "negative" themes with
    the supplied sentence-embedding model, compares the resulting query vector
    with the pre-computed ``embedding`` column of a DataFrame, and blends that
    similarity with the ``finalScore`` / ``genreScore`` columns into a single
    hybrid ranking score.
    """

    def __init__(self, model: "SentenceTransformer"):
        """Store the embedding model used for encoding and similarity.

        Args:
            model: Object providing ``encode(..., convert_to_numpy=True)`` and
                ``similarity(a, b)``.
        """
        self.model = model

    def combined_and_score(self, similarity_matrix, alpha=10):
        """Return a smooth minimum of ``similarity_matrix`` along dim 0.

        Computes ``-logsumexp(-alpha * x, dim=0) / alpha``, a differentiable
        approximation of ``x.min(dim=0)``; larger ``alpha`` moves the result
        closer to the hard minimum.

        Args:
            similarity_matrix: Tensor whose first dimension is reduced.
            alpha: Sharpness of the soft minimum.

        Returns:
            Tensor with the first dimension reduced away.
        """
        smooth_min = -torch.logsumexp(-alpha * similarity_matrix, dim=0) / alpha
        return smooth_min

    def _mean_theme_embedding(self, themes) -> torch.Tensor:
        """Encode ``themes`` and collapse multiple rows into one mean vector.

        When the encoder returns more than one row the rows are averaged
        (keeping a leading dimension of 1); otherwise the encoded tensor is
        returned unchanged.
        """
        encoded = self.model.encode(themes, convert_to_numpy=True)
        embeddings = torch.tensor(encoded, dtype=torch.float32)
        if embeddings.dim() > 1 and embeddings.shape[0] > 1:
            return torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

    def calculate_similarity(
        self, features: Any, filtered_data: pd.DataFrame, top_k: int = 40
    ) -> Dict[str, Any]:
        """Score ``filtered_data`` against ``features`` and return the top rows.

        Args:
            features: Object exposing ``positive_themes``, ``negative_themes``
                and ``quality_level`` attributes.
            filtered_data: Candidate rows. Must contain an ``embedding`` column
                plus the descriptive columns copied into each result
                (``tconst``, ``primaryTitle``, ``titleType``, ...).
            top_k: Maximum number of results to return.

        Returns:
            A dict with ``status``, ``results`` (list of per-row dicts ordered
            by descending hybrid score), ``search_time`` in seconds and
            ``total_candidates``. Successful searches additionally carry
            ``query_embedding_shape``.
        """
        if filtered_data.empty:
            return {
                "status": "No results found with current filters.",
                "results": [],
                "search_time": 0,
                "total_candidates": 0,
            }

        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-search.
        start_time = time.perf_counter()

        positive_themes = features.positive_themes
        negative_themes = features.negative_themes

        avg_positive = self._mean_theme_embedding(positive_themes)

        document_embeddings = torch.tensor(
            np.array(filtered_data["embedding"].tolist()), dtype=torch.float32
        )

        if negative_themes is not None and len(negative_themes) > 0:
            avg_negative = self._mean_theme_embedding(negative_themes)
            # Push the query away from the negative themes, but with less
            # influence than the positive themes pull it towards them.
            positive_weight = 1.0
            negative_influence = 0.6
            combined_embedding = (positive_weight * avg_positive) - (
                negative_influence * avg_negative
            )
        else:
            combined_embedding = avg_positive

        similarities = self.model.similarity(combined_embedding, document_embeddings)
        similarities = similarities[0]

        quality_config = QUALITY_LEVELS.get(features.quality_level, {})
        rating_weight = quality_config.get("rating_weight")

        score_kwargs = {"similarity_weight": 1, "genre_weight": 0.3}
        # An unknown quality level (or a config entry without the key) yields
        # None; omit the argument so the scorer's own default applies instead
        # of failing on ``float + None``.
        if rating_weight is not None:
            score_kwargs["rating_weight"] = rating_weight

        hybrid_scores = self._calculate_hybrid_score(
            similarities, filtered_data, **score_kwargs
        )

        top_indices = (
            torch.topk(hybrid_scores, min(top_k, len(hybrid_scores)))
            .indices.cpu()
            .numpy()
        )

        results = []
        for idx in top_indices:
            row = filtered_data.iloc[idx]
            results.append(
                {
                    "tconst": row["tconst"],
                    "title": row["primaryTitle"],
                    "type": row["titleType"],
                    "year": row["startYear"],
                    "rating": row["averageRating"],
                    "runtimeMinutes": row.get("runtimeMinutes", None),
                    "votes": row["numVotes"],
                    "genres": row["genres"],
                    "overview": row["overview"],
                    "similarity_score": float(similarities[idx]),
                    "hybrid_score": float(hybrid_scores[idx]),
                    # Optional for scoring, so optional here too.
                    "final_score": row.get("finalScore", None),
                    "genre_score": row.get("genreScore", None),
                    "poster_url": row["poster_url"],
                    "country_of_origin": row["country_of_origin"],
                }
            )

        search_time = time.perf_counter() - start_time

        return {
            "status": "Search completed successfully.",
            "results": results,
            "search_time": search_time,
            "total_candidates": len(filtered_data),
            "query_embedding_shape": combined_embedding.shape,
        }

    def _calculate_hybrid_score(
        self,
        similarities: torch.Tensor,
        data: pd.DataFrame,
        similarity_weight: float = 0.8,
        rating_weight: float = 0.1,
        genre_weight: float = 0.2,
    ) -> torch.Tensor:
        """Blend similarity with rating and genre scores into one tensor.

        The result is the weighted mean of:

        * ``similarities`` as given (not rescaled),
        * ``data["finalScore"]`` min-max normalised to [0, 1],
        * ``data["genreScore"]`` as given.

        A term whose column is absent from ``data`` is left out of both the
        sum and the total weight, so with neither column present the result
        equals ``similarities``.

        Args:
            similarities: One similarity value per row of ``data``.
            data: Candidate rows, optionally holding ``finalScore`` and
                ``genreScore`` columns.
            similarity_weight: Weight of the similarity term.
            rating_weight: Weight of the normalised ``finalScore`` term.
            genre_weight: Weight of the ``genreScore`` term.

        Returns:
            Tensor of hybrid scores aligned with ``similarities``.
        """
        weighted_sum = similarity_weight * similarities
        total_weight = similarity_weight

        if "finalScore" in data.columns:
            final_scores = torch.tensor(data["finalScore"].values, dtype=torch.float32)
            lowest = final_scores.min()
            # The epsilon keeps the division finite when every score is equal.
            final_normalized = (final_scores - lowest) / (
                final_scores.max() - lowest + 1e-8
            )
            weighted_sum = weighted_sum + rating_weight * final_normalized
            total_weight = total_weight + rating_weight

        if "genreScore" in data.columns:
            genre_score = torch.tensor(data["genreScore"].values, dtype=torch.float32)
            weighted_sum = weighted_sum + genre_weight * genre_score
            total_weight = total_weight + genre_weight

        return weighted_sum / total_weight