badminton001 committed on
Commit
3cfe3ca
·
verified ·
1 Parent(s): 70722a2

Update retrieval/retrieve_movies_50000.py

Browse files
Files changed (1) hide show
  1. retrieval/retrieve_movies_50000.py +219 -222
retrieval/retrieve_movies_50000.py CHANGED
@@ -1,222 +1,219 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import json
5
- import pickle
6
- import numpy as np
7
- import faiss
8
- from pathlib import Path
9
- from sentence_transformers import SentenceTransformer
10
- from scipy.sparse import load_npz
11
- from typing import List, Dict, Any, Optional, Tuple
12
-
13
- # Import custom utility functions
14
- # Ensure utils/query_parser.py is the latest version for accurate tag extraction
15
- from utils.query_parser import parse_user_query
16
- from utils.movies_explanation import generate_explanation
17
-
18
- # ── Path Configurations ──────────────────────────────────────────────
19
- # Define the root directory of the project (one level up from 'retrieval' folder)
20
- ROOT = Path(__file__).parent.parent
21
- # Path to vectorized data (TF-IDF matrix, SBERT embeddings, etc.)
22
- VEC_DIR = ROOT / "data" / "movie" / "vectorized"
23
- # Path to preprocessed movie records (updated for 50,000 records)
24
- PREPROCESSED_DATA_PATH = ROOT / "data" / "movie" / "preprocessed" / "movies_preprocessed_50000.json"
25
-
26
- # ── Load Preprocessed Data ───────────────────────────────────────────
27
- movie_records: List[Dict[str, Any]] = []
28
- try:
29
- with open(PREPROCESSED_DATA_PATH, encoding="utf-8") as f:
30
- movie_records = json.load(f)
31
- print(f"Loaded {len(movie_records)} movie records.")
32
- except FileNotFoundError:
33
- print(f"Error: Preprocessed movie data not found at {PREPROCESSED_DATA_PATH}")
34
- except json.JSONDecodeError:
35
- print(f"Error: Could not decode JSON from {PREPROCESSED_DATA_PATH}")
36
-
37
- # ── Load TF-IDF Index and Vectorizer ─────────────────────────────────
38
- tfidf_vectorizer = None
39
- tfidf_matrix = np.array([])
40
- try:
41
- # Updated TF-IDF asset filenames for 50,000 records
42
- tfidf_vectorizer = pickle.load(open(VEC_DIR / "movies_tfidf_vectorizer_50000.pkl", "rb"))
43
- tfidf_matrix = load_npz(VEC_DIR / "movies_tfidf_matrix_50000.npz").toarray().astype("float32")
44
- faiss.normalize_L2(tfidf_matrix)
45
- print("TF-IDF assets loaded.")
46
- except (FileNotFoundError, pickle.UnpicklingError, ValueError) as e:
47
- print(f"Error loading TF-IDF assets: {e}")
48
-
49
- # ── Load SBERT Index and Model ───────────────────────────────────────
50
- sbert_embeddings = np.array([])
51
- sbert_model = None
52
- try:
53
- # Updated SBERT asset filenames for 50,000 records
54
- sbert_embeddings = np.array(pickle.load(open(VEC_DIR / "movies_sbert_embeddings_50000.pkl", "rb"))).astype("float32")
55
- sbert_model_name = open(VEC_DIR / "movies_sbert_model_50000.txt").read().strip()
56
- sbert_model = SentenceTransformer(sbert_model_name)
57
- print("SBERT assets loaded.")
58
- except (FileNotFoundError, pickle.UnpicklingError, OSError) as e:
59
- print(f"Error loading SBERT assets: {e}")
60
-
61
-
62
- # ── Main Recommendation Function ─────────────────────────────────────
63
- def get_recommendations(
64
- query: str,
65
- top_k: int = 5,
66
- method: str = "sbert",
67
- parsed_query_tags: Optional[Dict[str, Any]] = None # Parameter for parsed tags
68
- ) -> List[Dict[str, Any]]:
69
- """
70
- Retrieves movie recommendations based on user query, with enhanced filtering and re-ranking.
71
-
72
- Args:
73
- query (str): The user's input query.
74
- top_k (int): Number of top recommendations to return. Defaults to 5.
75
- method (str): The retrieval method to use ("sbert" for semantic, "tfidf" for keyword-based).
76
- parsed_query_tags (Optional[Dict[str, Any]]): Dictionary of parsed query tags (from query_parser.py).
77
-
78
- Returns:
79
- list: A list of dictionaries, where each dictionary represents a recommended movie
80
- and includes its details, score, and an explanation.
81
- """
82
- if not movie_records:
83
- print("Warning: Movie records not loaded. Returning empty list.")
84
- return []
85
-
86
- # Parse query if tags are not already provided (e.g., direct call from an external script)
87
- if parsed_query_tags is None:
88
- parsed_query_tags = parse_user_query(query)
89
-
90
- # --- 1) Initial Candidate Selection (from full dataset) ---
91
- # Retrieve more candidates than requested top_k to allow for strict filtering
92
- CANDIDATE_MULTIPLIER = 20
93
- initial_search_k = top_k * CANDIDATE_MULTIPLIER
94
-
95
- hits: List[Tuple[int, float]] = [] # List of (original_index, similarity_score)
96
-
97
- if method == "tfidf" and tfidf_matrix.size > 0 and tfidf_vectorizer:
98
- query_vector = tfidf_vectorizer.transform([query]).toarray().astype("float32")
99
- faiss.normalize_L2(query_vector)
100
-
101
- faiss_idx_tfidf_full = faiss.IndexFlatIP(tfidf_matrix.shape[1])
102
- faiss_idx_tfidf_full.add(tfidf_matrix)
103
- distances, original_indices = faiss_idx_tfidf_full.search(query_vector, initial_search_k)
104
- hits = [(idx, float(distances[0][j])) for j, idx in enumerate(original_indices[0])]
105
-
106
- elif method == "sbert" and sbert_embeddings.size > 0 and sbert_model:
107
- query_vector = sbert_model.encode([query], convert_to_numpy=True).astype("float32")
108
-
109
- faiss_idx_sbert_full = faiss.IndexFlatL2(sbert_embeddings.shape[1])
110
- faiss_idx_sbert_full.add(sbert_embeddings)
111
- distances, original_indices = faiss_idx_sbert_full.search(query_vector, initial_search_k)
112
- # For L2 distance, smaller is better, so negate to make larger scores better for sorting
113
- hits = [(idx, -float(distances[0][j])) for j, idx in enumerate(original_indices[0])]
114
- else:
115
- print(f"Error: Invalid method '{method}' or required index/model is not available.")
116
- return []
117
-
118
- # --- 2) Filter and Re-rank based on parsed_query_tags ---
119
- filtered_and_scored_results: List[Dict[str, Any]] = []
120
-
121
- # Extract parsed query tags for easier access
122
- target_genres = set(parsed_query_tags.get("genres", []))
123
- target_moods = set(parsed_query_tags.get("mood", []))
124
- target_audience = parsed_query_tags.get("target_audience")
125
- target_era = parsed_query_tags.get("era")
126
- target_decade = parsed_query_tags.get("decade")
127
- specific_director = parsed_query_tags.get("specific_person") # Mapped to specific_person in parser
128
-
129
- # Define moods that should trigger a "hard exclusion" if the user implies negativity
130
- # This is a simple example; a more robust solution would involve sentiment analysis
131
- negative_exclusion_moods = {"sad", "dark", "grim", "bleak", "depressing", "gloomy", "somber", "disturbing", "heavy", "angry", "chilling"}
132
-
133
-
134
- for original_idx, base_score in hits:
135
- movie_data = movie_records[original_idx].copy()
136
- item_score = base_score # Start with the base similarity score from vector search
137
- is_suitable = True # Flag to mark if the movie meets all HARD filters
138
-
139
- # --- HARD FILTERS (If any of these conditions are not met, the item is excluded) ---
140
-
141
- # 1. Specific Director (Mandatory if requested)
142
- if specific_director:
143
- item_director = movie_data.get("director")
144
- # Check for existence and then case-insensitive partial match
145
- if not item_director or specific_director.lower() not in item_director.lower():
146
- is_suitable = False # Exclude if specific director is requested but not found
147
- else:
148
- item_score += 0.5 # High boost for an exact or strong director match
149
-
150
- # 2. Target Audience (Mandatory if requested)
151
- if target_audience:
152
- item_audience = movie_data.get("target_audience")
153
- # If item has an audience tag and it doesn't match the target, exclude
154
- if item_audience and item_audience != target_audience:
155
- is_suitable = False
156
-
157
- # 3. Era (Mandatory if requested and available in item data)
158
- if target_era:
159
- item_era = movie_data.get("era")
160
- # Convert both to lower for case-insensitive comparison
161
- if item_era and item_era.lower() != target_era.lower():
162
- is_suitable = False
163
-
164
- # 4. Decade (Mandatory if requested and able to be determined from item data)
165
- if target_decade:
166
- item_release_date = movie_data.get("release_date", "")
167
- if item_release_date and len(item_release_date) >= 4:
168
- item_year = int(item_release_date[:4]) # Extract year from release_date
169
- # Calculate the decade of the movie's release year
170
- item_decade_str = f"{(item_year // 10) * 10}s"
171
- if item_decade_str != target_decade:
172
- is_suitable = False
173
- else: # If no release date, it cannot match a specific decade, so exclude
174
- is_suitable = False
175
-
176
- # 5. Mood Exclusion (New Hard Filter): If user explicitly asks for a non-negative mood
177
- # and an item has a negative mood, exclude it.
178
- # For this example, we assume if ANY target mood is NOT in negative_exclusion_moods
179
- # AND the movie has a negative_exclusion_mood, we exclude.
180
- if target_moods and not any(m in negative_exclusion_moods for m in target_moods): # User wants a positive/neutral mood
181
- item_moods = set(movie_data.get("mood", []))
182
- if any(m in negative_exclusion_moods for m in item_moods): # Movie has a negative mood
183
- is_suitable = False # Exclude if user avoids negative moods and movie is negative
184
-
185
-
186
- # If any hard filter failed, this movie is not suitable, skip to the next candidate
187
- if not is_suitable:
188
- continue
189
-
190
- # --- SOFT FILTERS (These conditions boost the score but do not strictly exclude) ---
191
- # Only apply soft filters if the item passed all hard filters
192
-
193
- # 1. Genres: Boost score based on the number of overlapping genres
194
- if target_genres:
195
- item_genres = set(movie_data.get("genres", []))
196
- genre_matches = len(target_genres.intersection(item_genres))
197
- item_score += 0.1 * genre_matches # Small boost for each matching genre
198
-
199
- # 2. Moods: Boost score based on the number of overlapping moods (Increased weight for mood)
200
- if target_moods:
201
- item_moods = set(movie_data.get("mood", []))
202
- mood_matches = len(target_moods.intersection(item_moods))
203
- item_score += 0.2 * mood_matches # Increased boost for each matching mood, reflecting importance
204
-
205
- # Add the movie to results if it passed all hard filters
206
- # and include its calculated score
207
- movie_data["score"] = item_score
208
- filtered_and_scored_results.append(movie_data)
209
-
210
- # Sort the results by the final calculated score (higher score is better)
211
- # Using .get("score", -float('inf')) handles cases where 'score' might be missing (shouldn't happen here)
212
- filtered_and_scored_results.sort(key=lambda x: x.get("score", -float('inf')), reverse=True)
213
-
214
- # --- 3) Prepare final results ---
215
- # Take only the top_k results after filtering and re-ranking
216
- final_results = []
217
- for item in filtered_and_scored_results[:top_k]:
218
- # Generate a textual explanation for each recommendation
219
- item["explanation"] = generate_explanation(parsed_query_tags, item)
220
- final_results.append(item)
221
-
222
- return final_results
 
1
import json
import pickle
import numpy as np
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer
from scipy.sparse import load_npz
from typing import List, Dict, Any, Optional, Tuple

# Import custom utility functions
# Ensure utils/query_parser.py is the latest version for accurate tag extraction
from utils.query_parser import parse_user_query
from utils.movies_explanation import generate_explanation

# ── Path Configurations ──────────────────────────────────────────────
# Root directory of the project (one level up from the 'retrieval' folder)
ROOT = Path(__file__).parent.parent
# Vectorized data (TF-IDF matrix, SBERT embeddings, etc.)
VEC_DIR = ROOT / "data" / "movie" / "vectorized"
# Preprocessed movie records (updated for 50,000 records)
PREPROCESSED_DATA_PATH = ROOT / "data" / "movie" / "preprocessed" / "movies_preprocessed_50000.json"

# ── Load Preprocessed Data ───────────────────────────────────────────
movie_records: List[Dict[str, Any]] = []
try:
    with open(PREPROCESSED_DATA_PATH, encoding="utf-8") as f:
        movie_records = json.load(f)
    print(f"Loaded {len(movie_records)} movie records.")
except FileNotFoundError:
    print(f"Error: Preprocessed movie data not found at {PREPROCESSED_DATA_PATH}")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {PREPROCESSED_DATA_PATH}")

# ── Load TF-IDF Index and Vectorizer ─────────────────────────────────
tfidf_vectorizer = None
tfidf_matrix = np.array([])
try:
    # Updated TF-IDF asset filenames for 50,000 records.
    # Context managers guarantee the file handles are closed even when
    # unpickling fails (the previous pickle.load(open(...)) leaked them).
    with open(VEC_DIR / "movies_tfidf_vectorizer_50000.pkl", "rb") as f:
        tfidf_vectorizer = pickle.load(f)
    tfidf_matrix = load_npz(VEC_DIR / "movies_tfidf_matrix_50000.npz").toarray().astype("float32")
    faiss.normalize_L2(tfidf_matrix)
    print("TF-IDF assets loaded.")
except (FileNotFoundError, pickle.UnpicklingError, ValueError) as e:
    print(f"Error loading TF-IDF assets: {e}")

# ── Load SBERT Index and Model ───────────────────────────────────────
sbert_embeddings = np.array([])
sbert_model = None
try:
    # Updated SBERT asset filenames for 50,000 records
    with open(VEC_DIR / "movies_sbert_embeddings_50000.pkl", "rb") as f:
        sbert_embeddings = np.array(pickle.load(f)).astype("float32")
    with open(VEC_DIR / "movies_sbert_model_50000.txt", encoding="utf-8") as f:
        sbert_model_name = f.read().strip()
    sbert_model = SentenceTransformer(sbert_model_name)
    print("SBERT assets loaded.")
except (FileNotFoundError, pickle.UnpicklingError, OSError) as e:
    print(f"Error loading SBERT assets: {e}")
59
# ── Main Recommendation Function ─────────────────────────────────────
def get_recommendations(
    query: str,
    top_k: int = 5,
    method: str = "sbert",
    parsed_query_tags: Optional[Dict[str, Any]] = None  # Pre-parsed tags, if the caller already ran the parser
) -> List[Dict[str, Any]]:
    """
    Retrieve movie recommendations for a user query with hard filtering and re-ranking.

    Args:
        query (str): The user's input query.
        top_k (int): Number of top recommendations to return. Defaults to 5.
        method (str): Retrieval backend: "sbert" (semantic) or "tfidf" (keyword-based).
        parsed_query_tags (Optional[Dict[str, Any]]): Dictionary of parsed query
            tags (from query_parser.py); parsed from `query` when omitted.

    Returns:
        list: Up to `top_k` movie dicts, each augmented with a "score" and an
        "explanation" field, sorted by descending score.
    """
    if not movie_records:
        print("Warning: Movie records not loaded. Returning empty list.")
        return []

    # Parse query if tags are not already provided (e.g., direct external call)
    if parsed_query_tags is None:
        parsed_query_tags = parse_user_query(query)

    # --- 1) Initial Candidate Selection (from full dataset) ---
    # Over-fetch so that strict (hard) filtering still leaves enough candidates
    CANDIDATE_MULTIPLIER = 20
    initial_search_k = top_k * CANDIDATE_MULTIPLIER

    hits: List[Tuple[int, float]] = []  # List of (original_index, similarity_score)

    if method == "tfidf" and tfidf_matrix.size > 0 and tfidf_vectorizer:
        query_vector = tfidf_vectorizer.transform([query]).toarray().astype("float32")
        faiss.normalize_L2(query_vector)

        # NOTE(perf): the FAISS index is rebuilt on every call; consider
        # building it once at module load if this becomes a hot path.
        faiss_idx_tfidf_full = faiss.IndexFlatIP(tfidf_matrix.shape[1])
        faiss_idx_tfidf_full.add(tfidf_matrix)
        distances, original_indices = faiss_idx_tfidf_full.search(query_vector, initial_search_k)
        # FAISS pads with -1 when fewer than k vectors exist; a -1 index would
        # silently wrap to the LAST record via Python indexing, so skip it.
        hits = [(idx, float(distances[0][j]))
                for j, idx in enumerate(original_indices[0]) if idx >= 0]

    elif method == "sbert" and sbert_embeddings.size > 0 and sbert_model:
        query_vector = sbert_model.encode([query], convert_to_numpy=True).astype("float32")

        faiss_idx_sbert_full = faiss.IndexFlatL2(sbert_embeddings.shape[1])
        faiss_idx_sbert_full.add(sbert_embeddings)
        distances, original_indices = faiss_idx_sbert_full.search(query_vector, initial_search_k)
        # For L2 distance, smaller is better, so negate to make larger scores
        # better for the descending sort below. Skip -1 padding indices too.
        hits = [(idx, -float(distances[0][j]))
                for j, idx in enumerate(original_indices[0]) if idx >= 0]
    else:
        print(f"Error: Invalid method '{method}' or required index/model is not available.")
        return []

    # --- 2) Filter and Re-rank based on parsed_query_tags ---
    filtered_and_scored_results: List[Dict[str, Any]] = []

    # Extract parsed query tags for easier access
    target_genres = set(parsed_query_tags.get("genres", []))
    target_moods = set(parsed_query_tags.get("mood", []))
    target_audience = parsed_query_tags.get("target_audience")
    target_era = parsed_query_tags.get("era")
    target_decade = parsed_query_tags.get("decade")
    specific_director = parsed_query_tags.get("specific_person")  # Mapped to specific_person in parser

    # Moods that trigger a "hard exclusion" when the user implies positivity.
    # This is a simple example; a more robust solution would involve sentiment analysis.
    negative_exclusion_moods = {"sad", "dark", "grim", "bleak", "depressing", "gloomy", "somber", "disturbing", "heavy", "angry", "chilling"}

    for original_idx, base_score in hits:
        movie_data = movie_records[original_idx].copy()
        item_score = base_score  # Start with the vector-search similarity
        is_suitable = True  # Cleared by any failed HARD filter

        # --- HARD FILTERS (failing any one excludes the item) ---

        # 1. Specific Director (mandatory if requested); case-insensitive
        #    partial match against the item's director field.
        if specific_director:
            item_director = movie_data.get("director")
            if not item_director or specific_director.lower() not in item_director.lower():
                is_suitable = False  # Requested director not found
            else:
                item_score += 0.5  # High boost for a director match

        # 2. Target Audience (mandatory if requested and tagged on the item)
        if target_audience:
            item_audience = movie_data.get("target_audience")
            if item_audience and item_audience != target_audience:
                is_suitable = False

        # 3. Era (mandatory if requested; case-insensitive comparison)
        if target_era:
            item_era = movie_data.get("era")
            if item_era and item_era.lower() != target_era.lower():
                is_suitable = False

        # 4. Decade (mandatory if requested and derivable from release_date)
        if target_decade:
            item_release_date = movie_data.get("release_date", "")
            item_year = None
            if item_release_date and len(item_release_date) >= 4:
                try:
                    item_year = int(item_release_date[:4])  # Year prefix of release_date
                except ValueError:
                    item_year = None  # Malformed date: treat year as unknown
            if item_year is None:
                # No usable release year -> cannot match a specific decade
                is_suitable = False
            elif f"{(item_year // 10) * 10}s" != target_decade:
                is_suitable = False

        # 5. Mood Exclusion (hard filter): if the user asked only for
        #    non-negative moods, drop movies tagged with any negative mood.
        if target_moods and not any(m in negative_exclusion_moods for m in target_moods):  # User wants a positive/neutral mood
            item_moods = set(movie_data.get("mood", []))
            if any(m in negative_exclusion_moods for m in item_moods):  # Movie has a negative mood
                is_suitable = False

        # If any hard filter failed, skip to the next candidate
        if not is_suitable:
            continue

        # --- SOFT FILTERS (boost the score but never exclude) ---

        # 1. Genres: small boost per overlapping genre
        if target_genres:
            item_genres = set(movie_data.get("genres", []))
            item_score += 0.1 * len(target_genres.intersection(item_genres))

        # 2. Moods: larger boost per overlapping mood (mood weighted higher)
        if target_moods:
            item_moods = set(movie_data.get("mood", []))
            item_score += 0.2 * len(target_moods.intersection(item_moods))

        # Passed all hard filters: record with its calculated score
        movie_data["score"] = item_score
        filtered_and_scored_results.append(movie_data)

    # Sort by final score, highest first; .get guards a missing "score"
    # (shouldn't happen here, but keeps the sort total either way).
    filtered_and_scored_results.sort(key=lambda x: x.get("score", -float('inf')), reverse=True)

    # --- 3) Prepare final results ---
    # Take only the top_k results after filtering and re-ranking
    final_results = []
    for item in filtered_and_scored_results[:top_k]:
        # Generate a textual explanation for each recommendation
        item["explanation"] = generate_explanation(parsed_query_tags, item)
        final_results.append(item)

    return final_results