import pandas as pd import numpy as np import faiss from sentence_transformers import SentenceTransformer import pickle import os class MovieRecommender: def __init__(self, model_name='all-MiniLM-L6-v2'): # 1. Load the Brain self.encoder = SentenceTransformer(model_name) self.d = 384 # Dimension for MiniLM # 2. Initialize Memory (Safety First) # We initialize them as empty so the code doesn't crash if accessed before loading self.index = None self.df = pd.DataFrame() # Replaces 'self.movies' def save(self, path='models/'): """Saves the index and metadata to disk.""" if not os.path.exists(path): os.makedirs(path) # Save FAISS Index if self.index is not None: faiss.write_index(self.index, os.path.join(path, 'movie_index.faiss')) # Save Metadata DataFrame with open(os.path.join(path, 'metadata.pkl'), 'wb') as f: pickle.dump(self.df, f) def load(self, path='models/'): """Loads the brain from disk.""" index_path = os.path.join(path, 'movie_index.faiss') meta_path = os.path.join(path, 'metadata.pkl') if os.path.exists(index_path): self.index = faiss.read_index(index_path) if os.path.exists(meta_path): with open(meta_path, 'rb') as f: self.df = pickle.load(f) def add_new_movie(self, movie_data): """Adds a single movie to the memory (used during ingest).""" # 1. Vectorize vector = self.encoder.encode([movie_data['soup']]) faiss.normalize_L2(vector) # 2. Add to Index if self.index is None: self.index = faiss.IndexFlatL2(self.d) self.index.add(vector) # 3. Add to DataFrame new_row = pd.DataFrame([movie_data]) if self.df.empty: self.df = new_row else: self.df = pd.concat([self.df, new_row], ignore_index=True) def get_banned_genres(self, query_text): """ Returns a list of genres to BAN based on the user's vibe. """ query_lower = query_text.lower() banned_genres = set() # Use a set to prevent duplicates # 1. HAPPY / COMEDY MODE if any(w in query_lower for w in ["happy", "uplifting", "comedy", "laugh", "cheerful", "funny"]): banned_genres.update(["Horror", "Thriller", "War", "Crime", "Tragedy"]) # 2. FAMILY / KIDS MODE # The trigger "family" is dangerous because of "Crime Family", so we check context. if any(w in query_lower for w in ["kid", "child", "animation", "disney", "pixar"]): banned_genres.update(["Horror", "Crime", "War", "Romance", "Adult"]) # Special check for "Family" to avoid the "Crime Family" bug # We only trigger Family mode if "Crime" is NOT in the query. elif "family" in query_lower and "crime" not in query_lower: banned_genres.update(["Horror", "Crime", "War", "Romance", "Adult"]) # 3. ROMANCE MODE if "romantic" in query_lower or "romance" in query_lower: banned_genres.update(["Horror"]) # --- 🛡️ THE IMMUNITY RULE 🛡️ --- # If the user's query explicitly mentions a genre (e.g. "Crime Drama..."), # then that genre is IMMUNE. We must REMOVE it from the ban list. final_bans = [] for ban in banned_genres: # If the banned genre is actually IN the query text, allow it. if ban.lower() in query_lower: continue final_bans.append(ban) return final_bans def recommend(self, text_query, k=10): """ Smart Recommendation with Guardrails """ print(f"🔎 Searching for: '{text_query}'") if self.df.empty or self.index is None: return [] # 1. Get user vector query_vector = self.encoder.encode([text_query]) faiss.normalize_L2(query_vector) # 2. OVER-FETCH: Ask for 20 candidates (so we have spares if we delete some) distances, indices = self.index.search(query_vector, k=25) # 3. IDENTIFY BANS banned_genres = self.get_banned_genres(text_query) if banned_genres: print(f"🛡️ Guardrails Active! Banning: {banned_genres}") results = [] seen_titles = set() for i, idx in enumerate(indices[0]): if idx == -1 or idx >= len(self.df): continue # --- SAFE ACCESS --- movie_data = self.df.iloc[idx].to_dict() # --- 4. FILTER LOGIC --- movie_soup = movie_data.get('soup', '').lower() is_banned = False for ban in banned_genres: if ban.lower() in movie_soup: print(f"🚫 Blocking '{movie_data['title']}' (Contains {ban})") is_banned = True break if is_banned: continue # ----------------------- # Deduplication if movie_data['title'] in seen_titles: continue results.append({ 'id': int(movie_data['id']), 'title': movie_data['title'], 'score': float(distances[0][i]), }) seen_titles.add(movie_data['title']) if len(results) >= k: break return results def recommend_on_text(self, text_query, k=10): """Wrapper for the main recommend function.""" return self.recommend(text_query, k) def recommend_for_user(self, liked_movie_titles, k=10): """Personalized Logic based on liked movies.""" if self.df.empty: return [] vectors = [] for title in liked_movie_titles: # Search in self.df movie_row = self.df[self.df['title'].str.contains(title, case=False, na=False)] if not movie_row.empty: soup = movie_row.iloc[0]['soup'] vectors.append(self.encoder.encode(soup)) if not vectors: return [] # Average the vectors user_vector = np.mean(vectors, axis=0) # Search using the user vector (reuse search logic manually here) user_vector = user_vector.reshape(1, -1) faiss.normalize_L2(user_vector) distances, indices = self.index.search(user_vector, k) results = [] for i, idx in enumerate(indices[0]): if idx != -1 and idx < len(self.df): movie_data = self.df.iloc[idx] results.append({ 'id': int(movie_data['id']), 'title': movie_data['title'], 'score': float(distances[0][i]) }) return results