File size: 7,029 Bytes
c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 8e4f8b3 d15ef89 8e4f8b3 d15ef89 8e4f8b3 d15ef89 8e4f8b3 d15ef89 8e4f8b3 d15ef89 8e4f8b3 c6769c2 8e4f8b3 d15ef89 8e4f8b3 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 c6769c2 d15ef89 b42b068 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 | import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle
import os
class MovieRecommender:
    """Semantic movie recommender.

    Embeds a free-text "soup" per movie with a SentenceTransformer, indexes
    the unit-normalized vectors in a FAISS ``IndexFlatL2`` (L2 on unit
    vectors is monotone in cosine similarity), and applies keyword
    "guardrails" that ban genres clashing with the query's mood
    (e.g. no Horror for a "happy" query).
    """

    # How many candidates to over-fetch so results survive ban/dedup filtering.
    _OVERFETCH = 25

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        # 1. Load the embedding model ("the brain").
        self.encoder = SentenceTransformer(model_name)
        # Derive the vector dimension from the model instead of hard-coding
        # 384, so swapping in a non-MiniLM model still builds a valid index.
        # (For the default all-MiniLM-L6-v2 this is still 384.)
        self.d = self.encoder.get_sentence_embedding_dimension()
        # 2. Initialize empty state so methods don't crash before load().
        self.index = None          # FAISS index, built lazily on first add
        self.df = pd.DataFrame()   # metadata; row position i <-> FAISS id i

    def save(self, path='models/'):
        """Persist the FAISS index and metadata DataFrame under *path*."""
        os.makedirs(path, exist_ok=True)  # idempotent; no exists() race
        if self.index is not None:
            faiss.write_index(self.index, os.path.join(path, 'movie_index.faiss'))
        with open(os.path.join(path, 'metadata.pkl'), 'wb') as f:
            pickle.dump(self.df, f)

    def load(self, path='models/'):
        """Restore state written by save(); missing files are skipped."""
        index_path = os.path.join(path, 'movie_index.faiss')
        meta_path = os.path.join(path, 'metadata.pkl')
        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)
        if os.path.exists(meta_path):
            # SECURITY: pickle.load executes arbitrary code on malicious
            # input — only ever load files this application wrote itself.
            with open(meta_path, 'rb') as f:
                self.df = pickle.load(f)

    def add_new_movie(self, movie_data):
        """Embed one movie dict (must contain 'soup') and append it to both
        the FAISS index and the metadata DataFrame (used during ingest)."""
        # 1. Vectorize and normalize so L2 distance tracks cosine similarity.
        vector = self.encoder.encode([movie_data['soup']])
        faiss.normalize_L2(vector)
        # 2. Add to the index, building it lazily on first use.
        if self.index is None:
            self.index = faiss.IndexFlatL2(self.d)
        self.index.add(vector)
        # 3. Mirror the metadata row; concat onto an empty frame warns and
        # can mangle dtypes, so the first insert is special-cased.
        new_row = pd.DataFrame([movie_data])
        if self.df.empty:
            self.df = new_row
        else:
            self.df = pd.concat([self.df, new_row], ignore_index=True)

    def get_banned_genres(self, query_text):
        """Return a sorted list of genres to BAN based on the query's vibe.

        Matching is done on whole words, not substrings: the substring test
        used previously mis-fired on words like "slaughterhouse" (contains
        "laugh") and "kidnapping" (contains "kid"). Genres the user names
        explicitly are immune and never banned. Output is sorted so callers
        get a deterministic list (set iteration order is not).
        """
        import re  # local import: only needed here

        query_lower = query_text.lower()
        # Whole-word tokens; plural forms are listed explicitly below to
        # keep the matches the old substring test got right (kids, laughs).
        words = set(re.findall(r"[a-z]+", query_lower))
        banned_genres = set()

        # 1. HAPPY / COMEDY MODE
        if words & {"happy", "uplifting", "comedy", "laugh", "laughs",
                    "cheerful", "funny"}:
            banned_genres.update(["Horror", "Thriller", "War", "Crime", "Tragedy"])

        # 2. FAMILY / KIDS MODE
        if words & {"kid", "kids", "child", "children", "animation",
                    "disney", "pixar"}:
            banned_genres.update(["Horror", "Crime", "War", "Romance", "Adult"])
        # "family" alone is ambiguous ("Crime Family"), so it only triggers
        # family mode when "crime" is absent from the query.
        elif "family" in words and "crime" not in words:
            banned_genres.update(["Horror", "Crime", "War", "Romance", "Adult"])

        # 3. ROMANCE MODE
        if "romantic" in words or "romance" in words:
            banned_genres.update(["Horror"])

        # --- 🛡️ THE IMMUNITY RULE 🛡️ ---
        # A genre the user explicitly asked for (e.g. "Crime Drama ...") is
        # immune: remove it from the ban list.
        return sorted(ban for ban in banned_genres if ban.lower() not in words)

    def recommend(self, text_query, k=10):
        """Top-*k* semantic matches for *text_query*, with genre guardrails.

        Returns a list of ``{'id', 'title', 'score'}`` dicts. ``score`` is
        the L2 distance between unit vectors, so SMALLER means more similar.
        """
        print(f"🔎 Searching for: '{text_query}'")
        if self.df.empty or self.index is None:
            return []

        # 1. Embed the query on the same unit sphere as the corpus.
        query_vector = self.encoder.encode([text_query])
        faiss.normalize_L2(query_vector)

        # 2. OVER-FETCH: grab spare candidates so bans/dedup still leave k.
        # (A fixed 25 previously capped any request with k > 25.)
        fetch_k = max(self._OVERFETCH, 2 * k)
        distances, indices = self.index.search(query_vector, fetch_k)

        # 3. IDENTIFY BANS
        banned_genres = self.get_banned_genres(text_query)
        if banned_genres:
            print(f"🛡️ Guardrails Active! Banning: {banned_genres}")

        results = []
        seen_titles = set()
        for i, idx in enumerate(indices[0]):
            # FAISS pads with -1 when the index holds fewer than fetch_k
            # vectors; the len() check guards an index/df desync.
            if idx == -1 or idx >= len(self.df):
                continue
            movie_data = self.df.iloc[idx].to_dict()

            # 4. FILTER: drop candidates whose soup mentions a banned genre.
            movie_soup = movie_data.get('soup', '').lower()
            is_banned = False
            for ban in banned_genres:
                if ban.lower() in movie_soup:
                    print(f"🚫 Blocking '{movie_data['title']}' (Contains {ban})")
                    is_banned = True
                    break
            if is_banned:
                continue

            # Deduplicate by title (the corpus may hold repeated entries).
            if movie_data['title'] in seen_titles:
                continue
            results.append({
                'id': int(movie_data['id']),
                'title': movie_data['title'],
                'score': float(distances[0][i]),
            })
            seen_titles.add(movie_data['title'])
            if len(results) >= k:
                break
        return results

    def recommend_on_text(self, text_query, k=10):
        """Wrapper for the main recommend function."""
        return self.recommend(text_query, k)

    def recommend_for_user(self, liked_movie_titles, k=10):
        """Personalized picks: average the embeddings of liked movies and
        search near that centroid. Returns the same dict shape as recommend()
        (no guardrails/dedup applied here)."""
        # Guard the index too — it was previously unchecked and crashed on
        # a fresh instance whose index had not been built or loaded yet.
        if self.df.empty or self.index is None:
            return []

        vectors = []
        for title in liked_movie_titles:
            # regex=False: titles such as "Mission: Impossible (1996)"
            # contain regex metacharacters and would otherwise mismatch
            # or raise inside str.contains.
            movie_row = self.df[self.df['title'].str.contains(
                title, case=False, na=False, regex=False)]
            if not movie_row.empty:
                vectors.append(self.encoder.encode(movie_row.iloc[0]['soup']))
        if not vectors:
            return []

        # Average into a single taste vector; FAISS requires contiguous
        # float32, which astype guarantees regardless of encoder dtype.
        user_vector = np.mean(vectors, axis=0).astype('float32').reshape(1, -1)
        faiss.normalize_L2(user_vector)
        distances, indices = self.index.search(user_vector, k)

        results = []
        for i, idx in enumerate(indices[0]):
            if idx != -1 and idx < len(self.df):
                movie_data = self.df.iloc[idx]
                results.append({
                    'id': int(movie_data['id']),
                    'title': movie_data['title'],
                    'score': float(distances[0][i]),
                })
        return results
|