File size: 7,029 Bytes
c6769c2
 
 
 
 
 
 
 
 
d15ef89
c6769c2
d15ef89
 
 
 
c6769c2
d15ef89
c6769c2
d15ef89
 
 
 
 
 
 
 
c6769c2
d15ef89
 
 
 
 
 
 
 
 
 
 
c6769c2
d15ef89
 
 
 
 
 
 
 
 
c6769c2
d15ef89
 
 
 
c6769c2
d15ef89
 
 
 
 
 
 
 
8e4f8b3
 
 
d15ef89
8e4f8b3
 
 
d15ef89
8e4f8b3
d15ef89
8e4f8b3
 
 
 
 
 
 
 
 
d15ef89
8e4f8b3
d15ef89
8e4f8b3
c6769c2
8e4f8b3
 
 
 
 
 
 
 
 
 
d15ef89
8e4f8b3
 
d15ef89
c6769c2
d15ef89
c6769c2
d15ef89
c6769c2
d15ef89
 
 
 
 
 
 
 
 
c6769c2
d15ef89
 
 
 
 
c6769c2
d15ef89
 
c6769c2
d15ef89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6769c2
d15ef89
 
c6769c2
d15ef89
 
 
 
 
 
 
 
 
c6769c2
d15ef89
 
 
 
 
 
 
 
 
c6769c2
 
 
d15ef89
 
c6769c2
 
 
 
 
d15ef89
c6769c2
d15ef89
c6769c2
 
d15ef89
 
 
c6769c2
d15ef89
c6769c2
d15ef89
 
 
 
 
 
 
 
 
b42b068
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle
import os

class MovieRecommender:
    """Semantic movie recommender backed by a FAISS vector index.

    Movie "soup" text is embedded with a SentenceTransformer; the vectors
    live in a FAISS ``IndexFlatL2`` and the matching metadata rows live in
    a pandas DataFrame. Row ``i`` of the DataFrame corresponds to vector
    ``i`` in the index, so the two must always be grown together
    (see :meth:`add_new_movie`).
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        # 1. Load the embedding model ("the brain").
        self.encoder = SentenceTransformer(model_name)
        self.d = 384  # Embedding dimension for MiniLM models.

        # 2. Initialize memory as empty so every method is safe to call
        #    before load()/ingest has run (they check for this state).
        self.index = None
        self.df = pd.DataFrame()  # Metadata rows, aligned with the index.

    def save(self, path='models/'):
        """Persist the FAISS index and metadata DataFrame under *path*.

        The index is skipped when nothing has been ingested yet; the
        metadata pickle is always written (possibly an empty DataFrame).
        """
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(path, exist_ok=True)

        # Save FAISS index (only if one was ever built/loaded).
        if self.index is not None:
            faiss.write_index(self.index, os.path.join(path, 'movie_index.faiss'))

        # Save metadata DataFrame.
        with open(os.path.join(path, 'metadata.pkl'), 'wb') as f:
            pickle.dump(self.df, f)

    def load(self, path='models/'):
        """Load the index and metadata from *path*; missing files are skipped."""
        index_path = os.path.join(path, 'movie_index.faiss')
        meta_path = os.path.join(path, 'metadata.pkl')

        if os.path.exists(index_path):
            self.index = faiss.read_index(index_path)

        if os.path.exists(meta_path):
            # NOTE(review): pickle.load executes arbitrary code if the file
            # is untrusted — only load files written by save() on this host.
            with open(meta_path, 'rb') as f:
                self.df = pickle.load(f)

    def add_new_movie(self, movie_data):
        """Add a single movie dict (must contain a 'soup' key) to memory.

        Keeps the FAISS index and the metadata DataFrame in lockstep:
        the new vector's position equals the new DataFrame row's position.
        """
        # 1. Vectorize and L2-normalize so L2 distance ranks like cosine.
        vector = self.encoder.encode([movie_data['soup']])
        faiss.normalize_L2(vector)

        # 2. Lazily create the index on first insert.
        if self.index is None:
            self.index = faiss.IndexFlatL2(self.d)
        self.index.add(vector)

        # 3. Append the metadata row, preserving row/vector alignment.
        new_row = pd.DataFrame([movie_data])
        if self.df.empty:
            self.df = new_row
        else:
            self.df = pd.concat([self.df, new_row], ignore_index=True)

    def get_banned_genres(self, query_text):
        """Return a list of genre names to BAN based on the query's vibe.

        A genre explicitly named in the query is "immune" and is removed
        from the ban list (so "crime comedy" never bans Crime).
        """
        query_lower = query_text.lower()
        banned_genres = set()  # Set prevents duplicate bans across modes.

        # 1. HAPPY / COMEDY MODE
        if any(w in query_lower for w in ["happy", "uplifting", "comedy", "laugh", "cheerful", "funny"]):
            banned_genres.update(["Horror", "Thriller", "War", "Crime", "Tragedy"])

        # 2. FAMILY / KIDS MODE
        # The bare trigger "family" is dangerous ("Crime Family"), so the
        # kid-word check runs first and "family" only fires with context.
        if any(w in query_lower for w in ["kid", "child", "animation", "disney", "pixar"]):
            banned_genres.update(["Horror", "Crime", "War", "Romance", "Adult"])
        # Only trigger Family mode when "crime" is NOT in the query,
        # avoiding the "Crime Family" false positive.
        elif "family" in query_lower and "crime" not in query_lower:
            banned_genres.update(["Horror", "Crime", "War", "Romance", "Adult"])

        # 3. ROMANCE MODE
        if "romantic" in query_lower or "romance" in query_lower:
            banned_genres.update(["Horror"])

        # --- 🛡️ THE IMMUNITY RULE 🛡️ ---
        # A genre the user explicitly asked for (e.g. "Crime Drama...")
        # is IMMUNE: drop it from the ban list.
        return [ban for ban in banned_genres if ban.lower() not in query_lower]

    def recommend(self, text_query, k=10):
        """Return up to *k* recommendations for a free-text query.

        Applies the genre guardrails from :meth:`get_banned_genres` and
        de-duplicates by title. Returns a list of dicts with
        ``id`` (int), ``title`` and ``score`` (float L2 distance).
        """
        print(f"🔎 Searching for: '{text_query}'")

        # Nothing ingested/loaded yet → nothing to recommend.
        if self.df.empty or self.index is None:
            return []

        # 1. Embed and normalize the query (must match ingest-time prep).
        query_vector = self.encoder.encode([text_query])
        faiss.normalize_L2(query_vector)

        # 2. OVER-FETCH: request extra candidates so we still have k left
        #    after guardrail filtering and de-duplication.
        #    BUGFIX: a fixed fetch of 25 silently capped results at 25
        #    even when callers asked for a larger k.
        n_candidates = max(k * 2 + 5, 25)
        distances, indices = self.index.search(query_vector, n_candidates)

        # 3. IDENTIFY BANS (lowercased once, outside the candidate loop).
        banned_genres = self.get_banned_genres(text_query)
        if banned_genres:
            print(f"🛡️  Guardrails Active! Banning: {banned_genres}")
        banned_lower = [ban.lower() for ban in banned_genres]

        results = []
        seen_titles = set()

        for i, idx in enumerate(indices[0]):
            # FAISS pads with -1 when fewer candidates exist; the bounds
            # check guards against index/metadata drift.
            if idx == -1 or idx >= len(self.df):
                continue

            movie_data = self.df.iloc[idx].to_dict()

            # 4. FILTER LOGIC — ban by substring match against the soup.
            movie_soup = movie_data.get('soup', '').lower()

            is_banned = False
            for ban, ban_lower in zip(banned_genres, banned_lower):
                if ban_lower in movie_soup:
                    print(f"🚫 Blocking '{movie_data['title']}' (Contains {ban})")
                    is_banned = True
                    break

            if is_banned:
                continue

            # Deduplication by exact title.
            if movie_data['title'] in seen_titles:
                continue

            results.append({
                'id': int(movie_data['id']),
                'title': movie_data['title'],
                'score': float(distances[0][i]),
            })
            seen_titles.add(movie_data['title'])

            if len(results) >= k:
                break

        return results

    def recommend_on_text(self, text_query, k=10):
        """Wrapper for the main recommend function."""
        return self.recommend(text_query, k)

    def recommend_for_user(self, liked_movie_titles, k=10):
        """Personalized recommendations from a list of liked movie titles.

        Averages the embeddings of the matched titles into one "taste"
        vector and searches the index with it. Returns the same dict
        shape as :meth:`recommend` (without genre guardrails).
        """
        # BUGFIX: also guard self.index — previously a missing index
        # crashed with AttributeError instead of returning [].
        if self.df.empty or self.index is None:
            return []

        vectors = []
        for title in liked_movie_titles:
            # Substring title match in the metadata.
            # BUGFIX: regex=False — titles like "(500) Days of Summer"
            # contain regex metacharacters and broke the default regex mode.
            movie_row = self.df[self.df['title'].str.contains(title, case=False, na=False, regex=False)]
            if not movie_row.empty:
                soup = movie_row.iloc[0]['soup']
                vectors.append(self.encoder.encode(soup))

        if not vectors:
            return []

        # Average the vectors into a single taste vector; FAISS requires
        # a contiguous float32 (1, d) array.
        user_vector = np.mean(vectors, axis=0).reshape(1, -1)
        user_vector = np.ascontiguousarray(user_vector, dtype=np.float32)
        faiss.normalize_L2(user_vector)

        distances, indices = self.index.search(user_vector, k)

        results = []
        for i, idx in enumerate(indices[0]):
            if idx != -1 and idx < len(self.df):
                movie_data = self.df.iloc[idx]
                results.append({
                    'id': int(movie_data['id']),
                    'title': movie_data['title'],
                    'score': float(distances[0][i])
                })
        return results