Files changed (4) hide show
  1. build_engine.py +98 -0
  2. etl_pinecone.py +102 -0
  3. main.py +195 -0
  4. requirements.txt +7 -0
build_engine.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import pickle
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import ast
7
+
8
# Directory that holds the raw CSV datasets (a relative path).
DATA_PATH = "../data/"
# Upper bound on the number of catalogue rows kept before vectorising.
MAX_ITEMS = 12_000
10
+
11
def process_movies():
    """Load the TMDB movie CSV and reduce it to the shared catalogue schema.

    Rows whose release date cannot be parsed are dropped. Of the rest, every
    movie released in 2000 or later is kept, plus older movies that have
    more than 1500 votes.

    Returns:
        pandas.DataFrame with the columns ``id``, ``title``, ``tags``,
        ``rating``, ``vote_count``, ``type`` and ``genre_list``.
    """
    print("🎬 Processing TMDB Movies...")
    movies = pd.read_csv(DATA_PATH + 'tmdb_5000_movies.csv')

    movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
    movies = movies.dropna(subset=['release_date'])

    release_year = movies['release_date'].dt.year
    is_recent = release_year >= 2000
    is_popular_classic = (release_year < 2000) & (movies['vote_count'] > 1500)
    movies = movies[is_recent | is_popular_classic].copy()

    def parse_genres(raw):
        """Turn a stringified list of ``{'name': ...}`` dicts into 'Name Name ...'."""
        try:
            return " ".join(entry['name'] for entry in ast.literal_eval(raw))
        except (ValueError, SyntaxError, TypeError, KeyError):
            # Missing or malformed genre cell: fall back to "no genres".
            # Only parsing/shape errors are caught so that KeyboardInterrupt
            # and unrelated failures still surface.
            return ""

    movies['genres_str'] = movies['genres'].apply(parse_genres)
    movies['tags'] = movies['overview'].fillna('') + " " + movies['genres_str']
    movies['type'] = 'Movie'
    movies = movies[['id', 'title', 'tags', 'vote_average', 'vote_count', 'type', 'genres_str']]

    # rename() returns a new frame, which avoids mutating a column slice in
    # place (the source of pandas' SettingWithCopyWarning).
    return movies.rename(columns={'vote_average': 'rating', 'genres_str': 'genre_list'})
36
+
37
def process_anime():
    """Load the anime CSV and reduce it to the shared catalogue schema.

    Only titles with more than 40000 members are kept. ``members`` is exposed
    as ``vote_count`` and every row gets ``type`` and ``genre_list`` set to
    ``"Anime"``.

    Returns:
        pandas.DataFrame with the columns ``id``, ``title``, ``tags``,
        ``rating``, ``vote_count``, ``type`` and ``genre_list``.
    """
    print("🍙 Processing Anime...")
    raw = pd.read_csv(DATA_PATH + 'anime.csv')

    popular = raw[raw['members'] > 40000].copy()
    for column, fallback in (('name', ''), ('genre', ''), ('type', 'Anime')):
        popular[column] = popular[column].fillna(fallback)

    # The tags are built while 'type' still holds the value from the CSV;
    # the column is overwritten with the constant 'Anime' further down.
    popular['tags'] = popular['genre'] + " " + popular['type'] + " " + popular['name']
    popular['genre_list'] = "Anime"

    popular = popular.rename(
        columns={'anime_id': 'id', 'name': 'title', 'members': 'vote_count'}
    )
    popular['type'] = 'Anime'

    wanted_columns = ['id', 'title', 'tags', 'rating', 'vote_count', 'type', 'genre_list']
    return popular[wanted_columns]
53
+
54
def _collect_genres(genre_column):
    """Return the set of genre tokens longer than two characters in *genre_column*."""
    genres = set()
    for entry in genre_column.dropna():
        # Treat spaces like commas so every word becomes its own token.
        for token in entry.replace(" ", ",").split(","):
            if len(token) > 2:
                genres.add(token.strip())
    return genres


def _build_quiz_data(catalogue, genres, per_genre=20):
    """Map each genre to its *per_genre* best-rated items as id/title/type records."""
    quiz = {}
    for genre in genres:
        if genre == "Anime":
            mask = catalogue['type'] == 'Anime'
        else:
            # regex=False: the genre token is matched literally instead of
            # being interpreted as a regular expression.
            mask = (
                catalogue['genre_list'].str.contains(genre, case=False, na=False, regex=False)
                & (catalogue['type'] == 'Movie')
            )

        top_items = catalogue[mask].sort_values(by='rating', ascending=False).head(per_genre)
        if not top_items.empty:
            quiz[genre] = top_items[['id', 'title', 'type']].to_dict('records')
    return quiz


def _dump_pickle(obj, filename):
    """Pickle *obj* to *filename*, closing the file handle deterministically."""
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle)


def build_engine():
    """Build the recommendation artifacts and write them to the working directory.

    Combines the movie and anime catalogues, shuffles them with a fixed seed,
    trims the result to ``MAX_ITEMS`` rows, vectorises the ``tags`` column
    with a bag-of-words model and computes the pairwise cosine similarity.

    Writes three pickle files: ``movie_list.pkl`` (the combined frame),
    ``similarity.pkl`` (the similarity matrix) and ``quiz_data.pkl``
    (genre -> top rated items).
    """
    df_movies = process_movies()
    df_anime = process_anime()

    combined = pd.concat([df_movies, df_anime], ignore_index=True)
    combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)
    if len(combined) > MAX_ITEMS:
        print(f"⚠️ Trimming dataset from {len(combined)} to {MAX_ITEMS}...")
        combined = combined.head(MAX_ITEMS)

    print(f"📊 Total Database: {len(combined)} items.")
    print("🧠 Training NLP Model...")
    cv = CountVectorizer(max_features=5000, stop_words='english')
    # Keep the count matrix sparse: cosine_similarity accepts sparse input and
    # still returns a dense matrix, so the large dense intermediate that
    # .toarray() would allocate is never needed.
    vectors = cv.fit_transform(combined['tags'])
    print("📐 Calculating Cosine Similarity...")
    similarity = cosine_similarity(vectors)
    print("📝 Generating Quiz Data...")

    all_genres = _collect_genres(combined['genre_list'])
    quiz_data = _build_quiz_data(combined, all_genres)

    print("💾 Saving Artifacts...")
    _dump_pickle(combined, 'movie_list.pkl')
    _dump_pickle(similarity, 'similarity.pkl')
    _dump_pickle(quiz_data, 'quiz_data.pkl')

    print("🎉 DONE! Backend ready.")


if __name__ == "__main__":
    build_engine()
etl_pinecone.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sentence_transformers import SentenceTransformer
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from tqdm import tqdm
6
+ import time
7
+
8
import os

# The Pinecone API key is a secret and must never be committed to source
# control; it is read from the PINECONE_API_KEY environment variable instead.
# NOTE(review): any key that was previously committed here should be treated
# as leaked and rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
# Name of the Pinecone index that receives the vectors.
INDEX_NAME = "cine-match"
# Directory that holds the raw CSV datasets (a relative path).
DATA_PATH = '../data/'
# Upper bound on the number of rows kept for upload.
MAX_ITEMS = 40000
12
+
13
def prepare_data():
    """Build the combined movie + anime frame that is embedded and uploaded.

    Movies are kept when they have more than 50 votes and were released in
    1980 or later; anime are kept when they have more than 10000 members
    (exposed as ``vote_count``). The combined frame is sorted by
    ``vote_count`` and truncated to the ``MAX_ITEMS`` highest rows.

    Returns:
        pandas.DataFrame with the columns ``id``, ``title``, ``text_chunk``,
        ``type``, ``vote_count`` and ``vote_average``.
    """
    print("📂 Loading Datasets...")

    movies = pd.read_csv(DATA_PATH + 'movies_metadata.csv', low_memory=False)

    # .copy() so the column assignments below work on an independent frame
    # rather than on a filtered view (avoids SettingWithCopyWarning).
    has_required = movies['release_date'].notna() & movies['vote_count'].notna()
    movies = movies[has_required].copy()

    # Values that cannot be parsed become NaN/NaT and are removed by the
    # comparisons in the filter below.
    movies['vote_count'] = pd.to_numeric(movies['vote_count'], errors='coerce')
    movies['vote_average'] = pd.to_numeric(movies['vote_average'], errors='coerce')
    movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

    movies = movies[
        (movies['vote_count'] > 50) &
        (movies['release_date'].dt.year >= 1980)
    ].copy()

    movies['overview'] = movies['overview'].fillna('')
    movies['title'] = movies['title'].fillna('')
    movies['text_chunk'] = "Movie: " + movies['title'] + ". Plot: " + movies['overview']
    movies['type'] = 'Movie'
    movies = movies[['id', 'title', 'text_chunk', 'type', 'vote_count', 'vote_average']]

    print(f"✅ Movies Processed: {len(movies)}")

    anime = pd.read_csv(DATA_PATH + 'anime.csv')
    anime = anime[anime['members'] > 10000].copy()
    anime['type'] = 'Anime'

    anime['name'] = anime['name'].fillna('')
    anime['genre'] = anime['genre'].fillna('')
    # NOTE(review): 'type' has already been overwritten with 'Anime', so every
    # chunk ends in ". Type: Anime" - confirm whether the CSV's original type
    # value was meant to be embedded here instead.
    anime['text_chunk'] = "Anime: " + anime['name'] + ". Genres: " + anime['genre'] + ". Type: " + anime['type']

    anime = anime.rename(
        columns={'anime_id': 'id', 'name': 'title', 'rating': 'vote_average', 'members': 'vote_count'}
    )
    anime = anime[['id', 'title', 'text_chunk', 'type', 'vote_count', 'vote_average']]

    print(f"✅ Anime Processed: {len(anime)}")

    combined = pd.concat([movies, anime], ignore_index=True)
    combined = combined.sort_values(by='vote_count', ascending=False).head(MAX_ITEMS)

    print(f"🔥 Final Database Size: {len(combined)} items.")
    return combined
61
+
62
def upload_to_pinecone(df):
    """Embed every ``text_chunk`` in *df* and upsert the vectors into Pinecone.

    Rows are processed in batches of 100. A batch whose upsert fails is
    reported and counted, and the upload continues with the next batch.

    Args:
        df: frame providing the columns ``id``, ``title``, ``text_chunk``,
            ``type`` and ``vote_average``.

    Raises:
        RuntimeError: if no Pinecone API key is configured.
    """
    # Fail before the (slow) model load when the key is missing.
    if not PINECONE_API_KEY:
        raise RuntimeError(
            "PINECONE_API_KEY not set. Export it in the environment before uploading."
        )

    print("🧠 Loading AI Model (all-MiniLM-L6-v2)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    print("☁️ Connecting to Pinecone...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(INDEX_NAME)

    batch_size = 100
    failed_batches = 0

    print("🚀 Starting Upload... (This will take a while!)")

    for start in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[start : start + batch_size]

        vectors = model.encode(batch['text_chunk'].tolist()).tolist()

        upsert_data = [
            {
                # Prefix with the type so ids from the two datasets cannot collide.
                "id": f"{row.type}_{row.id}",
                "values": vector,
                "metadata": {
                    "title": str(row.title),
                    "type": str(row.type),
                    "original_id": str(row.id),
                    # Missing ratings are stored as 0.0.
                    "rating": float(row.vote_average) if pd.notna(row.vote_average) else 0.0
                }
            }
            for row, vector in zip(batch.itertuples(), vectors)
        ]

        try:
            index.upsert(vectors=upsert_data)
        except Exception as e:
            # Best effort: keep uploading the remaining batches, but remember
            # the failure so the final message is truthful.
            failed_batches += 1
            print(f"Error uploading batch: {e}")

    if failed_batches:
        print(f"⚠️ Upload finished with {failed_batches} failed batch(es).")
    else:
        print("🎉 SUCCESS! All data is now in the Cloud Brain.")


if __name__ == "__main__":
    df = prepare_data()
    upload_to_pinecone(df)
main.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pinecone import Pinecone
5
+ from sentence_transformers import SentenceTransformer
6
+ import random
7
+ import os
8
+
9
+ # ============================
10
+ # 🔑 CONFIGURATION
11
+ # ============================
12
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = "cine-match"


def _read_env_value(env_path, key):
    """Return the value assigned to *key* in a ``KEY=value`` file, or None.

    The first line whose name matches *key* exactly wins. Surrounding
    whitespace and one layer of single or double quotes are stripped from
    the value. A missing file yields None.
    """
    if not os.path.exists(env_path):
        return None
    with open(env_path, "r", encoding="utf-8") as f:
        for line in f:
            name, separator, value = line.partition("=")
            # Exact name comparison, so a variable that merely starts with
            # the same text (e.g. PINECONE_API_KEY_OLD) is not picked up.
            if separator and name.strip() == key:
                return value.strip().strip('"').strip("'")
    return None


# Fall back to a .env file that sits next to this module.
if not PINECONE_API_KEY:
    PINECONE_API_KEY = _read_env_value(
        os.path.join(os.path.dirname(__file__), ".env"), "PINECONE_API_KEY"
    )

if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY not set. Add it to environment or ml-engine/.env"
    )
30
+
31
# FastAPI application. CORS is left fully open: any origin, method and header.
# NOTE(review): confirm that wide-open CORS is intended outside development.
app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)

# The embedding model and the Pinecone index handle are created once at
# import time and shared by every request handler below.
print("⏳ Loading AI Model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)
print("✅ Brain Online!")
45
+
46
+ # ============================
47
+ # 🛠 MODELS
48
+ # ============================
49
+
50
class SearchRequest(BaseModel):
    # Request body for POST /search.
    # Free text that is embedded and used as the query vector.
    query: str
    # "All" disables the metadata filter; any other value is matched against
    # the stored "type" metadata field.
    filter_type: str = "All"
53
+
54
class QuizRequest(BaseModel):
    # Request body for POST /get-quiz-items.
    # Genre name; the exact value "Anime" selects anime, anything else movies.
    genre: str
56
+
57
class FinalRecommendationRequest(BaseModel):
    # Request body for POST /hybrid-recommend.
    # Mood text; becomes part of the semantic query and of the "reason" string.
    mood: str
    # Titles the user picked; these are excluded from the recommendations.
    selected_titles: list[str]
    # Genre text; becomes part of the semantic query.
    genre: str
61
+
62
+ # ============================
63
+ # 🔍 MODE 1: SIMPLE SEARCH
64
+ # ============================
65
+
66
@app.post("/search")
def semantic_search(req: SearchRequest):
    """Embed the query text and return its 20 nearest index entries.

    When ``req.filter_type`` is anything other than ``"All"`` the lookup is
    restricted to entries whose ``type`` metadata equals that value.

    Returns:
        ``{"results": [...]}`` where each item has ``id``, ``title``,
        ``type``, ``score`` and ``rating`` (0 when the metadata has none).

    Raises:
        HTTPException: status 500 carrying the error text of any failure.
    """
    try:
        embedding = model.encode(req.query).tolist()
        type_filter = None if req.filter_type == "All" else {"type": req.filter_type}

        response = index.query(
            vector=embedding,
            top_k=20,
            include_metadata=True,
            filter=type_filter,
        )

        hits = []
        for hit in response['matches']:
            info = hit['metadata']
            hits.append(
                {
                    "id": info['original_id'],
                    "title": info['title'],
                    "type": info['type'],
                    "score": hit['score'],
                    "rating": info.get('rating', 0),
                }
            )
        return {"results": hits}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
94
+
95
@app.post("/mood")
def mood_search(mood: str):
    """Expand a mood label into a descriptive query and run the semantic search.

    A label that is not one of the known moods is used as the query text
    unchanged.
    """
    # Known mood buttons and the richer description searched for each one.
    descriptions = {
        "Happy": "Feel good movie, comedy, lighthearted, happy ending",
        "Dark": "Dark, psychological thriller, disturbing, gritty, noir",
        "Adrenaline": "High stakes action, fast paced, car chases",
        "Mind-Bending": "Confusing plot, time travel, philosophy, deep thoughts",
        "Romantic": "Love story, romance, heartbreak",
        "Scary": "Horror, ghosts, jump scares",
    }
    expanded = descriptions[mood] if mood in descriptions else mood
    request = SearchRequest(query=expanded)
    return semantic_search(request)
108
+
109
+ # ============================
110
+ # 🧙‍♂️ MODE 2: WIZARD / HYBRID
111
+ # ============================
112
+
113
@app.post("/get-quiz-items")
def get_quiz_items(req: QuizRequest):
    """Return up to 20 index entries to show in the quiz for the given genre.

    The genre ``"Anime"`` restricts the lookup to anime entries; every other
    genre restricts it to movies.

    Returns:
        ``{"items": [...]}`` where each item has ``id``, ``title``, ``type``
        and a ``poster`` that is always ``None``.
    """
    prompt = f"Popular, famous, high rated {req.genre} movies or anime"
    wanted_type = "Anime" if req.genre == "Anime" else "Movie"

    response = index.query(
        vector=model.encode(prompt).tolist(),
        top_k=20,
        include_metadata=True,
        filter={"type": wanted_type},
    )

    return {
        "items": [
            {
                "id": hit['metadata']['original_id'],
                "title": hit['metadata']['title'],
                "type": hit['metadata']['type'],
                "poster": None,
            }
            for hit in response['matches']
        ]
    }
135
+
136
@app.post("/hybrid-recommend")
def hybrid_recommend(req: FinalRecommendationRequest):
    """Recommend entries that match a mood, a genre and the user's picked titles.

    Mood, genre and the selected titles are combined into one query string,
    and the 60 nearest index entries are fetched. Entries whose title is one
    of the selected titles are left out.

    Returns:
        ``{"results": [...]}`` where each item has ``id``, ``title``,
        ``type``, ``score``, ``rating`` (0 when the metadata has none) and a
        human-readable ``reason``.
    """
    joined_titles = ", ".join(req.selected_titles)
    semantic_query = f"{req.mood} {req.genre} similar to {joined_titles}"

    query_vector = model.encode(semantic_query).tolist()

    results = index.query(
        vector=query_vector,
        top_k=60,
        include_metadata=True
    )

    # Set for O(1) membership tests while filtering the matches.
    already_selected = set(req.selected_titles)

    recommendations = []
    for match in results['matches']:
        meta = match['metadata']
        if meta['title'] in already_selected:
            continue

        if req.selected_titles:
            reason = f"Because you liked {random.choice(req.selected_titles)} and wanted something {req.mood}."
        else:
            # random.choice raises IndexError on an empty sequence, so an
            # empty selection gets a reason that names no title.
            reason = f"Because you wanted something {req.mood}."

        recommendations.append({
            "id": meta['original_id'],
            "title": meta['title'],
            "type": meta['type'],
            "score": match['score'],
            "rating": meta.get('rating', 0),
            "reason": reason
        })

    return {"results": recommendations}
166
+
167
@app.get("/lucky")
def lucky_pick():
    """Return one entry picked at random from 50 candidates.

    The candidates are the 50 nearest index entries to a fixed
    "masterpiece" prompt; no type filter is applied.

    Raises:
        HTTPException: status 404 when the index returns no matches.
    """
    prompt_vector = model.encode("Masterpiece, highly rated, famous, classic, 5 stars").tolist()
    candidates = index.query(vector=prompt_vector, top_k=50, include_metadata=True)['matches']

    if not candidates:
        raise HTTPException(status_code=404, detail="No movies found")

    chosen = random.choice(candidates)['metadata']
    return {
        "id": chosen['original_id'],
        "title": chosen['title'],
        "type": chosen['type'],
        "rating": chosen.get('rating', 0),
        "reason": "Serendipity ✨",
    }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ pinecone-client
5
+ sentence-transformers
6
+ torch
7
+ numpy