# cinematch-api / build_engine.py
# Leo
# Upload 4 files (#1)
# b8aefd2 verified
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
# Directory prefix prepended to the CSV filenames read by process_movies()
# and process_anime(); it is joined by plain string concatenation, so it
# must end with a path separator.
DATA_PATH = '../data/'
# Upper bound on the number of rows build_engine() keeps in the combined
# (shuffled) catalogue before vectorizing.
MAX_ITEMS = 12000
def process_movies():
    """Load the TMDB movie CSV and reduce it to the engine's common schema.

    Reads ``tmdb_5000_movies.csv`` from ``DATA_PATH``, drops rows whose
    ``release_date`` cannot be parsed, and keeps a movie when it was
    released in 2000 or later, or when it is older but has more than 1500
    votes.

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``tags``,
        ``rating``, ``vote_count``, ``type`` and ``genre_list``. ``tags`` is
        the overview (empty when missing) followed by a space and the
        space-joined genre names; ``type`` is always ``'Movie'``.
    """
    print("🎬 Processing TMDB Movies...")
    movies = pd.read_csv(DATA_PATH + 'tmdb_5000_movies.csv')

    # Unparseable dates become NaT and are dropped, so every remaining row
    # has a usable release year.
    movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
    movies = movies.dropna(subset=['release_date'])

    # With no missing years left, "recent, or old but popular" reduces to a
    # plain OR of the two conditions.
    release_year = movies['release_date'].dt.year
    is_recent = release_year >= 2000
    is_popular = movies['vote_count'] > 1500
    movies = movies[is_recent | is_popular].copy()

    def parse_genres(x):
        """Return the genre names in cell *x* joined by spaces, or ''."""
        try:
            return " ".join([i['name'] for i in ast.literal_eval(x)])
        except (ValueError, SyntaxError, TypeError, KeyError):
            # Malformed or missing genre cell: best effort, treat as having
            # no genres. Anything else (e.g. KeyboardInterrupt) propagates.
            return ""

    movies['genres_str'] = movies['genres'].apply(parse_genres)
    movies['tags'] = movies['overview'].fillna('') + " " + movies['genres_str']
    movies['type'] = 'Movie'

    columns = ['id', 'title', 'tags', 'vote_average', 'vote_count', 'type', 'genres_str']
    # Non-inplace rename: avoids mutating the frame produced by the column
    # selection in place.
    movies = movies[columns].rename(
        columns={'vote_average': 'rating', 'genres_str': 'genre_list'}
    )
    return movies
def process_anime():
    """Load the anime CSV and reduce it to the engine's common schema.

    Reads ``anime.csv`` from ``DATA_PATH`` and keeps only rows with more
    than 40000 members.

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``tags``,
        ``rating``, ``vote_count``, ``type`` and ``genre_list``, indexed
        like the retained CSV rows. ``tags`` is "<genre> <format> <name>",
        where a missing genre or name becomes '' and a missing format
        becomes 'Anime'. ``type`` and ``genre_list`` are the constant
        'Anime' for every row.
    """
    print("πŸ™ Processing Anime...")
    raw = pd.read_csv(DATA_PATH + 'anime.csv')
    popular = raw.loc[raw['members'] > 40000].copy()

    # Fill the text fields up front so the tag concatenation never yields NaN.
    titles = popular['name'].fillna('')
    genres = popular['genre'].fillna('')
    formats = popular['type'].fillna('Anime')

    # The source format (the CSV's own 'type' column) only survives inside
    # the tags; the output 'type' column is the constant catalogue label.
    catalog = pd.DataFrame(
        {
            'id': popular['anime_id'],
            'title': titles,
            'tags': genres + " " + formats + " " + titles,
            'rating': popular['rating'],
            'vote_count': popular['members'],
            'type': 'Anime',
            'genre_list': "Anime",
        },
        index=popular.index,
    )
    return catalog
def build_engine():
    """Build the recommendation artifacts and pickle them to disk.

    Steps:
      1. Concatenate the movie and anime catalogues, shuffle them with a
         fixed seed and keep at most ``MAX_ITEMS`` rows.
      2. Vectorize the ``tags`` column with a bag-of-words model (5000
         features, English stop words removed) and compute the pairwise
         cosine similarity matrix.
      3. Build ``quiz_data``: for every genre token, the 20 highest-rated
         items as ``{'id', 'title', 'type'}`` records. The ``"Anime"`` token
         selects all anime rows; any other token selects movies whose
         ``genre_list`` contains it (case-insensitive).
      4. Write ``movie_list.pkl``, ``similarity.pkl`` and ``quiz_data.pkl``
         to the current working directory.

    Returns:
        None. The results are the three pickle files.
    """
    df_movies = process_movies()
    df_anime = process_anime()

    combined = pd.concat([df_movies, df_anime], ignore_index=True)
    # Fixed seed: the shuffle (and therefore which rows survive trimming) is
    # reproducible between runs.
    combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

    if len(combined) > MAX_ITEMS:
        print(f"⚠️ Trimming dataset from {len(combined)} to {MAX_ITEMS}...")
        combined = combined.head(MAX_ITEMS)

    print(f"πŸ“Š Total Database: {len(combined)} items.")

    print("🧠 Training NLP Model...")
    cv = CountVectorizer(max_features=5000, stop_words='english')
    # Keep the count matrix sparse: cosine_similarity accepts sparse input
    # and still returns a dense ndarray, so the large dense intermediate
    # that .toarray() would allocate is unnecessary.
    vectors = cv.fit_transform(combined['tags'])

    print("πŸ“ Calculating Cosine Similarity...")
    similarity = cosine_similarity(vectors)

    print("πŸ“ Generating Quiz Data...")
    # Genre tokens are obtained by splitting on both spaces and commas, so a
    # multi-word genre contributes one token per word. Tokens of two
    # characters or fewer are ignored.
    all_genres = set()
    for genre_text in combined['genre_list'].dropna():
        for token in genre_text.replace(" ", ",").split(","):
            if len(token) > 2:
                all_genres.add(token.strip())

    is_movie = combined['type'] == 'Movie'
    is_anime = combined['type'] == 'Anime'

    quiz_data = {}
    # Sorted iteration keeps the key order of quiz_data stable across runs
    # (set order of strings varies with hash randomization).
    for genre in sorted(all_genres):
        if genre == "Anime":
            mask = is_anime
        else:
            # regex=False: the token is matched literally, never as a pattern.
            has_genre = combined['genre_list'].str.contains(
                genre, case=False, na=False, regex=False
            )
            mask = has_genre & is_movie

        top_items = combined[mask].sort_values(by='rating', ascending=False).head(20)
        if not top_items.empty:
            quiz_data[genre] = top_items[['id', 'title', 'type']].to_dict('records')

    print("πŸ’Ύ Saving Artifacts...")
    artifacts = {
        'movie_list.pkl': combined,
        'similarity.pkl': similarity,
        'quiz_data.pkl': quiz_data,
    }
    for filename, payload in artifacts.items():
        # Context manager guarantees each file is flushed and closed, even
        # if pickling fails part-way through.
        with open(filename, 'wb') as fh:
            pickle.dump(payload, fh)

    print("πŸŽ‰ DONE! Backend ready.")
# Run the full build only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    build_engine()