| import os |
| import requests |
| import time |
| import pandas as pd |
| import faiss |
| from dotenv import load_dotenv |
| from src.recommender import MovieRecommender |
|
|
# Pull variables from a local .env file (if any) into the process environment
# before anything below reads them.
load_dotenv()

# TMDB credentials and endpoint root shared by every request in this module.
# API_KEY is None when TMDB_API_KEY is not set in the environment.
API_KEY = os.environ.get("TMDB_API_KEY")
BASE_URL = "https://api.themoviedb.org/3"
|
|
def get_genre_map() -> dict:
    """Fetch the TMDB movie genre list once and return ``{genre_id: genre_name}``.

    This is a best-effort lookup: on a network failure, a non-2xx response,
    or a payload that does not have the expected shape, a warning is printed
    and an empty dict is returned so the caller can continue without genre
    names.

    Returns:
        A dict mapping each genre id to its English name, or ``{}`` on failure.
    """
    url = f"{BASE_URL}/genre/movie/list"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        # A timeout keeps a stalled connection from hanging the whole ingest.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        genres = response.json().get('genres', [])
        return {g['id']: g['name'] for g in genres}
    except (
        requests.RequestException,  # connection errors, timeouts, HTTP errors
        ValueError,                 # body is not valid JSON
        KeyError,                   # a genre entry lacks 'id' or 'name'
        TypeError,                  # payload entries are not mappings
        AttributeError,             # top-level JSON is not an object
    ) as exc:
        # Log only the exception type: requests embeds the full request URL,
        # including the api_key query parameter, in its error messages.
        print(f"Warning: could not fetch genre map ({type(exc).__name__}); continuing without genres.")
        return {}
|
|
def get_movie_details(movie_id) -> tuple:
    """Fetch the 'Secret Sauce' for one movie: director, top cast, and keywords.

    A single request is made to ``/movie/{movie_id}`` with credits and
    keywords appended to the response.

    Args:
        movie_id: TMDB id of the movie to look up.

    Returns:
        A ``(director, top_cast, keywords)`` tuple:

        * ``director`` - name of the first crew member whose job is
          ``'Director'``, or ``""`` if there is none.
        * ``top_cast`` - list with the names of the first four cast entries.
        * ``keywords`` - list with the names of the first six keywords.

        On any failure (network error, non-200 status, malformed payload)
        the result is ``("", [], [])``, so every return path has the same
        shape and element types.
    """
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {
        'api_key': API_KEY,
        'append_to_response': 'credits,keywords',
    }

    try:
        # A timeout keeps one stalled detail request from blocking the ingest.
        r = requests.get(url, params=params, timeout=10)
        if r.status_code != 200:
            # Same shape as the exception path below (cast is a list, not a str).
            return "", [], []

        data = r.json()
        credits = data.get('credits', {})

        # .get('job') so a crew entry without a job is skipped instead of
        # discarding the details of the whole movie.
        director = next(
            (person['name'] for person in credits.get('crew', [])
             if person.get('job') == 'Director'),
            "",
        )
        top_cast = [person['name'] for person in credits.get('cast', [])[:4]]
        keywords = [
            k['name']
            for k in data.get('keywords', {}).get('keywords', [])[:6]
        ]

        return director, top_cast, keywords
    except (
        requests.RequestException,  # connection errors and timeouts
        ValueError,                 # body is not valid JSON
        KeyError,                   # an entry lacks 'name'
        TypeError,                  # payload pieces have unexpected types
        AttributeError,             # a nested value is not a mapping
    ):
        # Best-effort: a movie without details is still ingested by the caller.
        return "", [], []
|
|
def _redact_api_key(text):
    """Return *text* with the TMDB API key masked so it never reaches the logs."""
    return text.replace(API_KEY, "***") if API_KEY else text


def _build_soup(movie, director, cast, keywords, genres):
    """Join one movie's descriptive fields into the text blob stored as its 'soup'."""
    # The title is written twice, exactly as before (presumably to give it
    # more weight in the text representation - confirm against the recommender).
    return (
        f"{movie['title']} {movie['title']} "
        f"Director: {director} "
        f"Cast: {' '.join(cast)} "
        f"Keywords: {' '.join(keywords)} "
        f"Genres: {' '.join(genres)} "
        f"{movie.get('overview', '')}"
    )


def ingest_high_quality_movies(target_count=500, reset=True):
    """Ingest popular, well-rated TMDB movies into the recommender database.

    Pages through ``/discover/movie`` (sorted by popularity, vote average
    >= 7.0, vote count >= 500), enriches every new movie with director,
    cast, keywords and genre names, and hands it to
    ``MovieRecommender.add_new_movie``. The result is saved under ``models/``.

    Args:
        target_count: Number of new movies to add before stopping.
        reset: When true (or when ``models/metadata.pkl`` does not exist) a
            fresh index is created; otherwise the existing database is
            loaded and movies whose id is already present are skipped.

    Returns:
        None. Progress is reported with ``print``.

    Notes:
        * If movies were requested but none could be added (for example the
          API rejected the first request), nothing is saved, so a failed run
          cannot overwrite an existing database with an empty one.
        * Any exception inside the paging loop stops the ingest; whatever was
          added up to that point is still saved.
    """
    print(f"--- 🚀 Starting Super-Ingest (Target: {target_count}, Reset: {reset}) ---")

    rec = MovieRecommender()
    existing_ids = set()

    # NOTE(review): 384 must match the embedding size MovieRecommender
    # produces; the encoder is not visible from this file - confirm.
    if reset or not os.path.exists('models/metadata.pkl'):
        print("⚠️ Mode: RESET. Creating fresh database...")
        rec.index = faiss.IndexFlatL2(384)
    else:
        print("📥 Mode: APPEND. Loading existing database...")
        try:
            # 'models' (load) and 'models/' (save) are kept exactly as they
            # were: how MovieRecommender joins paths is not visible here.
            rec.load('models')

            if not rec.df.empty:
                existing_ids = set(rec.df['id'].tolist())
                print(f" Found {len(existing_ids)} existing movies.")
            else:
                print(" ⚠️ Database loaded but appears empty.")
        except Exception as e:
            print(f" ❌ Error loading existing DB: {e}. Starting fresh.")
            rec.index = faiss.IndexFlatL2(384)
            rec.df = pd.DataFrame()

    genre_map = get_genre_map()
    movies_added = 0
    page = 1

    while movies_added < target_count:
        url = f"{BASE_URL}/discover/movie"
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'vote_average.gte': 7.0,
            'vote_count.gte': 500,
            'page': page,
        }

        try:
            # A timeout keeps a stalled connection from hanging the ingest.
            response = requests.get(url, params=params, timeout=10)

            if response.status_code != 200:
                print(f"❌ CRITICAL API ERROR: {response.status_code}")
                print(f"Server Message: {response.text}")
                break

            payload = response.json()
            results = payload.get('results', [])

            if not results:
                print("⚠️ API returned 200 OK, but 'results' list is empty.")
                break

            batch_added = 0
            for m in results:
                if movies_added >= target_count:
                    break
                if m['id'] in existing_ids:
                    continue

                director, cast, keywords = get_movie_details(m['id'])
                genres = [genre_map.get(gid, '') for gid in m.get('genre_ids', [])]

                rec.add_new_movie({
                    'id': m['id'],
                    'title': m['title'],
                    'soup': _build_soup(m, director, cast, keywords, genres),
                    'rating': m.get('vote_average'),
                })
                existing_ids.add(m['id'])
                movies_added += 1
                batch_added += 1

                # Short pause between movies to stay gentle on the API.
                time.sleep(0.05)

            print(f"Page {page}: Added {batch_added} movies. (Total New: {movies_added})")

            # Give up if 50 pages in a row contributed nothing new.
            if page > 50 and movies_added == 0:
                break
            if movies_added >= target_count:
                break

            # Stop cleanly on the last page instead of requesting a page that
            # does not exist and reporting it as a critical API error.
            total_pages = payload.get('total_pages')
            if isinstance(total_pages, int) and page >= total_pages:
                print(f"Reached the last results page ({total_pages}); stopping.")
                break

            page += 1

        except Exception as e:
            # requests puts the full URL (with api_key) into its error
            # messages, so mask the key before logging.
            print(f"Error: {_redact_api_key(str(e))}")
            break

    if target_count > 0 and movies_added == 0:
        # Nothing was ingested although movies were requested: saving now
        # could replace a good database on disk with an empty one.
        print("No new movies were added; skipping save to leave the existing database untouched.")
        return

    print(f"--- Saving Super-Brain (Total Size: {len(existing_ids)}) ---")
    rec.save('models/')
|
|
if __name__ == "__main__":
    # GitHub Actions sets GITHUB_ACTIONS to "true" on its runners; use that to
    # tell the scheduled automation apart from a developer running the script.
    is_github_action = os.getenv("GITHUB_ACTIONS") == "true"

    if is_github_action:
        print("🤖 AUTOMATION DETECTED: Running Daily 800-Movie Reset.")
        # Automation rebuilds the database from scratch with the full target.
        ingest_high_quality_movies(target_count=800, reset=True)
    else:
        print("👨‍💻 LOCAL DEV DETECTED: Running Safe Test.")
        # Local runs append a small batch and keep any existing database.
        ingest_high_quality_movies(target_count=50, reset=False)