import os
import requests
import time
import pandas as pd
import faiss
from dotenv import load_dotenv
from src.recommender import MovieRecommender
load_dotenv()  # pull environment variables from a local .env file, if present

# TMDB credentials/endpoint. NOTE: API_KEY is None when TMDB_API_KEY is unset;
# requests below would then fail with a 401 from the API.
API_KEY = os.getenv("TMDB_API_KEY")
BASE_URL = "https://api.themoviedb.org/3"
def get_genre_map():
    """Fetch the TMDB genre list once to avoid repeated per-movie calls.

    Returns:
        dict[int, str]: Mapping of TMDB genre id -> genre name. Returns an
        empty dict on any network/HTTP/parse failure (best-effort: genre
        names are cosmetic, so a failure here must not abort the ingest run).
    """
    url = f"{BASE_URL}/genre/movie/list"
    params = {'api_key': API_KEY, 'language': 'en-US'}
    try:
        # FIX: add a timeout so a hung socket cannot stall the job forever,
        # and raise_for_status so non-200 responses don't parse as empty JSON.
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        return {g['id']: g['name'] for g in response.json().get('genres', [])}
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # FIX: narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.
        return {}
def get_movie_details(movie_id):
    """
    Fetches the 'Secret Sauce' for one movie: Director, Cast, and Keywords.

    Args:
        movie_id: TMDB movie id.

    Returns:
        tuple[str, list[str], list[str]]: (director, top_4_cast, top_6_keywords).
        Always ("", [], []) on any failure, so callers can safely do
        ' '.join(cast) / ' '.join(keywords) without type checks.
    """
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {
        'api_key': API_KEY,
        'append_to_response': 'credits,keywords'
    }
    try:
        # FIX: added timeout so one slow movie lookup cannot hang the run.
        r = requests.get(url, params=params, timeout=10)
        if r.status_code != 200:
            # BUG FIX: this path previously returned ("", "", []) — the middle
            # value was a str, inconsistent with the success/except paths
            # which return a list for cast.
            return "", [], []
        data = r.json()
        credits = data.get('credits', {})
        # 1. Director: first crew member whose job is exactly 'Director'.
        crew = credits.get('crew', [])
        director = next((person['name'] for person in crew if person['job'] == 'Director'), "")
        # 2. Cast (Top 4 billed).
        top_cast = [person['name'] for person in credits.get('cast', [])[:4]]
        # 3. Keywords (Top 6).
        keywords = [k['name'] for k in data.get('keywords', {}).get('keywords', [])[:6]]
        return director, top_cast, keywords
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # FIX: narrowed from a bare `except:`; still best-effort by design.
        return "", [], []
def _build_soup(movie, director, cast, keywords, genres):
    """Build the weighted text 'soup' embedded for one movie.

    The title appears twice to up-weight it relative to the overview text.
    """
    return (
        f"{movie['title']} {movie['title']} "
        f"Director: {director} "
        f"Cast: {' '.join(cast)} "
        f"Keywords: {' '.join(keywords)} "
        f"Genres: {' '.join(genres)} "
        f"{movie.get('overview', '')}"
    )


def ingest_high_quality_movies(target_count=500, reset=True):
    """Discover well-reviewed TMDB movies and index them into the recommender.

    Pages through /discover/movie (rating >= 7.0, >= 500 votes, most popular
    first), enriches each new movie with director/cast/keywords, and adds it
    to a MovieRecommender FAISS index.

    Args:
        target_count: Number of NEW movies to add before stopping.
        reset: If True (or no saved metadata exists), build a fresh index;
            otherwise append to the database loaded from 'models'.

    Side effects:
        Saves the updated index/metadata under 'models/' and prints progress.
    """
    print(f"--- 🌟 Starting Super-Ingest (Target: {target_count}, Reset: {reset}) ---")
    rec = MovieRecommender()
    existing_ids = set()
    # --- SAFETY FIX: Robust Loading Logic ---
    if reset or not os.path.exists('models/metadata.pkl'):
        print("⚠️ Mode: RESET. Creating fresh database...")
        # 384 = embedding dimension expected by the recommender's index.
        rec.index = faiss.IndexFlatL2(384)
    else:
        print("📥 Mode: APPEND. Loading existing database...")
        try:
            rec.load('models')
            # Check if 'df' exists and is not empty before reading
            if not rec.df.empty:
                existing_ids = set(rec.df['id'].tolist())
                print(f" Found {len(existing_ids)} existing movies.")
            else:
                print(" ⚠️ Database loaded but appears empty.")
        except Exception as e:
            # A corrupt/partial save should not kill the daily job — fall
            # back to a fresh database instead.
            print(f" ❌ Error loading existing DB: {e}. Starting fresh.")
            rec.index = faiss.IndexFlatL2(384)
            rec.df = pd.DataFrame()  # Reset dataframe

    genre_map = get_genre_map()
    movies_added = 0
    page = 1
    while movies_added < target_count:
        # Discover movies, most popular first, filtered to high quality.
        url = f"{BASE_URL}/discover/movie"
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'vote_average.gte': 7.0,
            'vote_count.gte': 500,
            'page': page
        }
        try:
            # FIX: added timeout so one hung request cannot stall the run.
            response = requests.get(url, params=params, timeout=15)
            # --- DEBUG FIX: Catch API Errors ---
            if response.status_code != 200:
                print(f"❌ CRITICAL API ERROR: {response.status_code}")
                print(f"Server Message: {response.text}")
                break
            results = response.json().get('results', [])
            if not results:
                print("⚠️ API returned 200 OK, but 'results' list is empty.")
                break
            batch_added = 0
            for m in results:
                if movies_added >= target_count:
                    break
                if m['id'] in existing_ids:
                    continue
                # Per-movie enrichment: director, top cast, keywords.
                director, cast, keywords = get_movie_details(m['id'])
                genres = [genre_map.get(gid, '') for gid in m.get('genre_ids', [])]
                rec.add_new_movie({
                    'id': m['id'],
                    'title': m['title'],
                    'soup': _build_soup(m, director, cast, keywords, genres),
                    'rating': m.get('vote_average')
                })
                existing_ids.add(m['id'])
                movies_added += 1
                batch_added += 1
                time.sleep(0.05)  # Be nice to API
            print(f"Page {page}: Added {batch_added} movies. (Total New: {movies_added})")
            # Safety valve: give up if 50+ pages produced nothing new at all.
            if page > 50 and movies_added == 0:
                break
            if movies_added >= target_count:
                break
            page += 1
        except Exception as e:
            # Boundary catch: log and stop paging, but still fall through
            # to save whatever was ingested so far.
            print(f"Error: {e}")
            break
    print(f"--- Saving Super-Brain (Total Size: {len(existing_ids)}) ---")
    rec.save('models/')
if __name__ == "__main__":
    # GitHub Actions exports GITHUB_ACTIONS="true"; use that to pick the mode.
    if os.getenv("GITHUB_ACTIONS") == "true":
        print("🤖 AUTOMATION DETECTED: Running Daily 800-Movie Reset.")
        # CI mode: full daily refresh — wipe the old index and rebuild from
        # the current top 800.
        ingest_high_quality_movies(target_count=800, reset=True)
    else:
        print("👨💻 LOCAL DEV DETECTED: Running Safe Test.")
        # Dev mode: small append-only smoke run that leaves the existing
        # database intact.
        ingest_high_quality_movies(target_count=50, reset=False)