Spaces:

Jaykay73
/

match-api

Sleeping

App Files Files Community

JermaineAI committed on Dec 12, 2025

Commit

c6769c2

1 Parent(s): 22e5dbc

Clean deploy without venv

Browse files

Files changed (19) hide show

.gitattributes +2 -0
.gitignore +35 -0
.vscode/settings.json +4 -0
Dockerfile +25 -0
app.py +164 -0
eda/.ipynb_checkpoints/Untitled-checkpoint.ipynb +6 -0
eda/Untitled.ipynb +1251 -0
main.py +28 -0
models/metadata.pkl +3 -0
models/movie_index.faiss +3 -0
requirements.txt +5 -0
src/__init__.py +0 -0
src/__pycache__/preprocessing.cpython-311.pyc +0 -0
src/__pycache__/preprocessing.cpython-312.pyc +0 -0
src/__pycache__/recommender.cpython-311.pyc +0 -0
src/ingest.py +121 -0
src/preprocessing.py +56 -0
src/recommender.py +156 -0
test.py +19 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.faiss filter=lfs diff=lfs merge=lfs -text
2	+ *.pkl filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,35 @@

+__pycache__/
+*.py[cod]
+*$py.class
+.venv/
+venv/
+env/
+data/
+*.csv
+*.json
+eda/
+*.ipynb
+models/
+*.faiss
+*.pkl
+*.bin
+*.pt
+.env
+.vscode/
+.idea/
+.DS_Store
+Thumbs.db
+.venv/
+data/

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "python-envs.pythonProjects": [],
+    "python-envs.defaultEnvManager": "ms-python.python:venv"
+}

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+# Use Python 3.9 (slim variant to keep the image small)
+FROM python:3.9-slim
+# Set working directory
+WORKDIR /app
+# Copy files
+COPY requirements.txt .
+# Install dependencies
+# We add --no-cache-dir to keep the image small
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application
+COPY . .
+# Create the directory for models if it doesn't exist
+RUN mkdir -p models
+# Expose port 7860 (Hugging Face default)
+EXPOSE 7860
+# Command to run the app
+# Note: Hugging Face expects the app on port 7860, not 8000
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,164 @@

+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Optional
+import os
+import uvicorn
+# Import our custom modules
+from src.recommender import MovieRecommender
+from src.ingest import run_weekly_update
+# --- Configuration ---
+app = FastAPI(
+    title="CineMatch API",
+    description="A content-based movie recommender using FAISS & Transformers.",
+    version="1.0.0"
+)
+# Enable CORS (Allows your future mobile app/website to talk to this API)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # In production, replace with specific domain
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Global State for the AI Model
+rec_engine: Optional[MovieRecommender] = None
+# --- Lifespan Events (Startup/Shutdown) ---
+@app.on_event("startup")
+async def startup_event():
+    """
+    Load the heavy AI model once when the server starts.
+    """
+    global rec_engine
+    rec_engine = MovieRecommender()
+    if os.path.exists('models/movie_index.faiss'):
+        print(" [INFO] Loading AI Model from disk...")
+        rec_engine.load('models/')
+        print(f" [INFO] Model loaded. Index contains {rec_engine.index.ntotal} movies.")
+    else:
+        print(" [WARNING] No model found at 'models/'. API will return errors until model is built.")
+# --- Pydantic Data Models (Schema) ---
+# Request body for POST /search.
+class SearchRequest(BaseModel):
+    # Free-text query that the handler forwards to recommend_on_text.
+    query: str
+    # Forwarded to the recommender as its k argument (presumably the number
+    # of results to return); defaults to 10.
+    k: int = 10
+# Request body for POST /recommend/vibe. Both tags and description are
+# optional, but the handler rejects a request where the combined text is empty.
+class VibeRequest(BaseModel):
+    tags: List[str] = []     # e.g., ["Sci-Fi", "90s"]
+    description: str = ""    # e.g., "Robots fighting in space"
+    # Forwarded to the recommender as its k argument; defaults to 10.
+    k: int = 10
+# Request body for POST /recommend/user.
+class UserHistoryRequest(BaseModel):
+    # Movie titles handed to the recommender's recommend_for_user method.
+    liked_movies: List[str]  # e.g., ["The Matrix", "Inception"]
+    # Forwarded to the recommender as its k argument; defaults to 10.
+    k: int = 10
+# Shape of one result item; used as the response_model element type of the
+# /search, /recommend/user and /recommend/movie/{title} endpoints.
+class MovieResponse(BaseModel):
+    movie_id: int
+    title: str
+    # Score attached to the result by the recommender.
+    # NOTE(review): its scale and direction are not visible from this file.
+    score: float
+# --- Helper Function ---
+def check_model():
+    if not rec_engine or not rec_engine.index:
+        raise HTTPException(status_code=503, detail="AI Model is not loaded. Run ingestion first.")
+# --- API Endpoints ---
+@app.get("/")
+def health_check():
+    """Simple check to see if server is running."""
+    return {"status": "online and active!!!", "model_loaded": rec_engine is not None}
+@app.post("/search", response_model=List[MovieResponse])
+def search_movies(request: SearchRequest):
+    """
+    Semantic Search: Convert query to vector -> Find nearest movies.
+    """
+    check_model()
+    results = rec_engine.recommend_on_text(request.query, k=request.k)
+    return results
+@app.post("/recommend/vibe", response_model=dict)
+def vibe_check(request: VibeRequest):
+    """
+    Recommends based on a mix of Tags and Description.
+    """
+    check_model()
+    # Construct the "Soup"
+    # We repeat tags to give them more weight in the vector space
+    tag_str = " ".join(request.tags) * 2
+    query_soup = f"{tag_str} {request.description}".strip()
+    if not query_soup:
+        raise HTTPException(status_code=400, detail="Please provide at least one tag or description.")
+    results = rec_engine.recommend_on_text(query_soup, k=request.k)
+    return {
+        "interpreted_query": query_soup,
+        "results": results
+    }
+@app.post("/recommend/user", response_model=List[MovieResponse])
+def recommend_for_user(request: UserHistoryRequest):
+    """
+    Takes a list of movie titles the user likes, averages their vectors,
+    and finds similar movies.
+    """
+    check_model()
+    results = rec_engine.recommend_for_user(request.liked_movies, k=request.k)
+    # Handle case where no movies were matched
+    if isinstance(results, str):
+        raise HTTPException(status_code=404, detail=results)
+    return results
+@app.get("/recommend/movie/{title}", response_model=List[MovieResponse])
+def recommend_similar_movie(title: str):
+    """
+    Classic 'Users who liked X also liked Y' (Content-based version).
+    """
+    check_model()
+    results = rec_engine.recommend_by_movie(title)
+    if isinstance(results, str):
+        raise HTTPException(status_code=404, detail=results)
+    return results
+# --- Admin / Operations ---
+def background_update_task():
+    """
+    Runs the ingestion script and reloads the model in memory.
+    """
+    print(" [BACKGROUND] Starting update process...")
+    # 1. Run the ingestion (Fetch from TMDB + Save to Disk)
+    run_weekly_update()
+    # 2. Reload the model in memory so the API sees the new movies immediately
+    print(" [BACKGROUND] Reloading model into RAM...")
+    rec_engine.load('models/')
+    print(" [BACKGROUND] Update complete. Model reloaded.")
+@app.post("/admin/trigger-update")
+def trigger_update(background_tasks: BackgroundTasks):
+    """
+    Manually triggers the 'Weekly Update' logic.
+    Runs in the background so the API doesn't freeze.
+    """
+    background_tasks.add_task(background_update_task)
+    return {"message": "Update process started in background. Check server logs for progress."}
+# --- Entry Point ---
+if __name__ == "__main__":
+    # Direct execution (python app.py) is meant for local debugging and serves
+    # on port 8000. The Dockerfile in this commit instead starts uvicorn from
+    # the command line on port 7860.
+    uvicorn.run(app, host="0.0.0.0", port=8000)

eda/.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

eda/Untitled.ipynb ADDED Viewed

	@@ -0,0 +1,1251 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "faab86b6-6af3-4b86-8288-534228ce01ea",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting pandas\n",
+      "  Using cached pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)\n",
+      "Requirement already satisfied: numpy in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (2.3.3)\n",
+      "Collecting seaborn\n",
+      "  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from pandas) (2.9.0.post0)\n",
+      "Collecting pytz>=2020.1 (from pandas)\n",
+      "  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n",
+      "Requirement already satisfied: tzdata>=2022.7 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from pandas) (2025.2)\n",
+      "Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)\n",
+      "  Downloading matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)\n",
+      "Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
+      "  Downloading contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.5 kB)\n",
+      "Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
+      "  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
+      "Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
+      "  Downloading fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (113 kB)\n",
+      "\u001b[2K     \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.2/113.2 kB\u001b[0m \u001b[31m276.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
+      "  Downloading kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (6.3 kB)\n",
+      "Requirement already satisfied: packaging>=20.0 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n",
+      "Requirement already satisfied: pillow>=8 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.3.0)\n",
+      "Collecting pyparsing>=3 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
+      "  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)\n",
+      "Requirement already satisfied: six>=1.5 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
+      "Using cached pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)\n",
+      "Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)\n",
+      "\u001b[2K   \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.9/294.9 kB\u001b[0m \u001b[31m409.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m745.6 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)\n",
+      "\u001b[2K   \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m0m\n",
+      "\u001b[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)\n",
+      "Downloading contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (355 kB)\n",
+      "\u001b[2K   \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m355.2/355.2 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
+      "Downloading fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (5.0 MB)\n",
+      "\u001b[2K   \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
+      "\u001b[?25hDownloading kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (1.4 MB)\n",
+      "\u001b[2K   \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
+      "\u001b[?25hDownloading pyparsing-3.2.5-py3-none-any.whl (113 kB)\n",
+      "\u001b[2K   \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.9/113.9 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: pytz, pyparsing, kiwisolver, fonttools, cycler, contourpy, pandas, matplotlib, seaborn\n",
+      "Successfully installed contourpy-1.3.3 cycler-0.12.1 fonttools-4.61.0 kiwisolver-1.4.9 matplotlib-3.10.7 pandas-2.3.3 pyparsing-3.2.5 pytz-2025.2 seaborn-0.13.2\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install pandas numpy seaborn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6c83aa85-43a2-4941-8ec4-510010ed72c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "6c692a3d-e17e-458b-9cfd-9edc2b9c76e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"~/Desktop/CineMatch/data/movies_metadata.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "761e4049-76e8-4637-b34a-f922a8237836",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>adult</th>\n",
+       "      <th>belongs_to_collection</th>\n",
+       "      <th>budget</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>homepage</th>\n",
+       "      <th>id</th>\n",
+       "      <th>imdb_id</th>\n",
+       "      <th>original_language</th>\n",
+       "      <th>original_title</th>\n",
+       "      <th>overview</th>\n",
+       "      <th>...</th>\n",
+       "      <th>release_date</th>\n",
+       "      <th>revenue</th>\n",
+       "      <th>runtime</th>\n",
+       "      <th>spoken_languages</th>\n",
+       "      <th>status</th>\n",
+       "      <th>tagline</th>\n",
+       "      <th>title</th>\n",
+       "      <th>video</th>\n",
+       "      <th>vote_average</th>\n",
+       "      <th>vote_count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>False</td>\n",
+       "      <td>{'id': 10194, 'name': 'Toy Story Collection', ...</td>\n",
+       "      <td>30000000</td>\n",
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
+       "      <td>http://toystory.disney.com/toy-story</td>\n",
+       "      <td>862</td>\n",
+       "      <td>tt0114709</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Toy Story</td>\n",
+       "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1995-10-30</td>\n",
+       "      <td>373554033.0</td>\n",
+       "      <td>81.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Toy Story</td>\n",
+       "      <td>False</td>\n",
+       "      <td>7.7</td>\n",
+       "      <td>5415.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>65000000</td>\n",
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>8844</td>\n",
+       "      <td>tt0113497</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Jumanji</td>\n",
+       "      <td>When siblings Judy and Peter discover an encha...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1995-12-15</td>\n",
+       "      <td>262797249.0</td>\n",
+       "      <td>104.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>Roll the dice and unleash the excitement!</td>\n",
+       "      <td>Jumanji</td>\n",
+       "      <td>False</td>\n",
+       "      <td>6.9</td>\n",
+       "      <td>2413.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>False</td>\n",
+       "      <td>{'id': 119050, 'name': 'Grumpy Old Men Collect...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>15602</td>\n",
+       "      <td>tt0113228</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Grumpier Old Men</td>\n",
+       "      <td>A family wedding reignites the ancient feud be...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1995-12-22</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>101.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>Still Yelling. Still Fighting. Still Ready for...</td>\n",
+       "      <td>Grumpier Old Men</td>\n",
+       "      <td>False</td>\n",
+       "      <td>6.5</td>\n",
+       "      <td>92.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>16000000</td>\n",
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>31357</td>\n",
+       "      <td>tt0114885</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Waiting to Exhale</td>\n",
+       "      <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1995-12-22</td>\n",
+       "      <td>81452156.0</td>\n",
+       "      <td>127.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>Friends are the people who let you be yourself...</td>\n",
+       "      <td>Waiting to Exhale</td>\n",
+       "      <td>False</td>\n",
+       "      <td>6.1</td>\n",
+       "      <td>34.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>False</td>\n",
+       "      <td>{'id': 96871, 'name': 'Father of the Bride Col...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>11862</td>\n",
+       "      <td>tt0113041</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Father of the Bride Part II</td>\n",
+       "      <td>Just when George Banks has recovered from his ...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1995-02-10</td>\n",
+       "      <td>76578911.0</td>\n",
+       "      <td>106.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>Just When His World Is Back To Normal... He's ...</td>\n",
+       "      <td>Father of the Bride Part II</td>\n",
+       "      <td>False</td>\n",
+       "      <td>5.7</td>\n",
+       "      <td>173.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45461</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...</td>\n",
+       "      <td>http://www.imdb.com/title/tt6209470/</td>\n",
+       "      <td>439050</td>\n",
+       "      <td>tt6209470</td>\n",
+       "      <td>fa</td>\n",
+       "      <td>رگ خواب</td>\n",
+       "      <td>Rising and falling between a man and woman.</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>[{'iso_639_1': 'fa', 'name': 'فارسی'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>Rising and falling between a man and woman</td>\n",
+       "      <td>Subdue</td>\n",
+       "      <td>False</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45462</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 18, 'name': 'Drama'}]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>111109</td>\n",
+       "      <td>tt2028550</td>\n",
+       "      <td>tl</td>\n",
+       "      <td>Siglo ng Pagluluwal</td>\n",
+       "      <td>An artist struggles to finish his work while a...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2011-11-17</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>360.0</td>\n",
+       "      <td>[{'iso_639_1': 'tl', 'name': ''}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Century of Birthing</td>\n",
+       "      <td>False</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45463</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>67758</td>\n",
+       "      <td>tt0303758</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Betrayal</td>\n",
+       "      <td>When one of her hits goes wrong, a professiona...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2003-08-01</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>A deadly game of wits.</td>\n",
+       "      <td>Betrayal</td>\n",
+       "      <td>False</td>\n",
+       "      <td>3.8</td>\n",
+       "      <td>6.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45464</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>227506</td>\n",
+       "      <td>tt0008536</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Satana likuyushchiy</td>\n",
+       "      <td>In a small town live two brothers, one a minis...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1917-10-21</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>87.0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Satan Triumphant</td>\n",
+       "      <td>False</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45465</th>\n",
+       "      <td>False</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>461257</td>\n",
+       "      <td>tt6980792</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Queerama</td>\n",
+       "      <td>50 years after decriminalisation of homosexual...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2017-06-09</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>75.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Queerama</td>\n",
+       "      <td>False</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>45466 rows × 24 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       adult                              belongs_to_collection    budget  \\\n",
+       "0      False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   \n",
+       "1      False                                                NaN  65000000   \n",
+       "2      False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   \n",
+       "3      False                                                NaN  16000000   \n",
+       "4      False  {'id': 96871, 'name': 'Father of the Bride Col...         0   \n",
+       "...      ...                                                ...       ...   \n",
+       "45461  False                                                NaN         0   \n",
+       "45462  False                                                NaN         0   \n",
+       "45463  False                                                NaN         0   \n",
+       "45464  False                                                NaN         0   \n",
+       "45465  False                                                NaN         0   \n",
+       "\n",
+       "                                                  genres  \\\n",
+       "0      [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   \n",
+       "1      [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   \n",
+       "2      [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   \n",
+       "3      [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   \n",
+       "4                         [{'id': 35, 'name': 'Comedy'}]   \n",
+       "...                                                  ...   \n",
+       "45461  [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...   \n",
+       "45462                      [{'id': 18, 'name': 'Drama'}]   \n",
+       "45463  [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...   \n",
+       "45464                                                 []   \n",
+       "45465                                                 []   \n",
+       "\n",
+       "                                   homepage      id    imdb_id  \\\n",
+       "0      http://toystory.disney.com/toy-story     862  tt0114709   \n",
+       "1                                       NaN    8844  tt0113497   \n",
+       "2                                       NaN   15602  tt0113228   \n",
+       "3                                       NaN   31357  tt0114885   \n",
+       "4                                       NaN   11862  tt0113041   \n",
+       "...                                     ...     ...        ...   \n",
+       "45461  http://www.imdb.com/title/tt6209470/  439050  tt6209470   \n",
+       "45462                                   NaN  111109  tt2028550   \n",
+       "45463                                   NaN   67758  tt0303758   \n",
+       "45464                                   NaN  227506  tt0008536   \n",
+       "45465                                   NaN  461257  tt6980792   \n",
+       "\n",
+       "      original_language               original_title  \\\n",
+       "0                    en                    Toy Story   \n",
+       "1                    en                      Jumanji   \n",
+       "2                    en             Grumpier Old Men   \n",
+       "3                    en            Waiting to Exhale   \n",
+       "4                    en  Father of the Bride Part II   \n",
+       "...                 ...                          ...   \n",
+       "45461                fa                      رگ خواب   \n",
+       "45462                tl          Siglo ng Pagluluwal   \n",
+       "45463                en                     Betrayal   \n",
+       "45464                en          Satana likuyushchiy   \n",
+       "45465                en                     Queerama   \n",
+       "\n",
+       "                                                overview  ... release_date  \\\n",
+       "0      Led by Woody, Andy's toys live happily in his ...  ...   1995-10-30   \n",
+       "1      When siblings Judy and Peter discover an encha...  ...   1995-12-15   \n",
+       "2      A family wedding reignites the ancient feud be...  ...   1995-12-22   \n",
+       "3      Cheated on, mistreated and stepped on, the wom...  ...   1995-12-22   \n",
+       "4      Just when George Banks has recovered from his ...  ...   1995-02-10   \n",
+       "...                                                  ...  ...          ...   \n",
+       "45461        Rising and falling between a man and woman.  ...          NaN   \n",
+       "45462  An artist struggles to finish his work while a...  ...   2011-11-17   \n",
+       "45463  When one of her hits goes wrong, a professiona...  ...   2003-08-01   \n",
+       "45464  In a small town live two brothers, one a minis...  ...   1917-10-21   \n",
+       "45465  50 years after decriminalisation of homosexual...  ...   2017-06-09   \n",
+       "\n",
+       "           revenue runtime                                   spoken_languages  \\\n",
+       "0      373554033.0    81.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "1      262797249.0   104.0  [{'iso_639_1': 'en', 'name': 'English'}, {'iso...   \n",
+       "2              0.0   101.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "3       81452156.0   127.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "4       76578911.0   106.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "...            ...     ...                                                ...   \n",
+       "45461          0.0    90.0             [{'iso_639_1': 'fa', 'name': 'فارسی'}]   \n",
+       "45462          0.0   360.0                  [{'iso_639_1': 'tl', 'name': ''}]   \n",
+       "45463          0.0    90.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "45464          0.0    87.0                                                 []   \n",
+       "45465          0.0    75.0           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "\n",
+       "         status                                            tagline  \\\n",
+       "0      Released                                                NaN   \n",
+       "1      Released          Roll the dice and unleash the excitement!   \n",
+       "2      Released  Still Yelling. Still Fighting. Still Ready for...   \n",
+       "3      Released  Friends are the people who let you be yourself...   \n",
+       "4      Released  Just When His World Is Back To Normal... He's ...   \n",
+       "...         ...                                                ...   \n",
+       "45461  Released         Rising and falling between a man and woman   \n",
+       "45462  Released                                                NaN   \n",
+       "45463  Released                             A deadly game of wits.   \n",
+       "45464  Released                                                NaN   \n",
+       "45465  Released                                                NaN   \n",
+       "\n",
+       "                             title  video vote_average vote_count  \n",
+       "0                        Toy Story  False          7.7     5415.0  \n",
+       "1                          Jumanji  False          6.9     2413.0  \n",
+       "2                 Grumpier Old Men  False          6.5       92.0  \n",
+       "3                Waiting to Exhale  False          6.1       34.0  \n",
+       "4      Father of the Bride Part II  False          5.7      173.0  \n",
+       "...                            ...    ...          ...        ...  \n",
+       "45461                       Subdue  False          4.0        1.0  \n",
+       "45462          Century of Birthing  False          9.0        3.0  \n",
+       "45463                     Betrayal  False          3.8        6.0  \n",
+       "45464             Satan Triumphant  False          0.0        0.0  \n",
+       "45465                     Queerama  False          0.0        0.0  \n",
+       "\n",
+       "[45466 rows x 24 columns]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "010b9a7e-2075-4119-9b3e-6a79970245c0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 45466 entries, 0 to 45465\n",
+      "Data columns (total 24 columns):\n",
+      " #   Column                 Non-Null Count  Dtype  \n",
+      "---  ------                 --------------  -----  \n",
+      " 0   adult                  45466 non-null  object \n",
+      " 1   belongs_to_collection  4494 non-null   object \n",
+      " 2   budget                 45466 non-null  object \n",
+      " 3   genres                 45466 non-null  object \n",
+      " 4   homepage               7782 non-null   object \n",
+      " 5   id                     45466 non-null  object \n",
+      " 6   imdb_id                45449 non-null  object \n",
+      " 7   original_language      45455 non-null  object \n",
+      " 8   original_title         45466 non-null  object \n",
+      " 9   overview               44512 non-null  object \n",
+      " 10  popularity             45461 non-null  object \n",
+      " 11  poster_path            45080 non-null  object \n",
+      " 12  production_companies   45463 non-null  object \n",
+      " 13  production_countries   45463 non-null  object \n",
+      " 14  release_date           45379 non-null  object \n",
+      " 15  revenue                45460 non-null  float64\n",
+      " 16  runtime                45203 non-null  float64\n",
+      " 17  spoken_languages       45460 non-null  object \n",
+      " 18  status                 45379 non-null  object \n",
+      " 19  tagline                20412 non-null  object \n",
+      " 20  title                  45460 non-null  object \n",
+      " 21  video                  45460 non-null  object \n",
+      " 22  vote_average           45460 non-null  float64\n",
+      " 23  vote_count             45460 non-null  float64\n",
+      "dtypes: float64(4), object(20)\n",
+      "memory usage: 8.3+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "e17d1cb7-6446-40c7-acf8-05934b5b66c0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "adult                                                                False\n",
+       "belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...\n",
+       "budget                                                            30000000\n",
+       "genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...\n",
+       "homepage                              http://toystory.disney.com/toy-story\n",
+       "id                                                                     862\n",
+       "imdb_id                                                          tt0114709\n",
+       "original_language                                                       en\n",
+       "original_title                                                   Toy Story\n",
+       "overview                 Led by Woody, Andy's toys live happily in his ...\n",
+       "popularity                                                       21.946943\n",
+       "poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg\n",
+       "production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]\n",
+       "production_countries     [{'iso_3166_1': 'US', 'name': 'United States o...\n",
+       "release_date                                                    1995-10-30\n",
+       "revenue                                                        373554033.0\n",
+       "runtime                                                               81.0\n",
+       "spoken_languages                  [{'iso_639_1': 'en', 'name': 'English'}]\n",
+       "status                                                            Released\n",
+       "tagline                                                                NaN\n",
+       "title                                                            Toy Story\n",
+       "video                                                                False\n",
+       "vote_average                                                           7.7\n",
+       "vote_count                                                          5415.0\n",
+       "Name: 0, dtype: object"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.iloc[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "0dc44094-31b5-4870-81db-16d3bc96e59e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop(['vote_count','vote_average','video'], axis=1,inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "12d1ffb9-5cf2-4f7d-8a92-85c8471233bd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}\""
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['belongs_to_collection'][2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "33cce957-eed8-429a-b9f7-1ce83e4ec49c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop('belongs_to_collection',axis=1,inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "ced88473-c105-4e0a-808b-f9903c8b8643",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop(['homepage','status',],axis=1,inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "519facbd-91c9-4784-aa32-44a0d9edb24f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>adult</th>\n",
+       "      <th>budget</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>id</th>\n",
+       "      <th>imdb_id</th>\n",
+       "      <th>original_language</th>\n",
+       "      <th>original_title</th>\n",
+       "      <th>overview</th>\n",
+       "      <th>popularity</th>\n",
+       "      <th>poster_path</th>\n",
+       "      <th>production_companies</th>\n",
+       "      <th>production_countries</th>\n",
+       "      <th>release_date</th>\n",
+       "      <th>revenue</th>\n",
+       "      <th>runtime</th>\n",
+       "      <th>spoken_languages</th>\n",
+       "      <th>tagline</th>\n",
+       "      <th>title</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>False</td>\n",
+       "      <td>30000000</td>\n",
+       "      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
+       "      <td>862</td>\n",
+       "      <td>tt0114709</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Toy Story</td>\n",
+       "      <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
+       "      <td>21.946943</td>\n",
+       "      <td>/rhIRbceoE9lR4veEXuwCC2wARtG.jpg</td>\n",
+       "      <td>[{'name': 'Pixar Animation Studios', 'id': 3}]</td>\n",
+       "      <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
+       "      <td>1995-10-30</td>\n",
+       "      <td>373554033.0</td>\n",
+       "      <td>81.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Toy Story</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>False</td>\n",
+       "      <td>65000000</td>\n",
+       "      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
+       "      <td>8844</td>\n",
+       "      <td>tt0113497</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Jumanji</td>\n",
+       "      <td>When siblings Judy and Peter discover an encha...</td>\n",
+       "      <td>17.015539</td>\n",
+       "      <td>/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg</td>\n",
+       "      <td>[{'name': 'TriStar Pictures', 'id': 559}, {'na...</td>\n",
+       "      <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
+       "      <td>1995-12-15</td>\n",
+       "      <td>262797249.0</td>\n",
+       "      <td>104.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n",
+       "      <td>Roll the dice and unleash the excitement!</td>\n",
+       "      <td>Jumanji</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
+       "      <td>15602</td>\n",
+       "      <td>tt0113228</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Grumpier Old Men</td>\n",
+       "      <td>A family wedding reignites the ancient feud be...</td>\n",
+       "      <td>11.7129</td>\n",
+       "      <td>/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg</td>\n",
+       "      <td>[{'name': 'Warner Bros.', 'id': 6194}, {'name'...</td>\n",
+       "      <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
+       "      <td>1995-12-22</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>101.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Still Yelling. Still Fighting. Still Ready for...</td>\n",
+       "      <td>Grumpier Old Men</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>False</td>\n",
+       "      <td>16000000</td>\n",
+       "      <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
+       "      <td>31357</td>\n",
+       "      <td>tt0114885</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Waiting to Exhale</td>\n",
+       "      <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
+       "      <td>3.859495</td>\n",
+       "      <td>/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg</td>\n",
+       "      <td>[{'name': 'Twentieth Century Fox Film Corporat...</td>\n",
+       "      <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
+       "      <td>1995-12-22</td>\n",
+       "      <td>81452156.0</td>\n",
+       "      <td>127.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Friends are the people who let you be yourself...</td>\n",
+       "      <td>Waiting to Exhale</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
+       "      <td>11862</td>\n",
+       "      <td>tt0113041</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Father of the Bride Part II</td>\n",
+       "      <td>Just when George Banks has recovered from his ...</td>\n",
+       "      <td>8.387519</td>\n",
+       "      <td>/e64sOI48hQXyru7naBFyssKFxVd.jpg</td>\n",
+       "      <td>[{'name': 'Sandollar Productions', 'id': 5842}...</td>\n",
+       "      <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
+       "      <td>1995-02-10</td>\n",
+       "      <td>76578911.0</td>\n",
+       "      <td>106.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>Just When His World Is Back To Normal... He's ...</td>\n",
+       "      <td>Father of the Bride Part II</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45461</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...</td>\n",
+       "      <td>439050</td>\n",
+       "      <td>tt6209470</td>\n",
+       "      <td>fa</td>\n",
+       "      <td>رگ خواب</td>\n",
+       "      <td>Rising and falling between a man and woman.</td>\n",
+       "      <td>0.072051</td>\n",
+       "      <td>/jldsYflnId4tTWPx8es3uzsB1I8.jpg</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>[{'iso_3166_1': 'IR', 'name': 'Iran'}]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>[{'iso_639_1': 'fa', 'name': 'فارسی'}]</td>\n",
+       "      <td>Rising and falling between a man and woman</td>\n",
+       "      <td>Subdue</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45462</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 18, 'name': 'Drama'}]</td>\n",
+       "      <td>111109</td>\n",
+       "      <td>tt2028550</td>\n",
+       "      <td>tl</td>\n",
+       "      <td>Siglo ng Pagluluwal</td>\n",
+       "      <td>An artist struggles to finish his work while a...</td>\n",
+       "      <td>0.178241</td>\n",
+       "      <td>/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg</td>\n",
+       "      <td>[{'name': 'Sine Olivia', 'id': 19653}]</td>\n",
+       "      <td>[{'iso_3166_1': 'PH', 'name': 'Philippines'}]</td>\n",
+       "      <td>2011-11-17</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>360.0</td>\n",
+       "      <td>[{'iso_639_1': 'tl', 'name': ''}]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Century of Birthing</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45463</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...</td>\n",
+       "      <td>67758</td>\n",
+       "      <td>tt0303758</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Betrayal</td>\n",
+       "      <td>When one of her hits goes wrong, a professiona...</td>\n",
+       "      <td>0.903007</td>\n",
+       "      <td>/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg</td>\n",
+       "      <td>[{'name': 'American World Pictures', 'id': 6165}]</td>\n",
+       "      <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
+       "      <td>2003-08-01</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>A deadly game of wits.</td>\n",
+       "      <td>Betrayal</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45464</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>227506</td>\n",
+       "      <td>tt0008536</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Satana likuyushchiy</td>\n",
+       "      <td>In a small town live two brothers, one a minis...</td>\n",
+       "      <td>0.003503</td>\n",
+       "      <td>/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg</td>\n",
+       "      <td>[{'name': 'Yermoliev', 'id': 88753}]</td>\n",
+       "      <td>[{'iso_3166_1': 'RU', 'name': 'Russia'}]</td>\n",
+       "      <td>1917-10-21</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>87.0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Satan Triumphant</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45465</th>\n",
+       "      <td>False</td>\n",
+       "      <td>0</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>461257</td>\n",
+       "      <td>tt6980792</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Queerama</td>\n",
+       "      <td>50 years after decriminalisation of homosexual...</td>\n",
+       "      <td>0.163015</td>\n",
+       "      <td>/s5UkZt6NTsrS7ZF0Rh8nzupRlIU.jpg</td>\n",
+       "      <td>[]</td>\n",
+       "      <td>[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]</td>\n",
+       "      <td>2017-06-09</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>75.0</td>\n",
+       "      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Queerama</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>45466 rows × 18 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       adult    budget                                             genres  \\\n",
+       "0      False  30000000  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   \n",
+       "1      False  65000000  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   \n",
+       "2      False         0  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   \n",
+       "3      False  16000000  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   \n",
+       "4      False         0                     [{'id': 35, 'name': 'Comedy'}]   \n",
+       "...      ...       ...                                                ...   \n",
+       "45461  False         0  [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...   \n",
+       "45462  False         0                      [{'id': 18, 'name': 'Drama'}]   \n",
+       "45463  False         0  [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...   \n",
+       "45464  False         0                                                 []   \n",
+       "45465  False         0                                                 []   \n",
+       "\n",
+       "           id    imdb_id original_language               original_title  \\\n",
+       "0         862  tt0114709                en                    Toy Story   \n",
+       "1        8844  tt0113497                en                      Jumanji   \n",
+       "2       15602  tt0113228                en             Grumpier Old Men   \n",
+       "3       31357  tt0114885                en            Waiting to Exhale   \n",
+       "4       11862  tt0113041                en  Father of the Bride Part II   \n",
+       "...       ...        ...               ...                          ...   \n",
+       "45461  439050  tt6209470                fa                      رگ خواب   \n",
+       "45462  111109  tt2028550                tl          Siglo ng Pagluluwal   \n",
+       "45463   67758  tt0303758                en                     Betrayal   \n",
+       "45464  227506  tt0008536                en          Satana likuyushchiy   \n",
+       "45465  461257  tt6980792                en                     Queerama   \n",
+       "\n",
+       "                                                overview popularity  \\\n",
+       "0      Led by Woody, Andy's toys live happily in his ...  21.946943   \n",
+       "1      When siblings Judy and Peter discover an encha...  17.015539   \n",
+       "2      A family wedding reignites the ancient feud be...    11.7129   \n",
+       "3      Cheated on, mistreated and stepped on, the wom...   3.859495   \n",
+       "4      Just when George Banks has recovered from his ...   8.387519   \n",
+       "...                                                  ...        ...   \n",
+       "45461        Rising and falling between a man and woman.   0.072051   \n",
+       "45462  An artist struggles to finish his work while a...   0.178241   \n",
+       "45463  When one of her hits goes wrong, a professiona...   0.903007   \n",
+       "45464  In a small town live two brothers, one a minis...   0.003503   \n",
+       "45465  50 years after decriminalisation of homosexual...   0.163015   \n",
+       "\n",
+       "                            poster_path  \\\n",
+       "0      /rhIRbceoE9lR4veEXuwCC2wARtG.jpg   \n",
+       "1      /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg   \n",
+       "2      /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg   \n",
+       "3      /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg   \n",
+       "4      /e64sOI48hQXyru7naBFyssKFxVd.jpg   \n",
+       "...                                 ...   \n",
+       "45461  /jldsYflnId4tTWPx8es3uzsB1I8.jpg   \n",
+       "45462  /xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg   \n",
+       "45463  /d5bX92nDsISNhu3ZT69uHwmfCGw.jpg   \n",
+       "45464  /aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg   \n",
+       "45465  /s5UkZt6NTsrS7ZF0Rh8nzupRlIU.jpg   \n",
+       "\n",
+       "                                    production_companies  \\\n",
+       "0         [{'name': 'Pixar Animation Studios', 'id': 3}]   \n",
+       "1      [{'name': 'TriStar Pictures', 'id': 559}, {'na...   \n",
+       "2      [{'name': 'Warner Bros.', 'id': 6194}, {'name'...   \n",
+       "3      [{'name': 'Twentieth Century Fox Film Corporat...   \n",
+       "4      [{'name': 'Sandollar Productions', 'id': 5842}...   \n",
+       "...                                                  ...   \n",
+       "45461                                                 []   \n",
+       "45462             [{'name': 'Sine Olivia', 'id': 19653}]   \n",
+       "45463  [{'name': 'American World Pictures', 'id': 6165}]   \n",
+       "45464               [{'name': 'Yermoliev', 'id': 88753}]   \n",
+       "45465                                                 []   \n",
+       "\n",
+       "                                    production_countries release_date  \\\n",
+       "0      [{'iso_3166_1': 'US', 'name': 'United States o...   1995-10-30   \n",
+       "1      [{'iso_3166_1': 'US', 'name': 'United States o...   1995-12-15   \n",
+       "2      [{'iso_3166_1': 'US', 'name': 'United States o...   1995-12-22   \n",
+       "3      [{'iso_3166_1': 'US', 'name': 'United States o...   1995-12-22   \n",
+       "4      [{'iso_3166_1': 'US', 'name': 'United States o...   1995-02-10   \n",
+       "...                                                  ...          ...   \n",
+       "45461             [{'iso_3166_1': 'IR', 'name': 'Iran'}]          NaN   \n",
+       "45462      [{'iso_3166_1': 'PH', 'name': 'Philippines'}]   2011-11-17   \n",
+       "45463  [{'iso_3166_1': 'US', 'name': 'United States o...   2003-08-01   \n",
+       "45464           [{'iso_3166_1': 'RU', 'name': 'Russia'}]   1917-10-21   \n",
+       "45465   [{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]   2017-06-09   \n",
+       "\n",
+       "           revenue  runtime  \\\n",
+       "0      373554033.0     81.0   \n",
+       "1      262797249.0    104.0   \n",
+       "2              0.0    101.0   \n",
+       "3       81452156.0    127.0   \n",
+       "4       76578911.0    106.0   \n",
+       "...            ...      ...   \n",
+       "45461          0.0     90.0   \n",
+       "45462          0.0    360.0   \n",
+       "45463          0.0     90.0   \n",
+       "45464          0.0     87.0   \n",
+       "45465          0.0     75.0   \n",
+       "\n",
+       "                                        spoken_languages  \\\n",
+       "0               [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "1      [{'iso_639_1': 'en', 'name': 'English'}, {'iso...   \n",
+       "2               [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "3               [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "4               [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "...                                                  ...   \n",
+       "45461             [{'iso_639_1': 'fa', 'name': 'فارسی'}]   \n",
+       "45462                  [{'iso_639_1': 'tl', 'name': ''}]   \n",
+       "45463           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "45464                                                 []   \n",
+       "45465           [{'iso_639_1': 'en', 'name': 'English'}]   \n",
+       "\n",
+       "                                                 tagline  \\\n",
+       "0                                                    NaN   \n",
+       "1              Roll the dice and unleash the excitement!   \n",
+       "2      Still Yelling. Still Fighting. Still Ready for...   \n",
+       "3      Friends are the people who let you be yourself...   \n",
+       "4      Just When His World Is Back To Normal... He's ...   \n",
+       "...                                                  ...   \n",
+       "45461         Rising and falling between a man and woman   \n",
+       "45462                                                NaN   \n",
+       "45463                             A deadly game of wits.   \n",
+       "45464                                                NaN   \n",
+       "45465                                                NaN   \n",
+       "\n",
+       "                             title  \n",
+       "0                        Toy Story  \n",
+       "1                          Jumanji  \n",
+       "2                 Grumpier Old Men  \n",
+       "3                Waiting to Exhale  \n",
+       "4      Father of the Bride Part II  \n",
+       "...                            ...  \n",
+       "45461                       Subdue  \n",
+       "45462          Century of Birthing  \n",
+       "45463                     Betrayal  \n",
+       "45464             Satan Triumphant  \n",
+       "45465                     Queerama  \n",
+       "\n",
+       "[45466 rows x 18 columns]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "0f83fc57-1c94-44a1-a4b1-b51ca334097f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count     45461\n",
+       "unique    44176\n",
+       "top         0.0\n",
+       "freq         34\n",
+       "Name: popularity, dtype: object"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['popularity'].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "e693550c-9eed-47ed-8de9-402919a59228",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>revenue</th>\n",
+       "      <th>runtime</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>4.546000e+04</td>\n",
+       "      <td>45203.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>1.120935e+07</td>\n",
+       "      <td>94.128199</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>6.433225e+07</td>\n",
+       "      <td>38.407810</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>85.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>95.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>0.000000e+00</td>\n",
+       "      <td>107.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>2.787965e+09</td>\n",
+       "      <td>1256.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            revenue       runtime\n",
+       "count  4.546000e+04  45203.000000\n",
+       "mean   1.120935e+07     94.128199\n",
+       "std    6.433225e+07     38.407810\n",
+       "min    0.000000e+00      0.000000\n",
+       "25%    0.000000e+00     85.000000\n",
+       "50%    0.000000e+00     95.000000\n",
+       "75%    0.000000e+00    107.000000\n",
+       "max    2.787965e+09   1256.000000"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ae3ea42-2343-4f8a-9740-adf7a11d7d5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop("
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

main.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from src.preprocessing import parse_features
+from src.recommender import MovieRecommender
+import time
+# Step 1: location of the raw metadata CSV (the file must exist at this path).
+csv_path = 'data/movies_metadata.csv'
+# Step 2: load the CSV and reduce it to the cleaned frame used for embedding.
+df = parse_features(csv_path)
+# Step 3: create the recommender object.
+rec = MovieRecommender()
+# Step 4: embed every movie and build the index. This is the slow part, so
+# the wall-clock duration is measured and reported in minutes.
+start_time = time.time()
+rec.preprocess_and_embed(df)
+elapsed_minutes = (time.time() - start_time) / 60
+print(f"Total time taken: {elapsed_minutes:.2f} minutes")
+# Step 5: persist the result so later runs can load it instead of recomputing.
+rec.save('models/')
+# Step 6: smoke-test the freshly built index with a title lookup.
+print("\n--- Test Recommendation ---")
+print(rec.recommend_by_movie("Jumanji"))

models/metadata.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78472dfc4fcf4703b633dc55e20c3ac24dcb627e0ddef239d63d89314cf56716
+size 19236982

models/movie_index.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f489fa4ce9d3a757267af4dacb3b9d3f7339bb9ebf798e0403e58fe14d2171a8
+size 69970989

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+pandas>=1.3.0
+numpy>=1.21.0
+requests>=2.26.0
+sentence-transformers>=2.2.0
+faiss-cpu>=1.7.2

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/preprocessing.cpython-311.pyc ADDED Viewed

Binary file (3.13 kB). View file

src/__pycache__/preprocessing.cpython-312.pyc ADDED Viewed

Binary file (2.63 kB). View file

src/__pycache__/recommender.cpython-311.pyc ADDED Viewed

Binary file (8.52 kB). View file

src/ingest.py ADDED Viewed

	@@ -0,0 +1,121 @@

+from dotenv import load_dotenv
+import os
+import requests
+import pandas as pd
+from src.recommender import MovieRecommender
+# Let python-dotenv populate the process environment before it is read below.
+load_dotenv()
+# CONFIGURATION
+# API credential (taken from the environment) and the endpoint root shared by
+# every request in this module.
+API_KEY = os.environ.get('API_KEY')
+BASE_URL = "https://api.themoviedb.org/3"
+def get_genre_map():
+    """
+    Fetch the genre ID -> name lookup from the API's movie genre list.
+    Example: {28: 'Action', 12: 'Adventure'}
+
+    Returns:
+        dict: genre id mapped to genre name, or an empty dict when the
+        request fails or the payload has no usable 'genres' list.
+    """
+    url = f"{BASE_URL}/genre/movie/list"
+    params = {'api_key': API_KEY, 'language': 'en-US'}
+    try:
+        # requests waits forever unless a timeout is given; bound the wait so
+        # a stalled connection cannot hang the whole update.
+        response = requests.get(url, params=params, timeout=10)
+        # Report HTTP errors (bad key, rate limit) as themselves rather than
+        # as a KeyError on the missing 'genres' field further down.
+        response.raise_for_status()
+        data = response.json()
+        # Turn list of dicts into a single lookup dictionary
+        mapping = {g['id']: g['name'] for g in data['genres']}
+        print("Genre map loaded.")
+        return mapping
+    except Exception as e:
+        # Deliberately best-effort: the caller copes with an empty map.
+        print(f"Failed to load genres: {e}")
+        return {}
+def get_popular_movies(limit=100):
+    """
+    Fetch up to `limit` movies from the API's "popular" listing.
+
+    Args:
+        limit: maximum number of movie dicts to return.
+
+    Returns:
+        list: raw result dicts in the order the API returned them, at most
+        `limit` long. Pages that fail are reported and skipped.
+    """
+    if limit <= 0:
+        # Nothing was asked for; skip the network round trip entirely.
+        return []
+    movies = []
+    # The page count assumes 20 results per page; the +1 leaves a spare page
+    # that is only requested when earlier pages came back short or failed.
+    pages = (limit // 20) + 1
+    # The endpoint does not change between pages, so build it once.
+    url = f"{BASE_URL}/movie/popular"
+    print(f"Fetching top {limit} popular movies...")
+    for page in range(1, pages + 1):
+        params = {
+            'api_key': API_KEY,
+            'language': 'en-US',
+            'page': page,
+        }
+        try:
+            # requests waits forever unless a timeout is given.
+            response = requests.get(url, params=params, timeout=10)
+            if response.status_code == 200:
+                movies.extend(response.json().get('results', []))
+            else:
+                print(f"Error on page {page}: {response.status_code}")
+        except Exception as e:
+            # Best-effort: report the failure and move on to the next page.
+            print(f"Connection failed: {e}")
+        if len(movies) >= limit:
+            break
+    return movies[:limit]
+def run_weekly_update():
+    """
+    Add currently popular movies that are missing from the saved model.
+
+    Loads the index and metadata from 'models/', fetches the popular list,
+    embeds every movie whose id is not stored yet, and saves the result back
+    to 'models/'. Returns early when the model cannot be loaded or when there
+    is nothing new to add.
+    """
+    # 1. Load the Brain
+    print("Loading existing database...")
+    rec = MovieRecommender()
+    try:
+        rec.load('models/')
+        existing_ids = set(rec.movies['id'].values)
+        print(f"Database currently holds {len(existing_ids)} movies.")
+    except Exception as e:
+        # `except Exception` (not a bare `except:`) lets Ctrl-C and SystemExit
+        # through, and the reason is printed instead of being discarded.
+        print("No existing model found. Cannot update empty model.")
+        print(f"Reason: {e}")
+        return
+    # 2. Get Data
+    candidates = get_popular_movies(limit=100)
+    genre_map = get_genre_map()
+    # 3. Filter Duplicates
+    # seen_ids starts as the stored ids and grows while scanning, so a movie
+    # that shows up on two result pages is only ingested once.
+    seen_ids = set(existing_ids)
+    new_movies_to_process = []
+    for m in candidates:
+        if 'id' not in m or 'title' not in m:
+            # Skip malformed entries rather than crash on a missing key later.
+            continue
+        if m['id'] in seen_ids:
+            continue
+        seen_ids.add(m['id'])
+        new_movies_to_process.append(m)
+    count = len(new_movies_to_process)
+    print(f"New additions found: {count}")
+    if count == 0:
+        return
+    # 4. Compute & Ingest
+    print(f"Processing {count} new movies...")
+    for m in new_movies_to_process:
+        # Resolve Genre IDs to Names
+        # m['genre_ids'] might look like [28, 12]
+        # We turn that into "Action Adventure"; ids missing from the map are
+        # dropped so they do not leave stray spaces in the text.
+        resolved = (genre_map.get(gid) for gid in m.get('genre_ids', []))
+        genre_str = " ".join(name for name in resolved if name)
+        # A null overview would otherwise be embedded as the text "None".
+        overview = m.get('overview') or ''
+        # Build the Soup with Tags
+        # Structure: Title + Title + Genres + Overview
+        soup = f"{m['title']} {m['title']} {genre_str} {overview}"
+        movie_data = {
+            'id': m['id'],
+            'title': m['title'],
+            'soup': soup
+        }
+        rec.add_new_movie(movie_data)
+        print(f" - Added: {m['title']} ({genre_str})")
+    # 5. Save
+    print("Saving updated index to disk...")
+    rec.save('models/')
+    print("Update Complete!")
+if __name__ == "__main__":
+    run_weekly_update()

src/preprocessing.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import pandas as pd
+import ast
+import numpy as np
+def clean_data(x):
+    """
+    Convert a stringified list of dicts such as "[{'name': 'Action'}]"
+    into a space-separated string of the names ("Action").
+
+    Args:
+        x: raw cell value; anything that is not a str yields "".
+
+    Returns:
+        str: the 'name' values joined by single spaces, or "" when the value
+        cannot be parsed, is not a list, or holds no usable names.
+    """
+    if not isinstance(x, str):
+        return ""
+    try:
+        # literal_eval only evaluates Python literals, never arbitrary code.
+        item_list = ast.literal_eval(x)
+    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
+        # The full set of exceptions literal_eval documents for bad input.
+        return ""
+    if not isinstance(item_list, list):
+        return ""
+    # Ignore entries that are not dicts or whose name is not text, so one odd
+    # element cannot abort processing of the whole column.
+    names = [
+        item['name']
+        for item in item_list
+        if isinstance(item, dict) and isinstance(item.get('name'), str)
+    ]
+    return ' '.join(names)
+def parse_features(data_path):
+    """
+    Load the movie metadata CSV and reduce it to the columns used for embedding.
+
+    Args:
+        data_path: path of the CSV; it must provide the columns 'id',
+            'title', 'overview', 'tagline' and 'genres'.
+
+    Returns:
+        pandas.DataFrame: columns 'id' (int), 'title' and 'soup', with a
+        fresh 0..n-1 index.
+    """
+    print(f"Loading data from {data_path}...")
+    # 1. Load Data
+    # low_memory=False infers each column's dtype from the whole file instead
+    # of chunk by chunk. No rows are skipped at this stage; rows with broken
+    # ids are removed in step 2.
+    df = pd.read_csv(data_path, low_memory=False)
+    # 2. Filter Bad IDs
+    # This dataset has a known bug where some IDs are dates (e.g., '1995-10-20')
+    # We force 'id' to numeric and drop rows that fail
+    df['id'] = pd.to_numeric(df['id'], errors='coerce')
+    # .copy() makes the filtered frame independent of the original, so the
+    # column assignments below cannot raise chained-assignment warnings.
+    df = df.dropna(subset=['id']).copy()
+    df['id'] = df['id'].astype(int)
+    # 3. Fill NaNs
+    # '[]' is the neutral value for genres because that column is parsed as a
+    # stringified list in step 4.
+    fillers = (('title', ''), ('overview', ''), ('tagline', ''), ('genres', '[]'))
+    for column, filler in fillers:
+        df[column] = df[column].fillna(filler)
+    print("Parsing genres (this might take a moment)...")
+    # 4. Clean Genres
+    df['genre_names'] = df['genres'].apply(clean_data)
+    # 5. Create the "Soup"
+    # We combine Title (2x weight), Tagline, Overview, and Genres.
+    # Zipping the columns produces the same strings as a row-wise apply
+    # without building a Series object for every row.
+    df['soup'] = [
+        f"{title} {title} {tagline} {overview} {genre_names}"
+        for title, tagline, overview, genre_names in zip(
+            df['title'], df['tagline'], df['overview'], df['genre_names']
+        )
+    ]
+    # 6. Return only what we need to save memory
+    final_df = df[['id', 'title', 'soup']].reset_index(drop=True)
+    print(f"Cleaned data: {len(final_df)} movies ready for embedding.")
+    return final_df

src/recommender.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import pandas as pd
+import numpy as np
+import faiss
+from sentence_transformers import SentenceTransformer
+import pickle
+import os
+class MovieRecommender:
+    """
+    Semantic movie recommender: sentence-transformer embeddings searched with
+    a FAISS inner-product index over L2-normalised vectors.
+
+    Row i of `movies`, vector i of `index` and key i of `id_map` describe the
+    same movie; every method that adds data keeps the three aligned.
+    """
+    def __init__(self, model_name='all-MiniLM-L6-v2'):
+        """
+        Args:
+            model_name: sentence-transformers model used to embed text.
+        """
+        # Load the Transformer model
+        self.encoder = SentenceTransformer(model_name)
+        # Ask the model for its embedding size so that a non-default
+        # model_name still gets an index of the right width; 384 (MiniLM)
+        # is only the fallback when the model does not report one.
+        self.d = self.encoder.get_sentence_embedding_dimension() or 384
+        self.index = None
+        self.movies = pd.DataFrame()
+        self.id_map = {}  # Maps FAISS index ID to Movie ID
+    def preprocess_and_embed(self, df):
+        """
+        Embed the 'soup' column of `df` and build a fresh index from it.
+
+        Args:
+            df: dataframe with 'id', 'title' and 'soup' columns.
+        """
+        self.movies = df.reset_index(drop=True)
+        # CPU OPTIMIZATION: Process in batches
+        # This prevents RAM spikes and shows you a progress bar
+        batch_size = 64
+        print(f"Generating embeddings on CPU in batches of {batch_size}...")
+        # sentence-transformers handles batching internally,
+        # but setting explicit batch_size helps on CPU
+        embeddings = self.encoder.encode(
+            self.movies['soup'].tolist(),
+            batch_size=batch_size,
+            show_progress_bar=True,
+            convert_to_numpy=True
+        )
+        # FAISS operates on C-contiguous float32 matrices.
+        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
+        print("Normalizing vectors...")
+        faiss.normalize_L2(embeddings)
+        print("Building Index...")
+        # IndexFlatIP is a brute-force inner-product index; on normalised
+        # vectors the inner product equals cosine similarity.
+        self.index = faiss.IndexFlatIP(self.d)
+        self.index.add(embeddings)
+        # Positional map; much cheaper than iterating the frame row by row.
+        self.id_map = dict(enumerate(self.movies['id'].tolist()))
+        print(f"Done. Indexed {self.index.ntotal} movies.")
+    def search(self, query_vector, k=5):
+        """
+        Low-level search: Input vector -> Top K movies.
+
+        Args:
+            query_vector: a single embedding, shaped (d,) or (1, d).
+            k: number of neighbours to request.
+
+        Returns:
+            list: dicts with 'movie_id', 'score' and 'title', in the order
+            FAISS returned them.
+        """
+        # Work on a private float32 copy: normalize_L2 rewrites its argument
+        # in place, which must not leak into the caller's array.
+        query = np.array(query_vector, dtype=np.float32, order='C').reshape(1, -1)
+        faiss.normalize_L2(query)
+        # Search
+        distances, indices = self.index.search(query, k)
+        results = []
+        for score, idx in zip(distances[0], indices[0]):
+            if idx == -1:  # FAISS returns -1 if not found
+                continue
+            position = int(idx)
+            results.append({
+                "movie_id": int(self.id_map[position]),
+                "score": float(score),
+                # Rows are aligned with index positions, so a positional
+                # lookup replaces a scan of the whole id column.
+                "title": self.movies.iloc[position]['title'],
+            })
+        return results
+    def _first_title_match(self, title):
+        """Return the first row whose title contains `title`, or None."""
+        if self.movies.empty:
+            return None
+        # regex=False: the title is user text, and characters such as "(" or
+        # "+" must be matched literally instead of being parsed as a pattern.
+        # na=False: rows without a title never match.
+        mask = self.movies['title'].str.contains(
+            title, case=False, regex=False, na=False
+        )
+        matches = self.movies[mask]
+        return None if matches.empty else matches.iloc[0]
+    def recommend_by_movie(self, movie_title, k=5):
+        """
+        Find movies similar to a specific movie title already in DB.
+
+        Args:
+            movie_title: case-insensitive substring of the title to look up.
+            k: number of results to request.
+
+        Returns:
+            list of result dicts (see `search`), or the string
+            "Movie not found." when no title matches.
+        """
+        movie = self._first_title_match(movie_title)
+        if movie is None:
+            return "Movie not found."
+        # The stored soup is re-encoded because vectors are not cached in
+        # the dataframe.
+        vec = self.encoder.encode([movie['soup']])
+        return self.search(vec, k)
+    def recommend_on_text(self, text_query, k=5):
+        """
+        Recommends movies based on a raw text description.
+        Example: "A romantic movie about a sinking ship" -> Titanic
+
+        Args:
+            text_query: free-text description to embed.
+            k: number of results to request.
+
+        Returns:
+            list of result dicts (see `search`).
+        """
+        print(f"Searching for: '{text_query}'...")
+        # Encode the user's text; `search` takes care of normalisation.
+        query_vector = self.encoder.encode([text_query])
+        return self.search(query_vector, k)
+    def recommend_for_user(self, liked_movie_titles, k=5):
+        """
+        The Personalized Logic.
+        1. Get vectors for all movies the user liked.
+        2. Average them to create a 'User Profile Vector'.
+        3. Search against the index.
+
+        Args:
+            liked_movie_titles: iterable of title substrings.
+            k: number of results to request.
+
+        Returns:
+            list of result dicts (see `search`), or the string
+            "No known movies provided." when none of the titles match.
+        """
+        vectors = []
+        for title in liked_movie_titles:
+            # Find movie in our DB; unknown titles are ignored.
+            movie = self._first_title_match(title)
+            if movie is not None:
+                vectors.append(self.encoder.encode(movie['soup']))
+        if not vectors:
+            return "No known movies provided."
+        # Average the vectors (User Profile)
+        user_vector = np.mean(vectors, axis=0)
+        return self.search(user_vector, k)
+    def add_new_movie(self, movie_dict):
+        """
+        Incremental Learning: Add a movie without retraining.
+        movie_dict: {'id': 123, 'title': 'New Movie', 'soup': 'description...'}
+        """
+        # 1. Vectorize the new soup
+        new_vec = self.encoder.encode([movie_dict['soup']])
+        new_vec = np.ascontiguousarray(new_vec, dtype=np.float32)
+        faiss.normalize_L2(new_vec)
+        # 2. Add to FAISS Index
+        # Vectors are stored sequentially, so the one about to be added
+        # lands at the current ntotal.
+        position = self.index.ntotal
+        self.index.add(new_vec)
+        # 3. Update Metadata DataFrame
+        new_row = pd.DataFrame([movie_dict])
+        self.movies = pd.concat([self.movies, new_row], ignore_index=True)
+        # 4. Update ID Map, keyed by the vector's position in the index
+        self.id_map[position] = movie_dict['id']
+    def save(self, path='models/'):
+        """Write the FAISS index and the metadata dataframe under `path`."""
+        os.makedirs(path, exist_ok=True)
+        faiss.write_index(self.index, os.path.join(path, 'movie_index.faiss'))
+        self.movies.to_pickle(os.path.join(path, 'metadata.pkl'))
+        # Note: We don't save the encoder, we load it fresh every time.
+    def load(self, path='models/'):
+        """Restore the index, the metadata and the id map saved by `save`."""
+        self.index = faiss.read_index(os.path.join(path, 'movie_index.faiss'))
+        # NOTE(security): unpickling can execute arbitrary code; only load
+        # metadata files that this project wrote itself.
+        metadata = pd.read_pickle(os.path.join(path, 'metadata.pkl'))
+        # A fresh 0..n-1 index keeps row positions equal to FAISS positions.
+        self.movies = metadata.reset_index(drop=True)
+        self.id_map = dict(enumerate(self.movies['id'].tolist()))

test.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from src.recommender import MovieRecommender
+# Build a recommender and restore the saved index and metadata from models/.
+rec = MovieRecommender()
+rec.load('models/')
+# Example 1: Vague description
+results = rec.recommend_on_text("running man")
+print(results)
+# The remaining examples are disabled; uncomment a pair of lines to try one.
+# # Example 2: Specific vibe
+# print(rec.recommend_on_text("mind bending sci-fi about dreams within dreams"))
+# # Expected: Inception
+# # Example 3: Emotional query
+# print(rec.recommend_on_text("sad movie that makes me cry about a dog"))
+# # Expected: Hachi: A Dog's Tale or Marley & Me