JermaineAI committed on
Commit
c6769c2
·
1 Parent(s): 22e5dbc

Clean deploy without venv

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.faiss filter=lfs diff=lfs merge=lfs -text
2
+ *.pkl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+
7
+ .venv/
8
+ venv/
9
+ env/
10
+
11
+
12
+ data/
13
+ *.csv
14
+ *.json
15
+
16
+ eda/
17
+ *.ipynb
18
+
19
+
20
+
21
+ models/
22
+ *.faiss
23
+ *.pkl
24
+ *.bin
25
+ *.pt
26
+
27
+
28
+ .env
29
+
30
+ .vscode/
31
+ .idea/
32
+
33
+ .DS_Store
34
+ Thumbs.db
+ .venv/
35
+ data/
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "python-envs.pythonProjects": [],
3
+ "python-envs.defaultEnvManager": "ms-python.python:venv"
4
+ }
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use Python 3.9 (slim variant to keep the image small)
2
+ FROM python:3.9-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Copy files
8
+ COPY requirements.txt .
9
+
10
+ # Install dependencies
11
+ # We add --no-cache-dir to keep the image small
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ # Copy the rest of the application
15
+ COPY . .
16
+
17
+ # Create the directory for models if it doesn't exist
18
+ RUN mkdir -p models
19
+
20
+ # Expose port 7860 (Hugging Face default)
21
+ EXPOSE 7860
22
+
23
+ # Command to run the app
24
+ # Note: Hugging Face expects the app on port 7860, not 8000
25
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from pydantic import BaseModel
4
+ from typing import List, Optional
5
+ import os
6
+ import uvicorn
7
+
8
+ # Import our custom modules
9
+ from src.recommender import MovieRecommender
10
+ from src.ingest import run_weekly_update
11
+
12
+ # --- Configuration ---
13
app = FastAPI(
    title="CineMatch API",
    description="A content-based movie recommender using FAISS & Transformers.",
    version="1.0.0"
)

# Enable CORS (allows a future mobile app/website to talk to this API).
# Allowed origins are read from the CORS_ALLOW_ORIGINS environment variable
# (comma-separated). When it is unset the API stays open to every origin.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed requests are only enabled for an explicit origin list:
    # combined with the "*" wildcard they would let any website call this
    # API with the visitor's cookies/credentials.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
27
+
28
+ # Global State for the AI Model
29
rec_engine: Optional[MovieRecommender] = None

# --- Lifespan Events (Startup/Shutdown) ---
@app.on_event("startup")
async def startup_event():
    """
    Create the recommender once at server start and, if
    'models/movie_index.faiss' exists, load the saved model from 'models/'.
    """
    global rec_engine
    rec_engine = MovieRecommender()

    index_on_disk = os.path.exists('models/movie_index.faiss')
    if not index_on_disk:
        # Nothing to load yet; the server still starts.
        print(" [WARNING] No model found at 'models/'. API will return errors until model is built.")
        return

    print(" [INFO] Loading AI Model from disk...")
    rec_engine.load('models/')
    print(f" [INFO] Model loaded. Index contains {rec_engine.index.ntotal} movies.")
46
+
47
+ # --- Pydantic Data Models (Schema) ---
48
class SearchRequest(BaseModel):
    """Request body for POST /search."""
    query: str  # free-text query forwarded to recommend_on_text
    k: int = 10  # forwarded as the k argument of recommend_on_text

class VibeRequest(BaseModel):
    """Request body for POST /recommend/vibe; tags and description are merged into one text query."""
    tags: List[str] = [] # e.g., ["Sci-Fi", "90s"]
    description: str = "" # e.g., "Robots fighting in space"
    k: int = 10  # forwarded as the k argument of recommend_on_text

class UserHistoryRequest(BaseModel):
    """Request body for POST /recommend/user."""
    liked_movies: List[str] # e.g., ["The Matrix", "Inception"]
    k: int = 10  # forwarded as the k argument of recommend_for_user

class MovieResponse(BaseModel):
    """One item of the List[MovieResponse] response models declared on the endpoints."""
    movie_id: int
    title: str
    score: float  # NOTE(review): meaning/range of the score is defined by the recommender — confirm
65
+
66
+ # --- Helper Function ---
67
def check_model():
    """Raise a 503 HTTPException unless the recommender and its index are both truthy."""
    model_ready = rec_engine and rec_engine.index
    if model_ready:
        return
    raise HTTPException(status_code=503, detail="AI Model is not loaded. Run ingestion first.")
70
+
71
+ # --- API Endpoints ---
72
+
73
@app.get("/")
def health_check():
    """
    Simple check to see if server is running.

    Returns a dict with a fixed "status" string and a "model_loaded" flag.
    The flag is True only when the recommender exists AND has an index,
    i.e. the same condition check_model() requires before serving results.
    Testing `rec_engine is not None` alone is not enough: the startup hook
    always creates the recommender, even when no index was found on disk.
    """
    model_loaded = bool(rec_engine and rec_engine.index)
    return {"status": "online and active!!!", "model_loaded": model_loaded}
77
+
78
@app.post("/search", response_model=List[MovieResponse])
def search_movies(request: SearchRequest):
    """
    Semantic Search: hand the query text to the recommender's text lookup
    and return whatever it produces for the requested k.

    Raises HTTPException 503 (via check_model) when the model is not loaded.
    """
    check_model()
    return rec_engine.recommend_on_text(request.query, k=request.k)
86
+
87
@app.post("/recommend/vibe", response_model=dict)
def vibe_check(request: VibeRequest):
    """
    Recommends based on a mix of Tags and Description.

    The tags and the description are merged into one text query (the
    "soup"), which is passed to the recommender's text lookup.

    Returns a dict with "interpreted_query" (the soup that was searched)
    and "results" (the recommender's output).

    Raises HTTPException 503 (via check_model) when the model is not
    loaded, and 400 when neither a tag nor a description was provided.
    """
    check_model()

    # Construct the "Soup".
    # Tags are listed twice to give them more weight in the vector space.
    # The list is repeated BEFORE joining so every copy stays separated by
    # a space; repeating the already-joined string would glue the last tag
    # onto the first one (e.g. "Sci-Fi 90sSci-Fi 90s").
    tags = [tag.strip() for tag in request.tags if tag.strip()]
    tag_str = " ".join(tags * 2)
    query_soup = f"{tag_str} {request.description}".strip()

    if not query_soup:
        raise HTTPException(status_code=400, detail="Please provide at least one tag or description.")

    results = rec_engine.recommend_on_text(query_soup, k=request.k)

    return {
        "interpreted_query": query_soup,
        "results": results
    }
108
+
109
@app.post("/recommend/user", response_model=List[MovieResponse])
def recommend_for_user(request: UserHistoryRequest):
    """
    Takes the list of movie titles the user likes and asks the recommender
    for similar movies.

    Raises HTTPException 503 (via check_model) when the model is not
    loaded, and 404 when the recommender answers with a message string
    instead of results.
    """
    check_model()
    outcome = rec_engine.recommend_for_user(request.liked_movies, k=request.k)

    # A plain string from the recommender is forwarded as the 404 detail.
    if not isinstance(outcome, str):
        return outcome
    raise HTTPException(status_code=404, detail=outcome)
123
+
124
@app.get("/recommend/movie/{title}", response_model=List[MovieResponse])
def recommend_similar_movie(title: str):
    """
    Classic 'Users who liked X also liked Y' (Content-based version):
    look up movies similar to the title given in the URL path.

    Raises HTTPException 503 (via check_model) when the model is not
    loaded, and 404 when the recommender answers with a message string
    instead of results.
    """
    check_model()
    outcome = rec_engine.recommend_by_movie(title)

    # A plain string from the recommender is forwarded as the 404 detail.
    if not isinstance(outcome, str):
        return outcome
    raise HTTPException(status_code=404, detail=outcome)
136
+
137
+ # --- Admin / Operations ---
138
+
139
def background_update_task():
    """
    Runs the ingestion script and reloads the model in memory.

    Scheduled by trigger_update() as a background task. Progress is only
    reported through print statements; nothing is returned.
    """
    print(" [BACKGROUND] Starting update process...")
    # 1. Run the ingestion (Fetch from TMDB + Save to Disk)
    run_weekly_update()

    # 2. Reload the model in memory so the API sees the new movies immediately
    # NOTE(review): the reload runs on the shared global recommender while
    # requests may be served — confirm load() is safe to call concurrently.
    print(" [BACKGROUND] Reloading model into RAM...")
    rec_engine.load('models/')
    print(" [BACKGROUND] Update complete. Model reloaded.")
151
+
152
# NOTE(review): no authentication is visible on this endpoint in this file —
# confirm it is protected elsewhere before exposing it publicly.
@app.post("/admin/trigger-update")
def trigger_update(background_tasks: BackgroundTasks):
    """
    Manually triggers the 'Weekly Update' logic.
    Runs in the background so the API doesn't freeze.

    The work is queued with BackgroundTasks.add_task and this handler
    returns a confirmation message without waiting for it.
    """
    background_tasks.add_task(background_update_task)
    return {"message": "Update process started in background. Check server logs for progress."}
160
+
161
+ # --- Entry Point ---
162
if __name__ == "__main__":
    # Use this for debugging. In production, use the command line.
    # Direct runs listen on port 8000; the Dockerfile CMD starts uvicorn on 7860 instead.
    uvicorn.run(app, host="0.0.0.0", port=8000)
eda/.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
eda/Untitled.ipynb ADDED
@@ -0,0 +1,1251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "faab86b6-6af3-4b86-8288-534228ce01ea",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Collecting pandas\n",
14
+ " Using cached pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)\n",
15
+ "Requirement already satisfied: numpy in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (2.3.3)\n",
16
+ "Collecting seaborn\n",
17
+ " Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)\n",
18
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from pandas) (2.9.0.post0)\n",
19
+ "Collecting pytz>=2020.1 (from pandas)\n",
20
+ " Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)\n",
21
+ "Requirement already satisfied: tzdata>=2022.7 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from pandas) (2025.2)\n",
22
+ "Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)\n",
23
+ " Downloading matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)\n",
24
+ "Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
25
+ " Downloading contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.5 kB)\n",
26
+ "Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
27
+ " Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
28
+ "Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
29
+ " Downloading fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (113 kB)\n",
30
+ "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.2/113.2 kB\u001b[0m \u001b[31m276.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
31
+ "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
32
+ " Downloading kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (6.3 kB)\n",
33
+ "Requirement already satisfied: packaging>=20.0 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0)\n",
34
+ "Requirement already satisfied: pillow>=8 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.3.0)\n",
35
+ "Collecting pyparsing>=3 (from matplotlib!=3.6.1,>=3.4->seaborn)\n",
36
+ " Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)\n",
37
+ "Requirement already satisfied: six>=1.5 in /home/jermaine/Desktop/CineMatch/venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
38
+ "Using cached pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)\n",
39
+ "Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)\n",
40
+ "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.9/294.9 kB\u001b[0m \u001b[31m409.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m745.6 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
41
+ "\u001b[?25hDownloading matplotlib-3.10.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)\n",
42
+ "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:01\u001b[0m0m\n",
43
+ "\u001b[?25hUsing cached pytz-2025.2-py2.py3-none-any.whl (509 kB)\n",
44
+ "Downloading contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (355 kB)\n",
45
+ "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m355.2/355.2 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
46
+ "\u001b[?25hDownloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
47
+ "Downloading fonttools-4.61.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (5.0 MB)\n",
48
+ "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
49
+ "\u001b[?25hDownloading kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (1.4 MB)\n",
50
+ "\u001b[2K \u001b[38;2;114;156;31m���━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
51
+ "\u001b[?25hDownloading pyparsing-3.2.5-py3-none-any.whl (113 kB)\n",
52
+ "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m113.9/113.9 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m\n",
53
+ "\u001b[?25hInstalling collected packages: pytz, pyparsing, kiwisolver, fonttools, cycler, contourpy, pandas, matplotlib, seaborn\n",
54
+ "Successfully installed contourpy-1.3.3 cycler-0.12.1 fonttools-4.61.0 kiwisolver-1.4.9 matplotlib-3.10.7 pandas-2.3.3 pyparsing-3.2.5 pytz-2025.2 seaborn-0.13.2\n",
55
+ "\n",
56
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.3\u001b[0m\n",
57
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
58
+ ]
59
+ }
60
+ ],
61
+ "source": [
62
+ "!pip install pandas numpy seaborn"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 7,
68
+ "id": "6c83aa85-43a2-4941-8ec4-510010ed72c1",
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "import pandas as pd\n",
73
+ "import numpy as np\n",
74
+ "import seaborn as sns\n",
75
+ "import warnings\n",
76
+ "warnings.filterwarnings(\"ignore\")"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 8,
82
+ "id": "6c692a3d-e17e-458b-9cfd-9edc2b9c76e5",
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "df = pd.read_csv(\"~/Desktop/CineMatch/data/movies_metadata.csv\")"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 9,
92
+ "id": "761e4049-76e8-4637-b34a-f922a8237836",
93
+ "metadata": {},
94
+ "outputs": [
95
+ {
96
+ "data": {
97
+ "text/html": [
98
+ "<div>\n",
99
+ "<style scoped>\n",
100
+ " .dataframe tbody tr th:only-of-type {\n",
101
+ " vertical-align: middle;\n",
102
+ " }\n",
103
+ "\n",
104
+ " .dataframe tbody tr th {\n",
105
+ " vertical-align: top;\n",
106
+ " }\n",
107
+ "\n",
108
+ " .dataframe thead th {\n",
109
+ " text-align: right;\n",
110
+ " }\n",
111
+ "</style>\n",
112
+ "<table border=\"1\" class=\"dataframe\">\n",
113
+ " <thead>\n",
114
+ " <tr style=\"text-align: right;\">\n",
115
+ " <th></th>\n",
116
+ " <th>adult</th>\n",
117
+ " <th>belongs_to_collection</th>\n",
118
+ " <th>budget</th>\n",
119
+ " <th>genres</th>\n",
120
+ " <th>homepage</th>\n",
121
+ " <th>id</th>\n",
122
+ " <th>imdb_id</th>\n",
123
+ " <th>original_language</th>\n",
124
+ " <th>original_title</th>\n",
125
+ " <th>overview</th>\n",
126
+ " <th>...</th>\n",
127
+ " <th>release_date</th>\n",
128
+ " <th>revenue</th>\n",
129
+ " <th>runtime</th>\n",
130
+ " <th>spoken_languages</th>\n",
131
+ " <th>status</th>\n",
132
+ " <th>tagline</th>\n",
133
+ " <th>title</th>\n",
134
+ " <th>video</th>\n",
135
+ " <th>vote_average</th>\n",
136
+ " <th>vote_count</th>\n",
137
+ " </tr>\n",
138
+ " </thead>\n",
139
+ " <tbody>\n",
140
+ " <tr>\n",
141
+ " <th>0</th>\n",
142
+ " <td>False</td>\n",
143
+ " <td>{'id': 10194, 'name': 'Toy Story Collection', ...</td>\n",
144
+ " <td>30000000</td>\n",
145
+ " <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
146
+ " <td>http://toystory.disney.com/toy-story</td>\n",
147
+ " <td>862</td>\n",
148
+ " <td>tt0114709</td>\n",
149
+ " <td>en</td>\n",
150
+ " <td>Toy Story</td>\n",
151
+ " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
152
+ " <td>...</td>\n",
153
+ " <td>1995-10-30</td>\n",
154
+ " <td>373554033.0</td>\n",
155
+ " <td>81.0</td>\n",
156
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
157
+ " <td>Released</td>\n",
158
+ " <td>NaN</td>\n",
159
+ " <td>Toy Story</td>\n",
160
+ " <td>False</td>\n",
161
+ " <td>7.7</td>\n",
162
+ " <td>5415.0</td>\n",
163
+ " </tr>\n",
164
+ " <tr>\n",
165
+ " <th>1</th>\n",
166
+ " <td>False</td>\n",
167
+ " <td>NaN</td>\n",
168
+ " <td>65000000</td>\n",
169
+ " <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
170
+ " <td>NaN</td>\n",
171
+ " <td>8844</td>\n",
172
+ " <td>tt0113497</td>\n",
173
+ " <td>en</td>\n",
174
+ " <td>Jumanji</td>\n",
175
+ " <td>When siblings Judy and Peter discover an encha...</td>\n",
176
+ " <td>...</td>\n",
177
+ " <td>1995-12-15</td>\n",
178
+ " <td>262797249.0</td>\n",
179
+ " <td>104.0</td>\n",
180
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n",
181
+ " <td>Released</td>\n",
182
+ " <td>Roll the dice and unleash the excitement!</td>\n",
183
+ " <td>Jumanji</td>\n",
184
+ " <td>False</td>\n",
185
+ " <td>6.9</td>\n",
186
+ " <td>2413.0</td>\n",
187
+ " </tr>\n",
188
+ " <tr>\n",
189
+ " <th>2</th>\n",
190
+ " <td>False</td>\n",
191
+ " <td>{'id': 119050, 'name': 'Grumpy Old Men Collect...</td>\n",
192
+ " <td>0</td>\n",
193
+ " <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
194
+ " <td>NaN</td>\n",
195
+ " <td>15602</td>\n",
196
+ " <td>tt0113228</td>\n",
197
+ " <td>en</td>\n",
198
+ " <td>Grumpier Old Men</td>\n",
199
+ " <td>A family wedding reignites the ancient feud be...</td>\n",
200
+ " <td>...</td>\n",
201
+ " <td>1995-12-22</td>\n",
202
+ " <td>0.0</td>\n",
203
+ " <td>101.0</td>\n",
204
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
205
+ " <td>Released</td>\n",
206
+ " <td>Still Yelling. Still Fighting. Still Ready for...</td>\n",
207
+ " <td>Grumpier Old Men</td>\n",
208
+ " <td>False</td>\n",
209
+ " <td>6.5</td>\n",
210
+ " <td>92.0</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>3</th>\n",
214
+ " <td>False</td>\n",
215
+ " <td>NaN</td>\n",
216
+ " <td>16000000</td>\n",
217
+ " <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
218
+ " <td>NaN</td>\n",
219
+ " <td>31357</td>\n",
220
+ " <td>tt0114885</td>\n",
221
+ " <td>en</td>\n",
222
+ " <td>Waiting to Exhale</td>\n",
223
+ " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
224
+ " <td>...</td>\n",
225
+ " <td>1995-12-22</td>\n",
226
+ " <td>81452156.0</td>\n",
227
+ " <td>127.0</td>\n",
228
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
229
+ " <td>Released</td>\n",
230
+ " <td>Friends are the people who let you be yourself...</td>\n",
231
+ " <td>Waiting to Exhale</td>\n",
232
+ " <td>False</td>\n",
233
+ " <td>6.1</td>\n",
234
+ " <td>34.0</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>4</th>\n",
238
+ " <td>False</td>\n",
239
+ " <td>{'id': 96871, 'name': 'Father of the Bride Col...</td>\n",
240
+ " <td>0</td>\n",
241
+ " <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
242
+ " <td>NaN</td>\n",
243
+ " <td>11862</td>\n",
244
+ " <td>tt0113041</td>\n",
245
+ " <td>en</td>\n",
246
+ " <td>Father of the Bride Part II</td>\n",
247
+ " <td>Just when George Banks has recovered from his ...</td>\n",
248
+ " <td>...</td>\n",
249
+ " <td>1995-02-10</td>\n",
250
+ " <td>76578911.0</td>\n",
251
+ " <td>106.0</td>\n",
252
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
253
+ " <td>Released</td>\n",
254
+ " <td>Just When His World Is Back To Normal... He's ...</td>\n",
255
+ " <td>Father of the Bride Part II</td>\n",
256
+ " <td>False</td>\n",
257
+ " <td>5.7</td>\n",
258
+ " <td>173.0</td>\n",
259
+ " </tr>\n",
260
+ " <tr>\n",
261
+ " <th>...</th>\n",
262
+ " <td>...</td>\n",
263
+ " <td>...</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>...</td>\n",
266
+ " <td>...</td>\n",
267
+ " <td>...</td>\n",
268
+ " <td>...</td>\n",
269
+ " <td>...</td>\n",
270
+ " <td>...</td>\n",
271
+ " <td>...</td>\n",
272
+ " <td>...</td>\n",
273
+ " <td>...</td>\n",
274
+ " <td>...</td>\n",
275
+ " <td>...</td>\n",
276
+ " <td>...</td>\n",
277
+ " <td>...</td>\n",
278
+ " <td>...</td>\n",
279
+ " <td>...</td>\n",
280
+ " <td>...</td>\n",
281
+ " <td>...</td>\n",
282
+ " <td>...</td>\n",
283
+ " </tr>\n",
284
+ " <tr>\n",
285
+ " <th>45461</th>\n",
286
+ " <td>False</td>\n",
287
+ " <td>NaN</td>\n",
288
+ " <td>0</td>\n",
289
+ " <td>[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...</td>\n",
290
+ " <td>http://www.imdb.com/title/tt6209470/</td>\n",
291
+ " <td>439050</td>\n",
292
+ " <td>tt6209470</td>\n",
293
+ " <td>fa</td>\n",
294
+ " <td>رگ خواب</td>\n",
295
+ " <td>Rising and falling between a man and woman.</td>\n",
296
+ " <td>...</td>\n",
297
+ " <td>NaN</td>\n",
298
+ " <td>0.0</td>\n",
299
+ " <td>90.0</td>\n",
300
+ " <td>[{'iso_639_1': 'fa', 'name': 'فارسی'}]</td>\n",
301
+ " <td>Released</td>\n",
302
+ " <td>Rising and falling between a man and woman</td>\n",
303
+ " <td>Subdue</td>\n",
304
+ " <td>False</td>\n",
305
+ " <td>4.0</td>\n",
306
+ " <td>1.0</td>\n",
307
+ " </tr>\n",
308
+ " <tr>\n",
309
+ " <th>45462</th>\n",
310
+ " <td>False</td>\n",
311
+ " <td>NaN</td>\n",
312
+ " <td>0</td>\n",
313
+ " <td>[{'id': 18, 'name': 'Drama'}]</td>\n",
314
+ " <td>NaN</td>\n",
315
+ " <td>111109</td>\n",
316
+ " <td>tt2028550</td>\n",
317
+ " <td>tl</td>\n",
318
+ " <td>Siglo ng Pagluluwal</td>\n",
319
+ " <td>An artist struggles to finish his work while a...</td>\n",
320
+ " <td>...</td>\n",
321
+ " <td>2011-11-17</td>\n",
322
+ " <td>0.0</td>\n",
323
+ " <td>360.0</td>\n",
324
+ " <td>[{'iso_639_1': 'tl', 'name': ''}]</td>\n",
325
+ " <td>Released</td>\n",
326
+ " <td>NaN</td>\n",
327
+ " <td>Century of Birthing</td>\n",
328
+ " <td>False</td>\n",
329
+ " <td>9.0</td>\n",
330
+ " <td>3.0</td>\n",
331
+ " </tr>\n",
332
+ " <tr>\n",
333
+ " <th>45463</th>\n",
334
+ " <td>False</td>\n",
335
+ " <td>NaN</td>\n",
336
+ " <td>0</td>\n",
337
+ " <td>[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...</td>\n",
338
+ " <td>NaN</td>\n",
339
+ " <td>67758</td>\n",
340
+ " <td>tt0303758</td>\n",
341
+ " <td>en</td>\n",
342
+ " <td>Betrayal</td>\n",
343
+ " <td>When one of her hits goes wrong, a professiona...</td>\n",
344
+ " <td>...</td>\n",
345
+ " <td>2003-08-01</td>\n",
346
+ " <td>0.0</td>\n",
347
+ " <td>90.0</td>\n",
348
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
349
+ " <td>Released</td>\n",
350
+ " <td>A deadly game of wits.</td>\n",
351
+ " <td>Betrayal</td>\n",
352
+ " <td>False</td>\n",
353
+ " <td>3.8</td>\n",
354
+ " <td>6.0</td>\n",
355
+ " </tr>\n",
356
+ " <tr>\n",
357
+ " <th>45464</th>\n",
358
+ " <td>False</td>\n",
359
+ " <td>NaN</td>\n",
360
+ " <td>0</td>\n",
361
+ " <td>[]</td>\n",
362
+ " <td>NaN</td>\n",
363
+ " <td>227506</td>\n",
364
+ " <td>tt0008536</td>\n",
365
+ " <td>en</td>\n",
366
+ " <td>Satana likuyushchiy</td>\n",
367
+ " <td>In a small town live two brothers, one a minis...</td>\n",
368
+ " <td>...</td>\n",
369
+ " <td>1917-10-21</td>\n",
370
+ " <td>0.0</td>\n",
371
+ " <td>87.0</td>\n",
372
+ " <td>[]</td>\n",
373
+ " <td>Released</td>\n",
374
+ " <td>NaN</td>\n",
375
+ " <td>Satan Triumphant</td>\n",
376
+ " <td>False</td>\n",
377
+ " <td>0.0</td>\n",
378
+ " <td>0.0</td>\n",
379
+ " </tr>\n",
380
+ " <tr>\n",
381
+ " <th>45465</th>\n",
382
+ " <td>False</td>\n",
383
+ " <td>NaN</td>\n",
384
+ " <td>0</td>\n",
385
+ " <td>[]</td>\n",
386
+ " <td>NaN</td>\n",
387
+ " <td>461257</td>\n",
388
+ " <td>tt6980792</td>\n",
389
+ " <td>en</td>\n",
390
+ " <td>Queerama</td>\n",
391
+ " <td>50 years after decriminalisation of homosexual...</td>\n",
392
+ " <td>...</td>\n",
393
+ " <td>2017-06-09</td>\n",
394
+ " <td>0.0</td>\n",
395
+ " <td>75.0</td>\n",
396
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
397
+ " <td>Released</td>\n",
398
+ " <td>NaN</td>\n",
399
+ " <td>Queerama</td>\n",
400
+ " <td>False</td>\n",
401
+ " <td>0.0</td>\n",
402
+ " <td>0.0</td>\n",
403
+ " </tr>\n",
404
+ " </tbody>\n",
405
+ "</table>\n",
406
+ "<p>45466 rows × 24 columns</p>\n",
407
+ "</div>"
408
+ ],
409
+ "text/plain": [
410
+ " adult belongs_to_collection budget \\\n",
411
+ "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
412
+ "1 False NaN 65000000 \n",
413
+ "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
414
+ "3 False NaN 16000000 \n",
415
+ "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
416
+ "... ... ... ... \n",
417
+ "45461 False NaN 0 \n",
418
+ "45462 False NaN 0 \n",
419
+ "45463 False NaN 0 \n",
420
+ "45464 False NaN 0 \n",
421
+ "45465 False NaN 0 \n",
422
+ "\n",
423
+ " genres \\\n",
424
+ "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
425
+ "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
426
+ "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
427
+ "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
428
+ "4 [{'id': 35, 'name': 'Comedy'}] \n",
429
+ "... ... \n",
430
+ "45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n",
431
+ "45462 [{'id': 18, 'name': 'Drama'}] \n",
432
+ "45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n",
433
+ "45464 [] \n",
434
+ "45465 [] \n",
435
+ "\n",
436
+ " homepage id imdb_id \\\n",
437
+ "0 http://toystory.disney.com/toy-story 862 tt0114709 \n",
438
+ "1 NaN 8844 tt0113497 \n",
439
+ "2 NaN 15602 tt0113228 \n",
440
+ "3 NaN 31357 tt0114885 \n",
441
+ "4 NaN 11862 tt0113041 \n",
442
+ "... ... ... ... \n",
443
+ "45461 http://www.imdb.com/title/tt6209470/ 439050 tt6209470 \n",
444
+ "45462 NaN 111109 tt2028550 \n",
445
+ "45463 NaN 67758 tt0303758 \n",
446
+ "45464 NaN 227506 tt0008536 \n",
447
+ "45465 NaN 461257 tt6980792 \n",
448
+ "\n",
449
+ " original_language original_title \\\n",
450
+ "0 en Toy Story \n",
451
+ "1 en Jumanji \n",
452
+ "2 en Grumpier Old Men \n",
453
+ "3 en Waiting to Exhale \n",
454
+ "4 en Father of the Bride Part II \n",
455
+ "... ... ... \n",
456
+ "45461 fa رگ خواب \n",
457
+ "45462 tl Siglo ng Pagluluwal \n",
458
+ "45463 en Betrayal \n",
459
+ "45464 en Satana likuyushchiy \n",
460
+ "45465 en Queerama \n",
461
+ "\n",
462
+ " overview ... release_date \\\n",
463
+ "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
464
+ "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
465
+ "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
466
+ "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
467
+ "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
468
+ "... ... ... ... \n",
469
+ "45461 Rising and falling between a man and woman. ... NaN \n",
470
+ "45462 An artist struggles to finish his work while a... ... 2011-11-17 \n",
471
+ "45463 When one of her hits goes wrong, a professiona... ... 2003-08-01 \n",
472
+ "45464 In a small town live two brothers, one a minis... ... 1917-10-21 \n",
473
+ "45465 50 years after decriminalisation of homosexual... ... 2017-06-09 \n",
474
+ "\n",
475
+ " revenue runtime spoken_languages \\\n",
476
+ "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
477
+ "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
478
+ "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
479
+ "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
480
+ "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
481
+ "... ... ... ... \n",
482
+ "45461 0.0 90.0 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n",
483
+ "45462 0.0 360.0 [{'iso_639_1': 'tl', 'name': ''}] \n",
484
+ "45463 0.0 90.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
485
+ "45464 0.0 87.0 [] \n",
486
+ "45465 0.0 75.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
487
+ "\n",
488
+ " status tagline \\\n",
489
+ "0 Released NaN \n",
490
+ "1 Released Roll the dice and unleash the excitement! \n",
491
+ "2 Released Still Yelling. Still Fighting. Still Ready for... \n",
492
+ "3 Released Friends are the people who let you be yourself... \n",
493
+ "4 Released Just When His World Is Back To Normal... He's ... \n",
494
+ "... ... ... \n",
495
+ "45461 Released Rising and falling between a man and woman \n",
496
+ "45462 Released NaN \n",
497
+ "45463 Released A deadly game of wits. \n",
498
+ "45464 Released NaN \n",
499
+ "45465 Released NaN \n",
500
+ "\n",
501
+ " title video vote_average vote_count \n",
502
+ "0 Toy Story False 7.7 5415.0 \n",
503
+ "1 Jumanji False 6.9 2413.0 \n",
504
+ "2 Grumpier Old Men False 6.5 92.0 \n",
505
+ "3 Waiting to Exhale False 6.1 34.0 \n",
506
+ "4 Father of the Bride Part II False 5.7 173.0 \n",
507
+ "... ... ... ... ... \n",
508
+ "45461 Subdue False 4.0 1.0 \n",
509
+ "45462 Century of Birthing False 9.0 3.0 \n",
510
+ "45463 Betrayal False 3.8 6.0 \n",
511
+ "45464 Satan Triumphant False 0.0 0.0 \n",
512
+ "45465 Queerama False 0.0 0.0 \n",
513
+ "\n",
514
+ "[45466 rows x 24 columns]"
515
+ ]
516
+ },
517
+ "execution_count": 9,
518
+ "metadata": {},
519
+ "output_type": "execute_result"
520
+ }
521
+ ],
522
+ "source": [
523
+ "df"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 10,
529
+ "id": "010b9a7e-2075-4119-9b3e-6a79970245c0",
530
+ "metadata": {},
531
+ "outputs": [
532
+ {
533
+ "name": "stdout",
534
+ "output_type": "stream",
535
+ "text": [
536
+ "<class 'pandas.core.frame.DataFrame'>\n",
537
+ "RangeIndex: 45466 entries, 0 to 45465\n",
538
+ "Data columns (total 24 columns):\n",
539
+ " # Column Non-Null Count Dtype \n",
540
+ "--- ------ -------------- ----- \n",
541
+ " 0 adult 45466 non-null object \n",
542
+ " 1 belongs_to_collection 4494 non-null object \n",
543
+ " 2 budget 45466 non-null object \n",
544
+ " 3 genres 45466 non-null object \n",
545
+ " 4 homepage 7782 non-null object \n",
546
+ " 5 id 45466 non-null object \n",
547
+ " 6 imdb_id 45449 non-null object \n",
548
+ " 7 original_language 45455 non-null object \n",
549
+ " 8 original_title 45466 non-null object \n",
550
+ " 9 overview 44512 non-null object \n",
551
+ " 10 popularity 45461 non-null object \n",
552
+ " 11 poster_path 45080 non-null object \n",
553
+ " 12 production_companies 45463 non-null object \n",
554
+ " 13 production_countries 45463 non-null object \n",
555
+ " 14 release_date 45379 non-null object \n",
556
+ " 15 revenue 45460 non-null float64\n",
557
+ " 16 runtime 45203 non-null float64\n",
558
+ " 17 spoken_languages 45460 non-null object \n",
559
+ " 18 status 45379 non-null object \n",
560
+ " 19 tagline 20412 non-null object \n",
561
+ " 20 title 45460 non-null object \n",
562
+ " 21 video 45460 non-null object \n",
563
+ " 22 vote_average 45460 non-null float64\n",
564
+ " 23 vote_count 45460 non-null float64\n",
565
+ "dtypes: float64(4), object(20)\n",
566
+ "memory usage: 8.3+ MB\n"
567
+ ]
568
+ }
569
+ ],
570
+ "source": [
571
+ "df.info()"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": 11,
577
+ "id": "e17d1cb7-6446-40c7-acf8-05934b5b66c0",
578
+ "metadata": {},
579
+ "outputs": [
580
+ {
581
+ "data": {
582
+ "text/plain": [
583
+ "adult False\n",
584
+ "belongs_to_collection {'id': 10194, 'name': 'Toy Story Collection', ...\n",
585
+ "budget 30000000\n",
586
+ "genres [{'id': 16, 'name': 'Animation'}, {'id': 35, '...\n",
587
+ "homepage http://toystory.disney.com/toy-story\n",
588
+ "id 862\n",
589
+ "imdb_id tt0114709\n",
590
+ "original_language en\n",
591
+ "original_title Toy Story\n",
592
+ "overview Led by Woody, Andy's toys live happily in his ...\n",
593
+ "popularity 21.946943\n",
594
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg\n",
595
+ "production_companies [{'name': 'Pixar Animation Studios', 'id': 3}]\n",
596
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n",
597
+ "release_date 1995-10-30\n",
598
+ "revenue 373554033.0\n",
599
+ "runtime 81.0\n",
600
+ "spoken_languages [{'iso_639_1': 'en', 'name': 'English'}]\n",
601
+ "status Released\n",
602
+ "tagline NaN\n",
603
+ "title Toy Story\n",
604
+ "video False\n",
605
+ "vote_average 7.7\n",
606
+ "vote_count 5415.0\n",
607
+ "Name: 0, dtype: object"
608
+ ]
609
+ },
610
+ "execution_count": 11,
611
+ "metadata": {},
612
+ "output_type": "execute_result"
613
+ }
614
+ ],
615
+ "source": [
616
+ "df.iloc[0]"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 14,
622
+ "id": "0dc44094-31b5-4870-81db-16d3bc96e59e",
623
+ "metadata": {},
624
+ "outputs": [],
625
+ "source": [
626
+ "df.drop(['vote_count','vote_average','video'], axis=1,inplace=True)\n"
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "code",
631
+ "execution_count": 18,
632
+ "id": "12d1ffb9-5cf2-4f7d-8a92-85c8471233bd",
633
+ "metadata": {},
634
+ "outputs": [
635
+ {
636
+ "data": {
637
+ "text/plain": [
638
+ "\"{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}\""
639
+ ]
640
+ },
641
+ "execution_count": 18,
642
+ "metadata": {},
643
+ "output_type": "execute_result"
644
+ }
645
+ ],
646
+ "source": [
647
+ "df['belongs_to_collection'][2]"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": 20,
653
+ "id": "33cce957-eed8-429a-b9f7-1ce83e4ec49c",
654
+ "metadata": {},
655
+ "outputs": [],
656
+ "source": [
657
+ "df.drop('belongs_to_collection',axis=1,inplace=True)"
658
+ ]
659
+ },
660
+ {
661
+ "cell_type": "code",
662
+ "execution_count": 21,
663
+ "id": "ced88473-c105-4e0a-808b-f9903c8b8643",
664
+ "metadata": {},
665
+ "outputs": [],
666
+ "source": [
667
+ "df.drop(['homepage','status',],axis=1,inplace=True)"
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": 22,
673
+ "id": "519facbd-91c9-4784-aa32-44a0d9edb24f",
674
+ "metadata": {},
675
+ "outputs": [
676
+ {
677
+ "data": {
678
+ "text/html": [
679
+ "<div>\n",
680
+ "<style scoped>\n",
681
+ " .dataframe tbody tr th:only-of-type {\n",
682
+ " vertical-align: middle;\n",
683
+ " }\n",
684
+ "\n",
685
+ " .dataframe tbody tr th {\n",
686
+ " vertical-align: top;\n",
687
+ " }\n",
688
+ "\n",
689
+ " .dataframe thead th {\n",
690
+ " text-align: right;\n",
691
+ " }\n",
692
+ "</style>\n",
693
+ "<table border=\"1\" class=\"dataframe\">\n",
694
+ " <thead>\n",
695
+ " <tr style=\"text-align: right;\">\n",
696
+ " <th></th>\n",
697
+ " <th>adult</th>\n",
698
+ " <th>budget</th>\n",
699
+ " <th>genres</th>\n",
700
+ " <th>id</th>\n",
701
+ " <th>imdb_id</th>\n",
702
+ " <th>original_language</th>\n",
703
+ " <th>original_title</th>\n",
704
+ " <th>overview</th>\n",
705
+ " <th>popularity</th>\n",
706
+ " <th>poster_path</th>\n",
707
+ " <th>production_companies</th>\n",
708
+ " <th>production_countries</th>\n",
709
+ " <th>release_date</th>\n",
710
+ " <th>revenue</th>\n",
711
+ " <th>runtime</th>\n",
712
+ " <th>spoken_languages</th>\n",
713
+ " <th>tagline</th>\n",
714
+ " <th>title</th>\n",
715
+ " </tr>\n",
716
+ " </thead>\n",
717
+ " <tbody>\n",
718
+ " <tr>\n",
719
+ " <th>0</th>\n",
720
+ " <td>False</td>\n",
721
+ " <td>30000000</td>\n",
722
+ " <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n",
723
+ " <td>862</td>\n",
724
+ " <td>tt0114709</td>\n",
725
+ " <td>en</td>\n",
726
+ " <td>Toy Story</td>\n",
727
+ " <td>Led by Woody, Andy's toys live happily in his ...</td>\n",
728
+ " <td>21.946943</td>\n",
729
+ " <td>/rhIRbceoE9lR4veEXuwCC2wARtG.jpg</td>\n",
730
+ " <td>[{'name': 'Pixar Animation Studios', 'id': 3}]</td>\n",
731
+ " <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
732
+ " <td>1995-10-30</td>\n",
733
+ " <td>373554033.0</td>\n",
734
+ " <td>81.0</td>\n",
735
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
736
+ " <td>NaN</td>\n",
737
+ " <td>Toy Story</td>\n",
738
+ " </tr>\n",
739
+ " <tr>\n",
740
+ " <th>1</th>\n",
741
+ " <td>False</td>\n",
742
+ " <td>65000000</td>\n",
743
+ " <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n",
744
+ " <td>8844</td>\n",
745
+ " <td>tt0113497</td>\n",
746
+ " <td>en</td>\n",
747
+ " <td>Jumanji</td>\n",
748
+ " <td>When siblings Judy and Peter discover an encha...</td>\n",
749
+ " <td>17.015539</td>\n",
750
+ " <td>/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg</td>\n",
751
+ " <td>[{'name': 'TriStar Pictures', 'id': 559}, {'na...</td>\n",
752
+ " <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
753
+ " <td>1995-12-15</td>\n",
754
+ " <td>262797249.0</td>\n",
755
+ " <td>104.0</td>\n",
756
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n",
757
+ " <td>Roll the dice and unleash the excitement!</td>\n",
758
+ " <td>Jumanji</td>\n",
759
+ " </tr>\n",
760
+ " <tr>\n",
761
+ " <th>2</th>\n",
762
+ " <td>False</td>\n",
763
+ " <td>0</td>\n",
764
+ " <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n",
765
+ " <td>15602</td>\n",
766
+ " <td>tt0113228</td>\n",
767
+ " <td>en</td>\n",
768
+ " <td>Grumpier Old Men</td>\n",
769
+ " <td>A family wedding reignites the ancient feud be...</td>\n",
770
+ " <td>11.7129</td>\n",
771
+ " <td>/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg</td>\n",
772
+ " <td>[{'name': 'Warner Bros.', 'id': 6194}, {'name'...</td>\n",
773
+ " <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
774
+ " <td>1995-12-22</td>\n",
775
+ " <td>0.0</td>\n",
776
+ " <td>101.0</td>\n",
777
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
778
+ " <td>Still Yelling. Still Fighting. Still Ready for...</td>\n",
779
+ " <td>Grumpier Old Men</td>\n",
780
+ " </tr>\n",
781
+ " <tr>\n",
782
+ " <th>3</th>\n",
783
+ " <td>False</td>\n",
784
+ " <td>16000000</td>\n",
785
+ " <td>[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...</td>\n",
786
+ " <td>31357</td>\n",
787
+ " <td>tt0114885</td>\n",
788
+ " <td>en</td>\n",
789
+ " <td>Waiting to Exhale</td>\n",
790
+ " <td>Cheated on, mistreated and stepped on, the wom...</td>\n",
791
+ " <td>3.859495</td>\n",
792
+ " <td>/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg</td>\n",
793
+ " <td>[{'name': 'Twentieth Century Fox Film Corporat...</td>\n",
794
+ " <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
795
+ " <td>1995-12-22</td>\n",
796
+ " <td>81452156.0</td>\n",
797
+ " <td>127.0</td>\n",
798
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
799
+ " <td>Friends are the people who let you be yourself...</td>\n",
800
+ " <td>Waiting to Exhale</td>\n",
801
+ " </tr>\n",
802
+ " <tr>\n",
803
+ " <th>4</th>\n",
804
+ " <td>False</td>\n",
805
+ " <td>0</td>\n",
806
+ " <td>[{'id': 35, 'name': 'Comedy'}]</td>\n",
807
+ " <td>11862</td>\n",
808
+ " <td>tt0113041</td>\n",
809
+ " <td>en</td>\n",
810
+ " <td>Father of the Bride Part II</td>\n",
811
+ " <td>Just when George Banks has recovered from his ...</td>\n",
812
+ " <td>8.387519</td>\n",
813
+ " <td>/e64sOI48hQXyru7naBFyssKFxVd.jpg</td>\n",
814
+ " <td>[{'name': 'Sandollar Productions', 'id': 5842}...</td>\n",
815
+ " <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
816
+ " <td>1995-02-10</td>\n",
817
+ " <td>76578911.0</td>\n",
818
+ " <td>106.0</td>\n",
819
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
820
+ " <td>Just When His World Is Back To Normal... He's ...</td>\n",
821
+ " <td>Father of the Bride Part II</td>\n",
822
+ " </tr>\n",
823
+ " <tr>\n",
824
+ " <th>...</th>\n",
825
+ " <td>...</td>\n",
826
+ " <td>...</td>\n",
827
+ " <td>...</td>\n",
828
+ " <td>...</td>\n",
829
+ " <td>...</td>\n",
830
+ " <td>...</td>\n",
831
+ " <td>...</td>\n",
832
+ " <td>...</td>\n",
833
+ " <td>...</td>\n",
834
+ " <td>...</td>\n",
835
+ " <td>...</td>\n",
836
+ " <td>...</td>\n",
837
+ " <td>...</td>\n",
838
+ " <td>...</td>\n",
839
+ " <td>...</td>\n",
840
+ " <td>...</td>\n",
841
+ " <td>...</td>\n",
842
+ " <td>...</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <th>45461</th>\n",
846
+ " <td>False</td>\n",
847
+ " <td>0</td>\n",
848
+ " <td>[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...</td>\n",
849
+ " <td>439050</td>\n",
850
+ " <td>tt6209470</td>\n",
851
+ " <td>fa</td>\n",
852
+ " <td>رگ خواب</td>\n",
853
+ " <td>Rising and falling between a man and woman.</td>\n",
854
+ " <td>0.072051</td>\n",
855
+ " <td>/jldsYflnId4tTWPx8es3uzsB1I8.jpg</td>\n",
856
+ " <td>[]</td>\n",
857
+ " <td>[{'iso_3166_1': 'IR', 'name': 'Iran'}]</td>\n",
858
+ " <td>NaN</td>\n",
859
+ " <td>0.0</td>\n",
860
+ " <td>90.0</td>\n",
861
+ " <td>[{'iso_639_1': 'fa', 'name': 'فارسی'}]</td>\n",
862
+ " <td>Rising and falling between a man and woman</td>\n",
863
+ " <td>Subdue</td>\n",
864
+ " </tr>\n",
865
+ " <tr>\n",
866
+ " <th>45462</th>\n",
867
+ " <td>False</td>\n",
868
+ " <td>0</td>\n",
869
+ " <td>[{'id': 18, 'name': 'Drama'}]</td>\n",
870
+ " <td>111109</td>\n",
871
+ " <td>tt2028550</td>\n",
872
+ " <td>tl</td>\n",
873
+ " <td>Siglo ng Pagluluwal</td>\n",
874
+ " <td>An artist struggles to finish his work while a...</td>\n",
875
+ " <td>0.178241</td>\n",
876
+ " <td>/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg</td>\n",
877
+ " <td>[{'name': 'Sine Olivia', 'id': 19653}]</td>\n",
878
+ " <td>[{'iso_3166_1': 'PH', 'name': 'Philippines'}]</td>\n",
879
+ " <td>2011-11-17</td>\n",
880
+ " <td>0.0</td>\n",
881
+ " <td>360.0</td>\n",
882
+ " <td>[{'iso_639_1': 'tl', 'name': ''}]</td>\n",
883
+ " <td>NaN</td>\n",
884
+ " <td>Century of Birthing</td>\n",
885
+ " </tr>\n",
886
+ " <tr>\n",
887
+ " <th>45463</th>\n",
888
+ " <td>False</td>\n",
889
+ " <td>0</td>\n",
890
+ " <td>[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...</td>\n",
891
+ " <td>67758</td>\n",
892
+ " <td>tt0303758</td>\n",
893
+ " <td>en</td>\n",
894
+ " <td>Betrayal</td>\n",
895
+ " <td>When one of her hits goes wrong, a professiona...</td>\n",
896
+ " <td>0.903007</td>\n",
897
+ " <td>/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg</td>\n",
898
+ " <td>[{'name': 'American World Pictures', 'id': 6165}]</td>\n",
899
+ " <td>[{'iso_3166_1': 'US', 'name': 'United States o...</td>\n",
900
+ " <td>2003-08-01</td>\n",
901
+ " <td>0.0</td>\n",
902
+ " <td>90.0</td>\n",
903
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
904
+ " <td>A deadly game of wits.</td>\n",
905
+ " <td>Betrayal</td>\n",
906
+ " </tr>\n",
907
+ " <tr>\n",
908
+ " <th>45464</th>\n",
909
+ " <td>False</td>\n",
910
+ " <td>0</td>\n",
911
+ " <td>[]</td>\n",
912
+ " <td>227506</td>\n",
913
+ " <td>tt0008536</td>\n",
914
+ " <td>en</td>\n",
915
+ " <td>Satana likuyushchiy</td>\n",
916
+ " <td>In a small town live two brothers, one a minis...</td>\n",
917
+ " <td>0.003503</td>\n",
918
+ " <td>/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg</td>\n",
919
+ " <td>[{'name': 'Yermoliev', 'id': 88753}]</td>\n",
920
+ " <td>[{'iso_3166_1': 'RU', 'name': 'Russia'}]</td>\n",
921
+ " <td>1917-10-21</td>\n",
922
+ " <td>0.0</td>\n",
923
+ " <td>87.0</td>\n",
924
+ " <td>[]</td>\n",
925
+ " <td>NaN</td>\n",
926
+ " <td>Satan Triumphant</td>\n",
927
+ " </tr>\n",
928
+ " <tr>\n",
929
+ " <th>45465</th>\n",
930
+ " <td>False</td>\n",
931
+ " <td>0</td>\n",
932
+ " <td>[]</td>\n",
933
+ " <td>461257</td>\n",
934
+ " <td>tt6980792</td>\n",
935
+ " <td>en</td>\n",
936
+ " <td>Queerama</td>\n",
937
+ " <td>50 years after decriminalisation of homosexual...</td>\n",
938
+ " <td>0.163015</td>\n",
939
+ " <td>/s5UkZt6NTsrS7ZF0Rh8nzupRlIU.jpg</td>\n",
940
+ " <td>[]</td>\n",
941
+ " <td>[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}]</td>\n",
942
+ " <td>2017-06-09</td>\n",
943
+ " <td>0.0</td>\n",
944
+ " <td>75.0</td>\n",
945
+ " <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n",
946
+ " <td>NaN</td>\n",
947
+ " <td>Queerama</td>\n",
948
+ " </tr>\n",
949
+ " </tbody>\n",
950
+ "</table>\n",
951
+ "<p>45466 rows × 18 columns</p>\n",
952
+ "</div>"
953
+ ],
954
+ "text/plain": [
955
+ " adult budget genres \\\n",
956
+ "0 False 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
957
+ "1 False 65000000 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
958
+ "2 False 0 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
959
+ "3 False 16000000 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
960
+ "4 False 0 [{'id': 35, 'name': 'Comedy'}] \n",
961
+ "... ... ... ... \n",
962
+ "45461 False 0 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n",
963
+ "45462 False 0 [{'id': 18, 'name': 'Drama'}] \n",
964
+ "45463 False 0 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n",
965
+ "45464 False 0 [] \n",
966
+ "45465 False 0 [] \n",
967
+ "\n",
968
+ " id imdb_id original_language original_title \\\n",
969
+ "0 862 tt0114709 en Toy Story \n",
970
+ "1 8844 tt0113497 en Jumanji \n",
971
+ "2 15602 tt0113228 en Grumpier Old Men \n",
972
+ "3 31357 tt0114885 en Waiting to Exhale \n",
973
+ "4 11862 tt0113041 en Father of the Bride Part II \n",
974
+ "... ... ... ... ... \n",
975
+ "45461 439050 tt6209470 fa رگ خواب \n",
976
+ "45462 111109 tt2028550 tl Siglo ng Pagluluwal \n",
977
+ "45463 67758 tt0303758 en Betrayal \n",
978
+ "45464 227506 tt0008536 en Satana likuyushchiy \n",
979
+ "45465 461257 tt6980792 en Queerama \n",
980
+ "\n",
981
+ " overview popularity \\\n",
982
+ "0 Led by Woody, Andy's toys live happily in his ... 21.946943 \n",
983
+ "1 When siblings Judy and Peter discover an encha... 17.015539 \n",
984
+ "2 A family wedding reignites the ancient feud be... 11.7129 \n",
985
+ "3 Cheated on, mistreated and stepped on, the wom... 3.859495 \n",
986
+ "4 Just when George Banks has recovered from his ... 8.387519 \n",
987
+ "... ... ... \n",
988
+ "45461 Rising and falling between a man and woman. 0.072051 \n",
989
+ "45462 An artist struggles to finish his work while a... 0.178241 \n",
990
+ "45463 When one of her hits goes wrong, a professiona... 0.903007 \n",
991
+ "45464 In a small town live two brothers, one a minis... 0.003503 \n",
992
+ "45465 50 years after decriminalisation of homosexual... 0.163015 \n",
993
+ "\n",
994
+ " poster_path \\\n",
995
+ "0 /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
996
+ "1 /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg \n",
997
+ "2 /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg \n",
998
+ "3 /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg \n",
999
+ "4 /e64sOI48hQXyru7naBFyssKFxVd.jpg \n",
1000
+ "... ... \n",
1001
+ "45461 /jldsYflnId4tTWPx8es3uzsB1I8.jpg \n",
1002
+ "45462 /xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg \n",
1003
+ "45463 /d5bX92nDsISNhu3ZT69uHwmfCGw.jpg \n",
1004
+ "45464 /aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg \n",
1005
+ "45465 /s5UkZt6NTsrS7ZF0Rh8nzupRlIU.jpg \n",
1006
+ "\n",
1007
+ " production_companies \\\n",
1008
+ "0 [{'name': 'Pixar Animation Studios', 'id': 3}] \n",
1009
+ "1 [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n",
1010
+ "2 [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n",
1011
+ "3 [{'name': 'Twentieth Century Fox Film Corporat... \n",
1012
+ "4 [{'name': 'Sandollar Productions', 'id': 5842}... \n",
1013
+ "... ... \n",
1014
+ "45461 [] \n",
1015
+ "45462 [{'name': 'Sine Olivia', 'id': 19653}] \n",
1016
+ "45463 [{'name': 'American World Pictures', 'id': 6165}] \n",
1017
+ "45464 [{'name': 'Yermoliev', 'id': 88753}] \n",
1018
+ "45465 [] \n",
1019
+ "\n",
1020
+ " production_countries release_date \\\n",
1021
+ "0 [{'iso_3166_1': 'US', 'name': 'United States o... 1995-10-30 \n",
1022
+ "1 [{'iso_3166_1': 'US', 'name': 'United States o... 1995-12-15 \n",
1023
+ "2 [{'iso_3166_1': 'US', 'name': 'United States o... 1995-12-22 \n",
1024
+ "3 [{'iso_3166_1': 'US', 'name': 'United States o... 1995-12-22 \n",
1025
+ "4 [{'iso_3166_1': 'US', 'name': 'United States o... 1995-02-10 \n",
1026
+ "... ... ... \n",
1027
+ "45461 [{'iso_3166_1': 'IR', 'name': 'Iran'}] NaN \n",
1028
+ "45462 [{'iso_3166_1': 'PH', 'name': 'Philippines'}] 2011-11-17 \n",
1029
+ "45463 [{'iso_3166_1': 'US', 'name': 'United States o... 2003-08-01 \n",
1030
+ "45464 [{'iso_3166_1': 'RU', 'name': 'Russia'}] 1917-10-21 \n",
1031
+ "45465 [{'iso_3166_1': 'GB', 'name': 'United Kingdom'}] 2017-06-09 \n",
1032
+ "\n",
1033
+ " revenue runtime \\\n",
1034
+ "0 373554033.0 81.0 \n",
1035
+ "1 262797249.0 104.0 \n",
1036
+ "2 0.0 101.0 \n",
1037
+ "3 81452156.0 127.0 \n",
1038
+ "4 76578911.0 106.0 \n",
1039
+ "... ... ... \n",
1040
+ "45461 0.0 90.0 \n",
1041
+ "45462 0.0 360.0 \n",
1042
+ "45463 0.0 90.0 \n",
1043
+ "45464 0.0 87.0 \n",
1044
+ "45465 0.0 75.0 \n",
1045
+ "\n",
1046
+ " spoken_languages \\\n",
1047
+ "0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
1048
+ "1 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
1049
+ "2 [{'iso_639_1': 'en', 'name': 'English'}] \n",
1050
+ "3 [{'iso_639_1': 'en', 'name': 'English'}] \n",
1051
+ "4 [{'iso_639_1': 'en', 'name': 'English'}] \n",
1052
+ "... ... \n",
1053
+ "45461 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n",
1054
+ "45462 [{'iso_639_1': 'tl', 'name': ''}] \n",
1055
+ "45463 [{'iso_639_1': 'en', 'name': 'English'}] \n",
1056
+ "45464 [] \n",
1057
+ "45465 [{'iso_639_1': 'en', 'name': 'English'}] \n",
1058
+ "\n",
1059
+ " tagline \\\n",
1060
+ "0 NaN \n",
1061
+ "1 Roll the dice and unleash the excitement! \n",
1062
+ "2 Still Yelling. Still Fighting. Still Ready for... \n",
1063
+ "3 Friends are the people who let you be yourself... \n",
1064
+ "4 Just When His World Is Back To Normal... He's ... \n",
1065
+ "... ... \n",
1066
+ "45461 Rising and falling between a man and woman \n",
1067
+ "45462 NaN \n",
1068
+ "45463 A deadly game of wits. \n",
1069
+ "45464 NaN \n",
1070
+ "45465 NaN \n",
1071
+ "\n",
1072
+ " title \n",
1073
+ "0 Toy Story \n",
1074
+ "1 Jumanji \n",
1075
+ "2 Grumpier Old Men \n",
1076
+ "3 Waiting to Exhale \n",
1077
+ "4 Father of the Bride Part II \n",
1078
+ "... ... \n",
1079
+ "45461 Subdue \n",
1080
+ "45462 Century of Birthing \n",
1081
+ "45463 Betrayal \n",
1082
+ "45464 Satan Triumphant \n",
1083
+ "45465 Queerama \n",
1084
+ "\n",
1085
+ "[45466 rows x 18 columns]"
1086
+ ]
1087
+ },
1088
+ "execution_count": 22,
1089
+ "metadata": {},
1090
+ "output_type": "execute_result"
1091
+ }
1092
+ ],
1093
+ "source": [
1094
+ "df"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "cell_type": "code",
1099
+ "execution_count": 23,
1100
+ "id": "0f83fc57-1c94-44a1-a4b1-b51ca334097f",
1101
+ "metadata": {},
1102
+ "outputs": [
1103
+ {
1104
+ "data": {
1105
+ "text/plain": [
1106
+ "count 45461\n",
1107
+ "unique 44176\n",
1108
+ "top 0.0\n",
1109
+ "freq 34\n",
1110
+ "Name: popularity, dtype: object"
1111
+ ]
1112
+ },
1113
+ "execution_count": 23,
1114
+ "metadata": {},
1115
+ "output_type": "execute_result"
1116
+ }
1117
+ ],
1118
+ "source": [
1119
+ "df['popularity'].describe()"
1120
+ ]
1121
+ },
1122
+ {
1123
+ "cell_type": "code",
1124
+ "execution_count": 24,
1125
+ "id": "e693550c-9eed-47ed-8de9-402919a59228",
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "data": {
1130
+ "text/html": [
1131
+ "<div>\n",
1132
+ "<style scoped>\n",
1133
+ " .dataframe tbody tr th:only-of-type {\n",
1134
+ " vertical-align: middle;\n",
1135
+ " }\n",
1136
+ "\n",
1137
+ " .dataframe tbody tr th {\n",
1138
+ " vertical-align: top;\n",
1139
+ " }\n",
1140
+ "\n",
1141
+ " .dataframe thead th {\n",
1142
+ " text-align: right;\n",
1143
+ " }\n",
1144
+ "</style>\n",
1145
+ "<table border=\"1\" class=\"dataframe\">\n",
1146
+ " <thead>\n",
1147
+ " <tr style=\"text-align: right;\">\n",
1148
+ " <th></th>\n",
1149
+ " <th>revenue</th>\n",
1150
+ " <th>runtime</th>\n",
1151
+ " </tr>\n",
1152
+ " </thead>\n",
1153
+ " <tbody>\n",
1154
+ " <tr>\n",
1155
+ " <th>count</th>\n",
1156
+ " <td>4.546000e+04</td>\n",
1157
+ " <td>45203.000000</td>\n",
1158
+ " </tr>\n",
1159
+ " <tr>\n",
1160
+ " <th>mean</th>\n",
1161
+ " <td>1.120935e+07</td>\n",
1162
+ " <td>94.128199</td>\n",
1163
+ " </tr>\n",
1164
+ " <tr>\n",
1165
+ " <th>std</th>\n",
1166
+ " <td>6.433225e+07</td>\n",
1167
+ " <td>38.407810</td>\n",
1168
+ " </tr>\n",
1169
+ " <tr>\n",
1170
+ " <th>min</th>\n",
1171
+ " <td>0.000000e+00</td>\n",
1172
+ " <td>0.000000</td>\n",
1173
+ " </tr>\n",
1174
+ " <tr>\n",
1175
+ " <th>25%</th>\n",
1176
+ " <td>0.000000e+00</td>\n",
1177
+ " <td>85.000000</td>\n",
1178
+ " </tr>\n",
1179
+ " <tr>\n",
1180
+ " <th>50%</th>\n",
1181
+ " <td>0.000000e+00</td>\n",
1182
+ " <td>95.000000</td>\n",
1183
+ " </tr>\n",
1184
+ " <tr>\n",
1185
+ " <th>75%</th>\n",
1186
+ " <td>0.000000e+00</td>\n",
1187
+ " <td>107.000000</td>\n",
1188
+ " </tr>\n",
1189
+ " <tr>\n",
1190
+ " <th>max</th>\n",
1191
+ " <td>2.787965e+09</td>\n",
1192
+ " <td>1256.000000</td>\n",
1193
+ " </tr>\n",
1194
+ " </tbody>\n",
1195
+ "</table>\n",
1196
+ "</div>"
1197
+ ],
1198
+ "text/plain": [
1199
+ " revenue runtime\n",
1200
+ "count 4.546000e+04 45203.000000\n",
1201
+ "mean 1.120935e+07 94.128199\n",
1202
+ "std 6.433225e+07 38.407810\n",
1203
+ "min 0.000000e+00 0.000000\n",
1204
+ "25% 0.000000e+00 85.000000\n",
1205
+ "50% 0.000000e+00 95.000000\n",
1206
+ "75% 0.000000e+00 107.000000\n",
1207
+ "max 2.787965e+09 1256.000000"
1208
+ ]
1209
+ },
1210
+ "execution_count": 24,
1211
+ "metadata": {},
1212
+ "output_type": "execute_result"
1213
+ }
1214
+ ],
1215
+ "source": [
1216
+ "df.describe()"
1217
+ ]
1218
+ },
1219
+ {
1220
+ "cell_type": "code",
1221
+ "execution_count": null,
1222
+ "id": "9ae3ea42-2343-4f8a-9740-adf7a11d7d5f",
1223
+ "metadata": {},
1224
+ "outputs": [],
1225
+ "source": [
1226
+ "df.drop("
1227
+ ]
1228
+ }
1229
+ ],
1230
+ "metadata": {
1231
+ "kernelspec": {
1232
+ "display_name": "Python 3 (ipykernel)",
1233
+ "language": "python",
1234
+ "name": "python3"
1235
+ },
1236
+ "language_info": {
1237
+ "codemirror_mode": {
1238
+ "name": "ipython",
1239
+ "version": 3
1240
+ },
1241
+ "file_extension": ".py",
1242
+ "mimetype": "text/x-python",
1243
+ "name": "python",
1244
+ "nbconvert_exporter": "python",
1245
+ "pygments_lexer": "ipython3",
1246
+ "version": "3.11.14"
1247
+ }
1248
+ },
1249
+ "nbformat": 4,
1250
+ "nbformat_minor": 5
1251
+ }
main.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.preprocessing import parse_features
2
+ from src.recommender import MovieRecommender
3
+ import time
4
+
5
def main(csv_path='data/movies_metadata.csv', model_dir='models/'):
    """Build the movie index from the raw CSV, persist it, and run a smoke test.

    The pipeline is wrapped in a function (and guarded below) so that
    importing this module does not kick off the long embedding run.

    Args:
        csv_path: Location of the movies metadata CSV handed to
            ``parse_features``.
        model_dir: Directory handed to ``MovieRecommender.save``.
    """
    # 1. Preprocessing: returns the cleaned frame that is embedded below.
    df = parse_features(csv_path)

    # 2. Initialize the recommender (loads the sentence encoder).
    rec = MovieRecommender()

    # 3. Generate embeddings. This is the heavy step.
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments while the job is running.
    start_time = time.perf_counter()
    rec.preprocess_and_embed(df)
    print(f"Total time taken: {(time.perf_counter() - start_time)/60:.2f} minutes")

    # 4. Save the result so the embeddings do not have to be recomputed.
    rec.save(model_dir)

    # 5. Quick smoke test.
    print("\n--- Test Recommendation ---")
    print(rec.recommend_by_movie("Jumanji"))


if __name__ == "__main__":
    main()
models/metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78472dfc4fcf4703b633dc55e20c3ac24dcb627e0ddef239d63d89314cf56716
3
+ size 19236982
models/movie_index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f489fa4ce9d3a757267af4dacb3b9d3f7339bb9ebf798e0403e58fe14d2171a8
3
+ size 69970989
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas>=1.3.0
2
+ numpy>=1.21.0
3
+ requests>=2.26.0
4
+ sentence-transformers>=2.2.0
5
+ faiss-cpu>=1.7.2
src/__init__.py ADDED
File without changes
src/__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (3.13 kB). View file
 
src/__pycache__/preprocessing.cpython-312.pyc ADDED
Binary file (2.63 kB). View file
 
src/__pycache__/recommender.cpython-311.pyc ADDED
Binary file (8.52 kB). View file
 
src/ingest.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import requests
4
+ import pandas as pd
5
+ from src.recommender import MovieRecommender
6
+
7
+ load_dotenv()
8
+
9
+ # CONFIGURATION
10
+ API_KEY = os.getenv('API_KEY')
11
+ BASE_URL = "https://api.themoviedb.org/3"
12
+
13
def get_genre_map():
    """Fetch the API's movie genre list and return it as an ``{id: name}`` lookup.

    Example: ``{28: 'Action', 12: 'Adventure'}``.

    Returns:
        dict: Genre id -> genre name. An empty dict is returned (and the
        failure is printed) when the request fails, the server answers with
        an HTTP error status, or the payload does not have the expected
        shape, so callers can carry on without genre names.
    """
    url = f"{BASE_URL}/genre/movie/list"
    params = {'api_key': API_KEY, 'language': 'en-US'}

    try:
        # A timeout keeps a stalled connection from hanging the whole update.
        response = requests.get(url, params=params, timeout=10)
        # Turn 4xx/5xx answers (e.g. a missing or invalid API key) into an
        # explicit error instead of a confusing KeyError on 'genres' below.
        response.raise_for_status()
        data = response.json()
        # Turn the list of dicts into a single lookup dictionary.
        mapping = {g['id']: g['name'] for g in data['genres']}
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        print(f"Failed to load genres: {e}")
        return {}

    print("Genre map loaded.")
    return mapping
31
+
32
def get_popular_movies(limit=100):
    """Fetch up to ``limit`` entries from the API's ``/movie/popular`` listing.

    Pages are requested in order, starting at page 1, until ``limit`` results
    have been collected or the page budget is used up. A page that fails
    (request error, undecodable body, or a non-200 status) is reported on
    stdout and skipped.

    Args:
        limit: Maximum number of movies to return. Values <= 0 return an
            empty list without touching the network.

    Returns:
        list: The raw result dicts from the API, at most ``limit`` of them.
    """
    if limit <= 0:
        return []

    movies = []
    # Page budget: the code assumes 20 results per page and allows one extra
    # page beyond limit // 20.
    pages = (limit // 20) + 1
    url = f"{BASE_URL}/movie/popular"

    print(f"Fetching top {limit} popular movies...")

    for page in range(1, pages + 1):
        params = {
            'api_key': API_KEY,
            'language': 'en-US',
            'page': page,
        }

        try:
            # The timeout prevents one stalled request from blocking the run.
            response = requests.get(url, params=params, timeout=10)
            if response.status_code == 200:
                movies.extend(response.json().get('results', []))
            else:
                print(f"Error on page {page}: {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Connection failed: {e}")

        if len(movies) >= limit:
            break

    return movies[:limit]
63
+
64
def run_weekly_update():
    """Add currently popular movies that are not yet in the saved index.

    Steps: load the persisted recommender from ``models/``, fetch the popular
    list, keep only movies whose id is not already indexed, build a text
    "soup" for each one, add it through ``rec.add_new_movie`` and save the
    result back to ``models/``.

    Returns:
        None. The function returns early when the existing model cannot be
        loaded or when there is nothing new to add.
    """
    # 1. Load the existing index and the ids it already contains.
    print("Loading existing database...")
    rec = MovieRecommender()
    try:
        rec.load('models/')
        existing_ids = set(rec.movies['id'].values)
    except Exception as e:
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate,
        # and the reason is printed so a corrupt model is not mistaken for a
        # missing one.
        print("No existing model found. Cannot update empty model.")
        print(f"Reason: {e}")
        return
    print(f"Database currently holds {len(existing_ids)} movies.")

    # 2. Get candidate movies.
    candidates = get_popular_movies(limit=100)

    # 3. Filter duplicates: against the index and within this batch, since
    # the same movie can appear on more than one page of results.
    new_movies_to_process = []
    seen_ids = set()
    for m in candidates:
        movie_id = m.get('id')
        if movie_id is None or not m.get('title'):
            # Without an id or a title the entry cannot be indexed.
            continue
        if movie_id in existing_ids or movie_id in seen_ids:
            continue
        seen_ids.add(movie_id)
        new_movies_to_process.append(m)

    count = len(new_movies_to_process)
    print(f"New additions found: {count}")

    if count == 0:
        return

    # The genre map is only needed once there is something to process.
    genre_map = get_genre_map()

    # 4. Compute & ingest.
    print(f"Processing {count} new movies...")

    for m in new_movies_to_process:
        # Resolve genre ids to names, e.g. [28, 12] -> "Action Adventure".
        # Unknown ids are dropped so they do not leave stray blanks.
        genre_names = [
            genre_map[gid]
            for gid in (m.get('genre_ids') or [])
            if genre_map.get(gid)
        ]
        genre_str = " ".join(genre_names)

        # A present-but-null overview must not end up as the text "None".
        overview = m.get('overview') or ''

        # Soup structure: Title + Title + Genres + Overview.
        soup = f"{m['title']} {m['title']} {genre_str} {overview}"

        movie_data = {
            'id': m['id'],
            'title': m['title'],
            'soup': soup,
        }

        rec.add_new_movie(movie_data)
        print(f"   - Added: {m['title']} ({genre_str})")

    # 5. Save.
    print("Saving updated index to disk...")
    rec.save('models/')
    print("Update Complete!")
119
+
120
+ if __name__ == "__main__":
121
+ run_weekly_update()
src/preprocessing.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import ast
3
+ import numpy as np
4
+
5
def clean_data(x):
    """Flatten a stringified list of dicts into a space-separated name string.

    Example: ``"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}]"``
    becomes ``"Animation Comedy"``.

    Args:
        x: A cell value. Only ``str`` input is parsed; anything else
            (e.g. NaN) yields an empty string.

    Returns:
        str: The ``'name'`` values of the dict items, joined by single
        spaces. An empty string is returned when ``x`` is not a string,
        cannot be parsed as a Python literal, or does not evaluate to a list.
        List items that are not dicts, or whose ``'name'`` is missing or not
        a string, are skipped.
    """
    if not isinstance(x, str):
        return ""

    try:
        # literal_eval only accepts Python literals; it does not execute
        # arbitrary expressions found in the data.
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # All of these are documented outcomes of malformed input.
        return ""

    if not isinstance(parsed, list):
        return ""

    return ' '.join(
        item['name']
        for item in parsed
        if isinstance(item, dict) and isinstance(item.get('name'), str)
    )
20
+
21
def parse_features(data_path):
    """Load the movies metadata CSV and build one text "soup" per movie.

    Args:
        data_path: Path of the CSV file. It must provide the columns
            ``id``, ``title``, ``overview``, ``tagline`` and ``genres``.

    Returns:
        pandas.DataFrame: Columns ``id`` (int), ``title`` and ``soup`` with a
        fresh 0..n-1 index. Rows whose ``id`` is not numeric are dropped.
    """
    print(f"Loading data from {data_path}...")

    # 1. Load data.
    # NOTE: no `on_bad_lines` argument is passed, so pandas' default applies
    # and a row with the wrong number of fields raises instead of being
    # skipped.
    df = pd.read_csv(data_path, low_memory=False)

    # 2. Filter bad ids.
    # Some ids in this dataset are dates (e.g. '1995-10-20'). Coerce the
    # column to numeric and drop the rows that fail.
    df['id'] = pd.to_numeric(df['id'], errors='coerce')
    # Explicit copy: the filtered frame is modified below, and it should be
    # an independent object rather than a possible view of the original.
    df = df.dropna(subset=['id']).copy()
    df['id'] = df['id'].astype(int)

    # 3. Fill NaNs so missing text contributes nothing to the soup.
    df['title'] = df['title'].fillna('')
    df['overview'] = df['overview'].fillna('')
    df['tagline'] = df['tagline'].fillna('')
    df['genres'] = df['genres'].fillna('[]')

    print("Parsing genres (this might take a moment)...")
    # 4. Clean genres: "[{'id': 35, 'name': 'Comedy'}]" -> "Comedy".
    df['genre_names'] = df['genres'].apply(clean_data)

    # 5. Create the "soup": Title (2x weight), Tagline, Overview, Genres.
    # Column-wise concatenation yields the same text as formatting each row
    # individually, without a Python-level call per row.
    title = df['title'].astype(str)
    df['soup'] = (
        title + ' ' + title + ' '
        + df['tagline'].astype(str) + ' '
        + df['overview'].astype(str) + ' '
        + df['genre_names'].astype(str)
    )

    # 6. Return only what is needed, to save memory.
    final_df = df[['id', 'title', 'soup']].reset_index(drop=True)

    print(f"Cleaned data: {len(final_df)} movies ready for embedding.")
    return final_df
src/recommender.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import faiss
4
+ from sentence_transformers import SentenceTransformer
5
+ import pickle
6
+ import os
7
+
8
class MovieRecommender:
    """Content-based movie recommender: sentence embeddings + a FAISS index.

    Attributes are kept position-aligned: row ``i`` of ``self.movies`` is the
    movie stored at row ``i`` of ``self.index``, and ``self.id_map[i]`` is
    that movie's id.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence encoder; the index stays empty until built/loaded.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        # Load the Transformer model
        self.encoder = SentenceTransformer(model_name)
        # Ask the encoder for its embedding width instead of assuming MiniLM's
        # 384, so other model names work; 384 remains the fallback.
        self.d = self.encoder.get_sentence_embedding_dimension() or 384
        self.index = None
        self.movies = pd.DataFrame()
        self.id_map = {}  # Maps FAISS index ID to Movie ID

    def _require_index(self):
        """Return the FAISS index, or raise RuntimeError if none exists yet."""
        if self.index is None:
            raise RuntimeError(
                "No index available: call preprocess_and_embed() or load() first."
            )
        return self.index

    def _find_movie(self, title):
        """Return the first catalogue row whose title contains ``title``.

        The match is a case-insensitive *literal* substring match, so titles
        containing regex metacharacters ("C++", "(500) Days") cannot raise.
        Returns None when nothing matches.
        """
        mask = self.movies['title'].str.contains(
            title, case=False, regex=False, na=False
        )
        matches = self.movies[mask]
        return None if matches.empty else matches.iloc[0]

    def _rebuild_id_map(self):
        """Recompute the FAISS-row -> movie-id mapping from ``self.movies``."""
        self.id_map = dict(enumerate(self.movies['id'].tolist()))

    def preprocess_and_embed(self, df):
        """Embed every movie's 'soup' text and build a fresh index.

        Args:
            df: DataFrame with at least the columns 'id', 'title' and 'soup'.

        Raises:
            ValueError: If ``df`` has no rows.
        """
        if len(df) == 0:
            raise ValueError("Cannot build an index from an empty DataFrame.")

        self.movies = df.reset_index(drop=True)

        batch_size = 64
        print(f"Generating embeddings on CPU in batches of {batch_size}...")

        embeddings = self.encoder.encode(
            self.movies['soup'].tolist(),
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )
        # FAISS operates on float32, C-contiguous matrices.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        print("Normalizing vectors...")
        # Unit-length vectors make inner product equal to cosine similarity.
        faiss.normalize_L2(embeddings)

        print("Building Index...")
        # Size the index from the vectors actually produced.
        self.d = embeddings.shape[1]
        # IndexFlatIP: exact (brute-force) inner-product search.
        self.index = faiss.IndexFlatIP(self.d)
        self.index.add(embeddings)

        self._rebuild_id_map()
        print(f"Done. Indexed {self.index.ntotal} movies.")

    def search(self, query_vector, k=5):
        """Low-level search: input vector -> top ``k`` movies.

        Args:
            query_vector: Array-like of shape (d,) or (1, d). It is copied, so
                the caller's array is never modified.
            k: Maximum number of results.

        Returns:
            A list of dicts with keys 'movie_id' (int), 'score' (float inner
            product of the normalised vectors) and 'title'.

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        index = self._require_index()

        # Private float32 copy, shaped (1, d), then normalised in place.
        query = np.ascontiguousarray(
            np.array(query_vector, dtype=np.float32).reshape(1, -1)
        )
        faiss.normalize_L2(query)

        distances, indices = index.search(query, k)

        results = []
        for score, idx in zip(distances[0], indices[0]):
            if idx == -1:  # FAISS pads with -1 when fewer than k hits exist
                continue
            position = int(idx)
            results.append({
                "movie_id": int(self.id_map[position]),
                "score": float(score),
                # Read the title by position: correct even when two rows share
                # an id, and avoids scanning the whole frame per result.
                "title": self.movies['title'].iat[position]
            })
        return results

    def recommend_by_movie(self, movie_title, k=5):
        """Find movies similar to a title already in the catalogue.

        Args:
            movie_title: Case-insensitive substring of the title; the first
                matching row is used.
            k: Maximum number of results.

        Returns:
            The ``search`` result list, or the string "Movie not found." when
            no title matches.
        """
        movie_row = self._find_movie(movie_title)
        if movie_row is None:
            return "Movie not found."

        # Re-encode the stored soup rather than caching vectors in the frame.
        vec = self.encoder.encode([movie_row['soup']])
        return self.search(vec, k)

    def recommend_on_text(self, text_query, k=5):
        """Recommend movies from a raw text description.

        Example: "A romantic movie about a sinking ship" -> Titanic

        Args:
            text_query: Free-text description.
            k: Maximum number of results.

        Returns:
            The ``search`` result list.
        """
        print(f"Searching for: '{text_query}'...")

        # Encode the user's text; search() handles normalisation itself.
        query_vector = self.encoder.encode([text_query])
        return self.search(query_vector, k)

    def recommend_for_user(self, liked_movie_titles, k=5):
        """Recommend from a list of liked titles.

        1. Look up the soup of every liked title found in the catalogue.
        2. Average their embeddings into a 'User Profile Vector'.
        3. Search the index with that vector.

        Args:
            liked_movie_titles: Iterable of title substrings; unknown titles
                are ignored.
            k: Maximum number of results.

        Returns:
            The ``search`` result list, or the string
            "No known movies provided." when none of the titles match.
        """
        soups = []
        for title in liked_movie_titles:
            movie_row = self._find_movie(title)
            if movie_row is not None:
                soups.append(movie_row['soup'])

        if not soups:
            return "No known movies provided."

        # One batched encode call, then average the rows (User Profile).
        vectors = self.encoder.encode(soups)
        user_vector = np.mean(vectors, axis=0)
        return self.search(user_vector, k)

    def add_new_movie(self, movie_dict):
        """Add one movie to the index without rebuilding it.

        Args:
            movie_dict: {'id': 123, 'title': 'New Movie', 'soup': 'description...'}

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        index = self._require_index()

        # 1. Vectorize the new soup
        new_vec = np.ascontiguousarray(
            self.encoder.encode([movie_dict['soup']]), dtype=np.float32
        )
        faiss.normalize_L2(new_vec)

        # 2. Add to FAISS Index
        index.add(new_vec)

        # 3. Update Metadata DataFrame
        new_row = pd.DataFrame([movie_dict])
        self.movies = pd.concat([self.movies, new_row], ignore_index=True)

        # 4. Update ID Map
        # The vector just added occupies the last row of the index.
        self.id_map[index.ntotal - 1] = movie_dict['id']

    def save(self, path='models/'):
        """Persist the index and the metadata frame under ``path``.

        The encoder is not saved; it is loaded fresh in ``__init__``.

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        index = self._require_index()
        os.makedirs(path, exist_ok=True)
        faiss.write_index(index, os.path.join(path, 'movie_index.faiss'))
        self.movies.to_pickle(os.path.join(path, 'metadata.pkl'))

    def load(self, path='models/'):
        """Restore the index and metadata previously written by ``save``.

        NOTE: metadata is read with pickle, which can execute arbitrary code;
        only load files from a trusted source.
        """
        self.index = faiss.read_index(os.path.join(path, 'movie_index.faiss'))
        # Keep the width in sync with the index that was actually loaded.
        self.d = self.index.d
        # Positional alignment with the index requires a 0..n-1 index.
        self.movies = pd.read_pickle(
            os.path.join(path, 'metadata.pkl')
        ).reset_index(drop=True)
        self._rebuild_id_map()
test.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.recommender import MovieRecommender
2
+
3
# Build a recommender and restore the index/metadata saved under models/.
rec = MovieRecommender()
rec.load('models/')

# Free-text query 1: a vague description.
print(rec.recommend_on_text("running man"))

# Further queries, disabled by default:
#
# Free-text query 2: a specific vibe (expected: Inception)
# print(rec.recommend_on_text("mind bending sci-fi about dreams within dreams"))
#
# Free-text query 3: an emotional query
# (expected: Hachi: A Dog's Tale or Marley & Me)
# print(rec.recommend_on_text("sad movie that makes me cry about a dog"))