Commit ·
3407011
1
Parent(s): 93d63a5
changed readme file
Browse files- README.md +608 -8
- src/__pycache__/recommender.cpython-311.pyc +0 -0
- src/ingest.py +1 -1
- src/recommender.py +1 -1
- test.py +1 -1
- test2.py +29 -0
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: 🎬
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: pink
|
|
@@ -7,12 +7,612 @@ sdk: docker
|
|
| 7 |
app_port: 7860
|
| 8 |
---
|
| 9 |
|
| 10 |
-
# CineMatch API
|
| 11 |
|
| 12 |
-
|
| 13 |
-
It runs a FastAPI server using FAISS and SentenceTransformers.
|
| 14 |
|
| 15 |
-
##
|
| 16 |
-
|
| 17 |
-
-
|
| 18 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: CineMatch API
|
| 3 |
emoji: 🎬
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: pink
|
|
|
|
| 7 |
app_port: 7860
|
| 8 |
---
|
| 9 |
|
| 10 |
+
# 🎬 CineMatch API
|
| 11 |
|
| 12 |
+
**CineMatch** is an intelligent, content-based movie recommendation engine powered by cutting-edge AI. It combines semantic search, vector embeddings, and personalization to deliver highly accurate movie recommendations tailored to user preferences.
|
|
|
|
| 13 |
|
| 14 |
+
## ๐ Table of Contents
|
| 15 |
+
|
| 16 |
+
- [Features](#features)
|
| 17 |
+
- [Architecture](#architecture)
|
| 18 |
+
- [Tech Stack](#tech-stack)
|
| 19 |
+
- [Installation](#installation)
|
| 20 |
+
- [Configuration](#configuration)
|
| 21 |
+
- [Usage](#usage)
|
| 22 |
+
- [Running the Server](#running-the-server)
|
| 23 |
+
- [API Endpoints](#api-endpoints)
|
| 24 |
+
- [Examples](#examples)
|
| 25 |
+
- [Project Structure](#project-structure)
|
| 26 |
+
- [How It Works](#how-it-works)
|
| 27 |
+
- [Performance Considerations](#performance-considerations)
|
| 28 |
+
- [Deployment](#deployment)
|
| 29 |
+
- [Troubleshooting](#troubleshooting)
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## ✨ Features
|
| 34 |
+
|
| 35 |
+
### 1. **Semantic Search** ๐
|
| 36 |
+
Search for movies using natural language queries. The system converts your text into a vector embedding and finds semantically similar movies.
|
| 37 |
+
- Example: *"A romantic movie about a sinking ship"* → Returns *Titanic*
|
| 38 |
+
|
| 39 |
+
### 2. **Vibe-Based Recommendations** 🎯
|
| 40 |
+
Search by combining tags (genres, themes) and descriptions for more refined results.
|
| 41 |
+
- Example: Tags: `["Sci-Fi", "Action"]`, Description: `"Robots fighting in space"` → Returns relevant matches
|
| 42 |
+
|
| 43 |
+
### 3. **Personalized Recommendations** 👤
|
| 44 |
+
Provide a list of movies you've liked, and the system averages their vectors to create a personalized profile, then recommends similar movies.
|
| 45 |
+
- Example: Liked: `["The Matrix", "Inception"]` → Get similar mind-bending films
|
| 46 |
+
|
| 47 |
+
### 4. **Content-Based Similarity** ๐
|
| 48 |
+
Find movies similar to a specific title already in the database.
|
| 49 |
+
- Example: Similar to *"Inception"* → Returns *"Interstellar"*, *"The Matrix"*, etc.
|
| 50 |
+
|
| 51 |
+
### 5. **Rich Movie Metadata** ๐
|
| 52 |
+
Each movie includes:
|
| 53 |
+
- Director information
|
| 54 |
+
- Top 4 cast members
|
| 55 |
+
- Keywords (e.g., "time travel", "dystopia")
|
| 56 |
+
- Genres
|
| 57 |
+
- Plot overview
|
| 58 |
+
- TMDB ratings (vote average)
|
| 59 |
+
|
| 60 |
+
### 6. **Incremental Learning** ๐
|
| 61 |
+
Add new movies to the system without retraining—updates are instant!
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 🏗️ Architecture
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
โโโโโโโโโโโโโโโโโโโ
|
| 69 |
+
โ User Request โ
|
| 70 |
+
โโโโโโโโโโฌโโโโโโโโโ
|
| 71 |
+
โ
|
| 72 |
+
โโโโโโผโโโโโโโโโโโโโโโโโโโโโโ
|
| 73 |
+
โ FastAPI Server โ
|
| 74 |
+
โ (Endpoint Handler) โ
|
| 75 |
+
โโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโ
|
| 76 |
+
โ
|
| 77 |
+
โโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 78 |
+
โ MovieRecommender Engine โ
|
| 79 |
+
โ (FAISS + Vector Search) โ
|
| 80 |
+
โโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 81 |
+
โ
|
| 82 |
+
โโโโโโผโโโโโโโโโโโโโโโโโโโโโโโ
|
| 83 |
+
โ Embedding Model โ
|
| 84 |
+
โ (SentenceTransformers) โ
|
| 85 |
+
โโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโ
|
| 86 |
+
โ
|
| 87 |
+
โโโโโโผโโโโโโโโโโโโโโโโโโโโโโโ
|
| 88 |
+
โ FAISS Index โ
|
| 89 |
+
โ (movie_index.faiss) โ
|
| 90 |
+
โโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโ
|
| 91 |
+
โ
|
| 92 |
+
โโโโโโผโโโโโโโโโโโโโโโโโโโโโโโ
|
| 93 |
+
โ Movie Metadata โ
|
| 94 |
+
โ (metadata.pkl) โ
|
| 95 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
---
|
| 99 |
+
|
| 100 |
+
## 🛠️ Tech Stack
|
| 101 |
+
|
| 102 |
+
| Component | Technology | Purpose |
|
| 103 |
+
|-----------|-----------|---------|
|
| 104 |
+
| **Backend Framework** | FastAPI | High-performance async API |
|
| 105 |
+
| **Vector Search** | FAISS | Fast similarity search on embeddings |
|
| 106 |
+
| **Embeddings** | SentenceTransformers (MiniLM-L6-v2) | Convert text to 384-dim vectors |
|
| 107 |
+
| **Data Source** | TMDB API | Movie metadata (titles, cast, genres, etc.) |
|
| 108 |
+
| **Data Processing** | Pandas, NumPy | Data cleaning & preprocessing |
|
| 109 |
+
| **Deployment** | Docker | Containerized deployment |
|
| 110 |
+
| **Python Version** | 3.9+ | Modern async/await support |
|
| 111 |
+
|
| 112 |
+
---
|
| 113 |
+
|
| 114 |
+
## 📦 Installation
|
| 115 |
+
|
| 116 |
+
### Prerequisites
|
| 117 |
+
- Python 3.9 or higher
|
| 118 |
+
- TMDB API Key (free, get it at [themoviedb.org](https://www.themoviedb.org/settings/api))
|
| 119 |
+
- ~2GB free disk space (for models and indices)
|
| 120 |
+
|
| 121 |
+
### Step 1: Clone & Setup
|
| 122 |
+
|
| 123 |
+
```bash
|
| 124 |
+
# Navigate to project directory
|
| 125 |
+
cd CineMatch
|
| 126 |
+
|
| 127 |
+
# Create virtual environment
|
| 128 |
+
python -m venv .venv
|
| 129 |
+
|
| 130 |
+
# Activate virtual environment
|
| 131 |
+
# On Windows:
|
| 132 |
+
.\.venv\Scripts\Activate.ps1
|
| 133 |
+
|
| 134 |
+
# On macOS/Linux:
|
| 135 |
+
source .venv/bin/activate
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
### Step 2: Install Dependencies
|
| 139 |
+
|
| 140 |
+
```bash
|
| 141 |
+
pip install -r requirements.txt
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
### Step 3: Configure Environment
|
| 145 |
+
|
| 146 |
+
Create a `.env` file in the project root:
|
| 147 |
+
|
| 148 |
+
```
|
| 149 |
+
TMDB_API_KEY=your_api_key_here
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## ⚙️ Configuration
|
| 155 |
+
|
| 156 |
+
### Environment Variables
|
| 157 |
+
|
| 158 |
+
| Variable | Description | Example |
|
| 159 |
+
|----------|-------------|---------|
|
| 160 |
+
| `TMDB_API_KEY` | Your TMDB API key | `abc123xyz...` |
|
| 161 |
+
|
| 162 |
+
### Model Configuration
|
| 163 |
+
|
| 164 |
+
The default embedding model is **`all-MiniLM-L6-v2`** from SentenceTransformers:
|
| 165 |
+
- **Embedding Dimension**: 384
|
| 166 |
+
- **Speed**: Very fast (optimized for CPU)
|
| 167 |
+
- **Quality**: High for semantic similarity
|
| 168 |
+
- **Memory**: ~80MB
|
| 169 |
+
|
| 170 |
+
To use a different model, modify [recommender.py](src/recommender.py#L6) in the `MovieRecommender.__init__()` method.
|
| 171 |
+
|
| 172 |
+
---
|
| 173 |
+
|
| 174 |
+
## ๐ Usage
|
| 175 |
+
|
| 176 |
+
### Running the Server
|
| 177 |
+
|
| 178 |
+
```bash
|
| 179 |
+
# Make sure your virtual environment is activated
|
| 180 |
+
python app.py
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
The server will start at `http://localhost:8000`
|
| 184 |
+
|
| 185 |
+
**API Documentation** (auto-generated Swagger UI):
|
| 186 |
+
- Swagger: `http://localhost:8000/docs`
|
| 187 |
+
- ReDoc: `http://localhost:8000/redoc`
|
| 188 |
+
|
| 189 |
+
### Data Ingestion
|
| 190 |
+
|
| 191 |
+
Before using the API, you need to populate the FAISS index with movies:
|
| 192 |
+
|
| 193 |
+
```bash
|
| 194 |
+
python src/ingest.py
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
This will:
|
| 198 |
+
1. Fetch ~50 high-quality movies from TMDB (rating ≥ 7.0, votes ≥ 500), sorted by popularity
|
| 199 |
+
2. Extract director, cast, and keywords for each movie
|
| 200 |
+
3. Generate embeddings
|
| 201 |
+
4. Save to `models/movie_index.faiss` and `models/metadata.pkl`
|
| 202 |
+
|
| 203 |
+
To reset and rebuild the index:
|
| 204 |
+
|
| 205 |
+
```python
|
| 206 |
+
# In src/ingest.py, modify the last line:
|
| 207 |
+
ingest_high_quality_movies(target_count=100, reset=True) # reset=True to rebuild
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
---
|
| 211 |
+
|
| 212 |
+
## 📡 API Endpoints
|
| 213 |
+
|
| 214 |
+
### 1. **Health Check**
|
| 215 |
+
```
|
| 216 |
+
GET /
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
**Response:**
|
| 220 |
+
```json
|
| 221 |
+
{
|
| 222 |
+
"status": "online and active!!!",
|
| 223 |
+
"model_loaded": true
|
| 224 |
+
}
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
### 2. **Semantic Search** ๐
|
| 230 |
+
```
|
| 231 |
+
POST /search
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
**Request:**
|
| 235 |
+
```json
|
| 236 |
+
{
|
| 237 |
+
"query": "A romantic movie about a sinking ship",
|
| 238 |
+
"k": 5
|
| 239 |
+
}
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
**Response:**
|
| 243 |
+
```json
|
| 244 |
+
[
|
| 245 |
+
{
|
| 246 |
+
"movie_id": 597,
|
| 247 |
+
"title": "Titanic",
|
| 248 |
+
"score": 0.856
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"movie_id": 285,
|
| 252 |
+
"title": "The Poseidon Adventure",
|
| 253 |
+
"score": 0.743
|
| 254 |
+
}
|
| 255 |
+
]
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
---
|
| 259 |
+
|
| 260 |
+
### 3. **Vibe-Based Search** ๐ฏ
|
| 261 |
+
```
|
| 262 |
+
POST /recommend/vibe
|
| 263 |
+
```
|
| 264 |
+
|
| 265 |
+
**Request:**
|
| 266 |
+
```json
|
| 267 |
+
{
|
| 268 |
+
"tags": ["Sci-Fi", "Action", "Space"],
|
| 269 |
+
"description": "Robots fighting in space with stunning visuals",
|
| 270 |
+
"k": 10
|
| 271 |
+
}
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
**Response:**
|
| 275 |
+
```json
|
| 276 |
+
{
|
| 277 |
+
"interpreted_query": "Sci-Fi Action Space Sci-Fi Action Space Robots fighting in space with stunning visuals",
|
| 278 |
+
"results": [
|
| 279 |
+
{
|
| 280 |
+
"movie_id": 58,
|
| 281 |
+
"title": "The Fifth Element",
|
| 282 |
+
"score": 0.912
|
| 283 |
+
}
|
| 284 |
+
]
|
| 285 |
+
}
|
| 286 |
+
```
|
| 287 |
+
|
| 288 |
+
---
|
| 289 |
+
|
| 290 |
+
### 4. **Personalized Recommendations** ๐ค
|
| 291 |
+
```
|
| 292 |
+
POST /recommend/user
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
**Request:**
|
| 296 |
+
```json
|
| 297 |
+
{
|
| 298 |
+
"liked_movies": ["The Matrix", "Inception", "Interstellar"],
|
| 299 |
+
"k": 5
|
| 300 |
+
}
|
| 301 |
+
```
|
| 302 |
+
|
| 303 |
+
**Response:**
|
| 304 |
+
```json
|
| 305 |
+
[
|
| 306 |
+
{
|
| 307 |
+
"movie_id": 27205,
|
| 308 |
+
"title": "Oblivion",
|
| 309 |
+
"score": 0.834
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"movie_id": 284054,
|
| 313 |
+
"title": "Doctor Strange",
|
| 314 |
+
"score": 0.798
|
| 315 |
+
}
|
| 316 |
+
]
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
---
|
| 320 |
+
|
| 321 |
+
### 5. **Similar Movies** ๐
|
| 322 |
+
```
|
| 323 |
+
GET /recommend/movie/{title}
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
**Example:**
|
| 327 |
+
```
|
| 328 |
+
GET /recommend/movie/Inception
|
| 329 |
+
```
|
| 330 |
+
|
| 331 |
+
**Response:**
|
| 332 |
+
```json
|
| 333 |
+
[
|
| 334 |
+
{
|
| 335 |
+
"movie_id": 38372,
|
| 336 |
+
"title": "Interstellar",
|
| 337 |
+
"score": 0.891
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"movie_id": 603,
|
| 341 |
+
"title": "The Matrix",
|
| 342 |
+
"score": 0.867
|
| 343 |
+
}
|
| 344 |
+
]
|
| 345 |
+
```
|
| 346 |
+
|
| 347 |
+
---
|
| 348 |
+
|
| 349 |
+
### 6. **Admin: Trigger Background Update** ๐
|
| 350 |
+
```
|
| 351 |
+
POST /admin/trigger-update
|
| 352 |
+
```
|
| 353 |
+
|
| 354 |
+
**Response:**
|
| 355 |
+
```json
|
| 356 |
+
{
|
| 357 |
+
"message": "Update process started in background. Check server logs for progress."
|
| 358 |
+
}
|
| 359 |
+
```
|
| 360 |
+
|
| 361 |
+
This endpoint triggers background ingestion without blocking the API.
|
| 362 |
+
|
| 363 |
+
---
|
| 364 |
+
|
| 365 |
+
## ๐ Examples
|
| 366 |
+
|
| 367 |
+
### Example 1: Find Movies Similar to Your Favorite
|
| 368 |
+
|
| 369 |
+
```python
|
| 370 |
+
import requests
|
| 371 |
+
|
| 372 |
+
BASE_URL = "http://localhost:8000"
|
| 373 |
+
|
| 374 |
+
# Get movies similar to "The Matrix"
|
| 375 |
+
response = requests.get(f"{BASE_URL}/recommend/movie/The Matrix")
|
| 376 |
+
recommendations = response.json()
|
| 377 |
+
|
| 378 |
+
for movie in recommendations:
|
| 379 |
+
print(f"{movie['title']} (Score: {movie['score']:.2f})")
|
| 380 |
+
```
|
| 381 |
+
|
| 382 |
+
### Example 2: Semantic Search with Natural Language
|
| 383 |
+
|
| 384 |
+
```python
|
| 385 |
+
response = requests.post(
|
| 386 |
+
f"{BASE_URL}/search",
|
| 387 |
+
json={
|
| 388 |
+
"query": "A thrilling space adventure with amazing visuals",
|
| 389 |
+
"k": 5
|
| 390 |
+
}
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
for movie in response.json():
|
| 394 |
+
print(f"โ {movie['title']}")
|
| 395 |
+
```
|
| 396 |
+
|
| 397 |
+
### Example 3: Personalized Recommendations Based on History
|
| 398 |
+
|
| 399 |
+
```python
|
| 400 |
+
response = requests.post(
|
| 401 |
+
f"{BASE_URL}/recommend/user",
|
| 402 |
+
json={
|
| 403 |
+
"liked_movies": ["Dune", "Blade Runner 2049", "Arrival"],
|
| 404 |
+
"k": 10
|
| 405 |
+
}
|
| 406 |
+
)
|
| 407 |
+
|
| 408 |
+
for movie in response.json():
|
| 409 |
+
print(f"✅ {movie['title']}")
|
| 410 |
+
```
|
| 411 |
+
|
| 412 |
+
---
|
| 413 |
+
|
| 414 |
+
## ๐ Project Structure
|
| 415 |
+
|
| 416 |
+
```
|
| 417 |
+
CineMatch/
|
| 418 |
+
โโโ app.py # Main FastAPI application
|
| 419 |
+
โโโ main.py # (Optional) Alternative entry point
|
| 420 |
+
โโโ Dockerfile # Docker configuration
|
| 421 |
+
โโโ requirements.txt # Python dependencies
|
| 422 |
+
โโโ .env # API keys (create this)
|
| 423 |
+
โ
|
| 424 |
+
โโโ src/
|
| 425 |
+
โ โโโ __init__.py
|
| 426 |
+
โ โโโ recommender.py # Core FAISS-based recommendation engine
|
| 427 |
+
โ โโโ ingest.py # TMDB data ingestion pipeline
|
| 428 |
+
โ โโโ preprocessing.py # Data cleaning & feature engineering
|
| 429 |
+
โ
|
| 430 |
+
โโโ models/
|
| 431 |
+
โ โโโ movie_index.faiss # FAISS index (generated after ingestion)
|
| 432 |
+
โ โโโ metadata.pkl # Movie metadata dataframe (generated)
|
| 433 |
+
โ
|
| 434 |
+
โโโ eda/
|
| 435 |
+
โ โโโ Untitled.ipynb # Exploratory data analysis notebook
|
| 436 |
+
โ
|
| 437 |
+
โโโ README.md # This file
|
| 438 |
+
```
|
| 439 |
+
|
| 440 |
+
---
|
| 441 |
+
|
| 442 |
+
## ๐ง How It Works
|
| 443 |
+
|
| 444 |
+
### The Embedding Pipeline
|
| 445 |
+
|
| 446 |
+
```
|
| 447 |
+
Raw Text Input (Movie Title + Metadata)
|
| 448 |
+
โ
|
| 449 |
+
[SentenceTransformers]
|
| 450 |
+
โ
|
| 451 |
+
384-Dimensional Vector
|
| 452 |
+
โ
|
| 453 |
+
[L2 Normalization]
|
| 454 |
+
โ
|
| 455 |
+
Normalized Vector (Unit Length)
|
| 456 |
+
โ
|
| 457 |
+
[FAISS IndexFlatIP]
|
| 458 |
+
โ
|
| 459 |
+
Stored in Index
|
| 460 |
+
```
|
| 461 |
+
|
| 462 |
+
### Recommendation Flow
|
| 463 |
+
|
| 464 |
+
1. **User provides query** (text, tags, or movie titles)
|
| 465 |
+
2. **Convert to vector** using SentenceTransformers
|
| 466 |
+
3. **Normalize vector** (for cosine similarity)
|
| 467 |
+
4. **FAISS search** finds K nearest neighbors in index
|
| 468 |
+
5. **Return results** with similarity scores
|
| 469 |
+
|
| 470 |
+
### Why This Approach?
|
| 471 |
+
|
| 472 |
+
- **Fast**: FAISS is optimized for billion-scale vector search
|
| 473 |
+
- **Accurate**: Semantic embeddings capture meaning, not just keywords
|
| 474 |
+
- **Scalable**: Can handle millions of movies
|
| 475 |
+
- **CPU-Friendly**: MiniLM model is tiny but effective
|
| 476 |
+
- **Incremental**: Add movies without retraining
|
| 477 |
+
|
| 478 |
+
---
|
| 479 |
+
|
| 480 |
+
## ⚡ Performance Considerations
|
| 481 |
+
|
| 482 |
+
### Indexing Speed
|
| 483 |
+
- **MiniLM Model**: ~100-200 movies/second on modern CPU
|
| 484 |
+
- **FAISS Indexing**: Instant for additions
|
| 485 |
+
- **Memory**: ~1.5 KB per movie embedding (384 float32 values × 4 bytes)
|
| 486 |
+
|
| 487 |
+
### Search Speed
|
| 488 |
+
- **Single Query**: 1-5ms
|
| 489 |
+
- **Batch Queries**: Linear time complexity O(n)
|
| 490 |
+
- **Max Practical Size**: 10+ million movies
|
| 491 |
+
|
| 492 |
+
### Optimization Tips
|
| 493 |
+
|
| 494 |
+
1. **Use Batch Processing**: Send multiple queries at once
|
| 495 |
+
2. **Tune k Parameter**: Lower k = faster results (typically k=5-10 is good)
|
| 496 |
+
3. **CPU**: The MiniLM model leverages BLAS libraries for speed
|
| 497 |
+
4. **GPU**: Optional—can speed up embedding generation 10x
|
| 498 |
+
|
| 499 |
+
---
|
| 500 |
+
|
| 501 |
+
## 🐳 Deployment
|
| 502 |
+
|
| 503 |
+
### Docker Build & Run
|
| 504 |
+
|
| 505 |
+
```bash
|
| 506 |
+
# Build image
|
| 507 |
+
docker build -t cinematch:latest .
|
| 508 |
+
|
| 509 |
+
# Run container
|
| 510 |
+
docker run -p 8000:8000 \
|
| 511 |
+
-e TMDB_API_KEY=your_key \
|
| 512 |
+
cinematch:latest
|
| 513 |
+
```
|
| 514 |
+
|
| 515 |
+
### Production Deployment
|
| 516 |
+
|
| 517 |
+
The project includes a `Dockerfile` configured for production use:
|
| 518 |
+
- **Base Image**: Python 3.9+
|
| 519 |
+
- **Port**: 8000 (configurable)
|
| 520 |
+
- **Entry**: `python app.py`
|
| 521 |
+
|
| 522 |
+
For production, consider:
|
| 523 |
+
- Using **Gunicorn** or **Uvicorn** with multiple workers
|
| 524 |
+
- Adding **Nginx** reverse proxy
|
| 525 |
+
- Implementing **authentication** (API keys)
|
| 526 |
+
- Using **cloud storage** for models (S3, GCS)
|
| 527 |
+
|
| 528 |
+
---
|
| 529 |
+
|
| 530 |
+
## ๐ Troubleshooting
|
| 531 |
+
|
| 532 |
+
### Issue: "No model found" Error
|
| 533 |
+
|
| 534 |
+
**Solution**: Run data ingestion first:
|
| 535 |
+
```bash
|
| 536 |
+
python src/ingest.py
|
| 537 |
+
```
|
| 538 |
+
|
| 539 |
+
### Issue: TMDB API Key Invalid
|
| 540 |
+
|
| 541 |
+
**Solution**: Verify your `.env` file:
|
| 542 |
+
```bash
|
| 543 |
+
cat .env # Check the key is there
|
| 544 |
+
```
|
| 545 |
+
|
| 546 |
+
### Issue: Out of Memory
|
| 547 |
+
|
| 548 |
+
**Solution**: Reduce batch size in [recommender.py](src/recommender.py#L18):
|
| 549 |
+
```python
|
| 550 |
+
batch_size = 32 # Lower from 64
|
| 551 |
+
```
|
| 552 |
+
|
| 553 |
+
### Issue: Slow Embedding Generation
|
| 554 |
+
|
| 555 |
+
**Solution**:
|
| 556 |
+
- The MiniLM model is already optimized for CPU
|
| 557 |
+
- For GPU support, install PyTorch with CUDA:
|
| 558 |
+
```bash
|
| 559 |
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
| 560 |
+
```
|
| 561 |
+
|
| 562 |
+
### Issue: CORS Errors
|
| 563 |
+
|
| 564 |
+
**Solution**: Already handled in [app.py](app.py#L15). The API allows all origins (`allow_origins=["*"]`). For production, restrict this:
|
| 565 |
+
```python
|
| 566 |
+
allow_origins=["https://yourdomain.com"]
|
| 567 |
+
```
|
| 568 |
+
|
| 569 |
+
---
|
| 570 |
+
|
| 571 |
+
## ๐ Dataset Information
|
| 572 |
+
|
| 573 |
+
**Movie Source**: The Movie Database (TMDB) API
|
| 574 |
+
|
| 575 |
+
**Filtering Criteria**:
|
| 576 |
+
- Minimum Rating: 7.0 / 10.0
|
| 577 |
+
- Minimum Vote Count: 500 votes
|
| 578 |
+
- Sorted by: Popularity (descending)
|
| 579 |
+
|
| 580 |
+
**Metadata Included**:
|
| 581 |
+
- Title
|
| 582 |
+
- Director
|
| 583 |
+
- Cast (top 4 actors)
|
| 584 |
+
- Keywords
|
| 585 |
+
- Genres
|
| 586 |
+
- Overview / Plot
|
| 587 |
+
- Vote Average
|
| 588 |
+
|
| 589 |
+
---
|
| 590 |
+
|
| 591 |
+
## 🔮 Future Enhancements
|
| 592 |
+
|
| 593 |
+
- [ ] User authentication & API key management
|
| 594 |
+
- [ ] Collaborative filtering (user-user similarity)
|
| 595 |
+
- [ ] Real-time model updates with webhooks
|
| 596 |
+
- [ ] Advanced filtering (year, rating, runtime)
|
| 597 |
+
- [ ] Movie rating & feedback loop for model improvement
|
| 598 |
+
- [ ] Multi-language support
|
| 599 |
+
- [ ] Mobile app integration
|
| 600 |
+
|
| 601 |
+
---
|
| 602 |
+
|
| 603 |
+
## ๐ License
|
| 604 |
+
|
| 605 |
+
This project is open source. Feel free to modify and extend it!
|
| 606 |
+
|
| 607 |
+
---
|
| 608 |
+
|
| 609 |
+
## ๐ฌ Support
|
| 610 |
+
|
| 611 |
+
For issues, questions, or contributions:
|
| 612 |
+
1. Check the [Troubleshooting](#troubleshooting) section
|
| 613 |
+
2. Review the [API Documentation](http://localhost:8000/docs)
|
| 614 |
+
3. Examine the source code in `src/` directory
|
| 615 |
+
|
| 616 |
+
---
|
| 617 |
+
|
| 618 |
+
**Enjoy discovering your next favorite movie! 🍿🎬**
|
src/__pycache__/recommender.cpython-311.pyc
CHANGED
|
Binary files a/src/__pycache__/recommender.cpython-311.pyc and b/src/__pycache__/recommender.cpython-311.pyc differ
|
|
|
src/ingest.py
CHANGED
|
@@ -141,4 +141,4 @@ def ingest_high_quality_movies(target_count=50, reset=False):
|
|
| 141 |
|
| 142 |
if __name__ == "__main__":
|
| 143 |
# Reset=True ensures we rebuild the old movies with the NEW metadata
|
| 144 |
-
ingest_high_quality_movies(target_count=
|
|
|
|
| 141 |
|
| 142 |
if __name__ == "__main__":
|
| 143 |
# Reset=True ensures we rebuild the old movies with the NEW metadata
|
| 144 |
+
ingest_high_quality_movies(target_count=50, reset=False)
|
src/recommender.py
CHANGED
|
@@ -81,7 +81,7 @@ class MovieRecommender:
|
|
| 81 |
vec = self.encoder.encode([movie_row.iloc[0]['soup']])
|
| 82 |
return self.search(vec, k)
|
| 83 |
|
| 84 |
-
def recommend_on_text(self, text_query, k=
|
| 85 |
"""
|
| 86 |
Recommends movies based on a raw text description.
|
| 87 |
Example: "A romantic movie about a sinking ship" -> Titanic
|
|
|
|
| 81 |
vec = self.encoder.encode([movie_row.iloc[0]['soup']])
|
| 82 |
return self.search(vec, k)
|
| 83 |
|
| 84 |
+
def recommend_on_text(self, text_query, k=10):
|
| 85 |
"""
|
| 86 |
Recommends movies based on a raw text description.
|
| 87 |
Example: "A romantic movie about a sinking ship" -> Titanic
|
test.py
CHANGED
|
@@ -7,7 +7,7 @@ rec.load('models/')
|
|
| 7 |
# Assuming 'rec' is your loaded MovieRecommender instance
|
| 8 |
|
| 9 |
# Example 1: Vague description
|
| 10 |
-
print(rec.recommend_on_text("
|
| 11 |
|
| 12 |
|
| 13 |
# # Example 2: Specific vibe
|
|
|
|
| 7 |
# Assuming 'rec' is your loaded MovieRecommender instance
|
| 8 |
|
| 9 |
# Example 1: Vague description
|
| 10 |
+
print(rec.recommend_on_text("comedy about a group of friends going on an adventure"))
|
| 11 |
|
| 12 |
|
| 13 |
# # Example 2: Specific vibe
|
test2.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.recommender import MovieRecommender
|
| 2 |
+
|
| 3 |
+
def test_vibe():
|
| 4 |
+
# 1. Load the new brain
|
| 5 |
+
print("๐ง Loading the new high-quality brain...")
|
| 6 |
+
rec = MovieRecommender()
|
| 7 |
+
rec.load('models')
|
| 8 |
+
# print(f"✅ Loaded {len(rec.df)} movies.\n")
|
| 9 |
+
|
| 10 |
+
# 2. Define the Vibe
|
| 11 |
+
description = "christpher nolan style space adventure with mind bending visuals"
|
| 12 |
+
tags = ["Science Fiction", "Drama"]
|
| 13 |
+
|
| 14 |
+
# COMBINE THEM: Since your function only takes text, we mix them together.
|
| 15 |
+
# "Science Fiction Drama A space adventure..."
|
| 16 |
+
full_query = f"{' '.join(tags)} {description}"
|
| 17 |
+
|
| 18 |
+
print(f"๐ Searching for: '{full_query}'")
|
| 19 |
+
print("-" * 50)
|
| 20 |
+
|
| 21 |
+
# 3. Get Recommendations (Using YOUR function name)
|
| 22 |
+
results = rec.recommend_on_text(full_query, k=5)
|
| 23 |
+
|
| 24 |
+
# 4. Print results
|
| 25 |
+
for i, movie in enumerate(results):
|
| 26 |
+
print(f"{i+1}. {movie['title']} (Score: {movie['score']:.2f})")
|
| 27 |
+
|
| 28 |
+
if __name__ == "__main__":
|
| 29 |
+
test_vibe()
|