Upload 10 files
Browse files- README_movie_recommender.md +105 -0
- config.json +25 -0
- model.safetensors +3 -0
- modules.json +20 -0
- movie-recommendation-system.ipynb +1 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +65 -0
- vocab.txt +0 -0
README_movie_recommender.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Movie Recommendation System Using Content-Based Filtering
|
| 3 |
+
|
| 4 |
+
This repository hosts a content-based movie recommendation system built with Python. It uses metadata from a movie dataset to suggest similar movies based on features like genres, cast, crew, and keywords.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Model Details
|
| 9 |
+
|
| 10 |
+
- **Model Type:** Content-Based Recommendation System
|
| 11 |
+
- **Technique Used:** Cosine Similarity
|
| 12 |
+
- **Libraries:** pandas, scikit-learn, NumPy
|
| 13 |
+
- **Dataset:** TMDB 5000 Movie Dataset (or a similar metadata-rich dataset)
|
| 14 |
+
- **Task:** Movie Recommendation based on content similarity
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## Usage
|
| 19 |
+
|
| 20 |
+
### Installation
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
pip install pandas scikit-learn numpy
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### Running the Model
|
| 27 |
+
|
| 28 |
+
```python
|
| 29 |
+
import pandas as pd
|
| 30 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 31 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 32 |
+
|
| 33 |
+
# Load dataset
|
| 34 |
+
movies = pd.read_csv('movies.csv')
|
| 35 |
+
|
| 36 |
+
# Combine relevant features into a single string
|
| 37 |
+
movies['combined_features'] = movies['genres'] + ' ' + movies['keywords'] + ' ' + movies['cast'] + ' ' + movies['crew']
|
| 38 |
+
|
| 39 |
+
# Vectorize features
|
| 40 |
+
vectorizer = CountVectorizer()
|
| 41 |
+
feature_vectors = vectorizer.fit_transform(movies['combined_features'])
|
| 42 |
+
|
| 43 |
+
# Compute similarity matrix
|
| 44 |
+
similarity = cosine_similarity(feature_vectors)
|
| 45 |
+
|
| 46 |
+
# Define recommendation function
|
| 47 |
+
def recommend(movie_name):
|
| 48 |
+
    movie_index = movies[movies['title'] == movie_name].index[0]
|
| 49 |
+
    distances = similarity[movie_index]
|
| 50 |
+
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
|
| 51 |
+
    for i in movie_list:
|
| 52 |
+
        print(movies.iloc[i[0]].title)
|
| 53 |
+
|
| 54 |
+
# Example usage
|
| 55 |
+
recommend("Inception")
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
---
|
| 59 |
+
|
| 60 |
+
## Performance Metrics
|
| 61 |
+
|
| 62 |
+
This is a heuristic model and doesn't have standard ML performance metrics like accuracy or F1. Evaluation is subjective, based on perceived relevance and user satisfaction.
|
| 63 |
+
|
| 64 |
+
---
|
| 65 |
+
|
| 66 |
+
## Dataset Details
|
| 67 |
+
|
| 68 |
+
The dataset includes the following fields:
|
| 69 |
+
- Title
|
| 70 |
+
- Genres
|
| 71 |
+
- Keywords
|
| 72 |
+
- Cast
|
| 73 |
+
- Crew
|
| 74 |
+
|
| 75 |
+
Preprocessing includes:
|
| 76 |
+
- Removing nulls and duplicates
|
| 77 |
+
- Parsing nested JSON fields into readable text
|
| 78 |
+
- Combining features for vectorization
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
## Repository Structure
|
| 83 |
+
|
| 84 |
+
```
|
| 85 |
+
.
|
| 86 |
+
├── movies.csv # Dataset file
|
| 87 |
+
├── recommendation_system.ipynb # Main notebook
|
| 88 |
+
├── README.md # Documentation file
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## Limitations
|
| 94 |
+
|
| 95 |
+
- Not personalized; recommendations are the same for all users.
|
| 96 |
+
- Doesn't account for user ratings or feedback.
|
| 97 |
+
- Limited by the richness and correctness of metadata in the dataset.
|
| 98 |
+
|
| 99 |
+
---
|
| 100 |
+
|
| 101 |
+
## Contributing
|
| 102 |
+
|
| 103 |
+
Suggestions and improvements are welcome! Feel free to open issues or pull requests to help improve this project.
|
| 104 |
+
|
| 105 |
+
---
|
config.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertModel"
|
| 4 |
+
],
|
| 5 |
+
"attention_probs_dropout_prob": 0.1,
|
| 6 |
+
"classifier_dropout": null,
|
| 7 |
+
"gradient_checkpointing": false,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 384,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 1536,
|
| 13 |
+
"layer_norm_eps": 1e-12,
|
| 14 |
+
"max_position_embeddings": 512,
|
| 15 |
+
"model_type": "bert",
|
| 16 |
+
"num_attention_heads": 12,
|
| 17 |
+
"num_hidden_layers": 6,
|
| 18 |
+
"pad_token_id": 0,
|
| 19 |
+
"position_embedding_type": "absolute",
|
| 20 |
+
"torch_dtype": "float16",
|
| 21 |
+
"transformers_version": "4.51.3",
|
| 22 |
+
"type_vocab_size": 2,
|
| 23 |
+
"use_cache": true,
|
| 24 |
+
"vocab_size": 30522
|
| 25 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:176dd6e44ab1a04e2150a887324480eb0bc79fc47b05f5798dc78b79d7d4d80e
|
| 3 |
+
size 45437760
|
modules.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"idx": 0,
|
| 4 |
+
"name": "0",
|
| 5 |
+
"path": "",
|
| 6 |
+
"type": "sentence_transformers.models.Transformer"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"idx": 1,
|
| 10 |
+
"name": "1",
|
| 11 |
+
"path": "1_Pooling",
|
| 12 |
+
"type": "sentence_transformers.models.Pooling"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"idx": 2,
|
| 16 |
+
"name": "2",
|
| 17 |
+
"path": "2_Normalize",
|
| 18 |
+
"type": "sentence_transformers.models.Normalize"
|
| 19 |
+
}
|
| 20 |
+
]
|
movie-recommendation-system.ipynb
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.11","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":11794992,"sourceType":"datasetVersion","datasetId":7406585}],"dockerImageVersionId":31041,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\nos.environ[\"WANDB_DISABLED\"] = \"true\"# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:41.906695Z","iopub.execute_input":"2025-05-13T11:06:41.907442Z","iopub.status.idle":"2025-05-13T11:06:41.928930Z","shell.execute_reply.started":"2025-05-13T11:06:41.907401Z","shell.execute_reply":"2025-05-13T11:06:41.928172Z"}},"outputs":[{"name":"stdout","text":"/kaggle/input/movie-dataset/ml-100k/u.occupation\n/kaggle/input/movie-dataset/ml-100k/u1.base\n/kaggle/input/movie-dataset/ml-100k/u.info\n/kaggle/input/movie-dataset/ml-100k/u4.test\n/kaggle/input/movie-dataset/ml-100k/u.item\n/kaggle/input/movie-dataset/ml-100k/README\n/kaggle/input/movie-dataset/ml-100k/u1.test\n/kaggle/input/movie-dataset/ml-100k/ua.test\n/kaggle/input/movie-dataset/ml-100k/u.data\n/kaggle/input/movie-dataset/ml-100k/u5.test\n/kaggle/input/movie-dataset/ml-100k/mku.sh\n/kaggle/input/movie-dataset/ml-100k/u5.base\n/kaggle/input/movie-dataset/ml-100k/u.user\n/kaggle/input/movie-dataset/ml-100k/ub.base\n/kaggle/input/movie-dataset/ml-100k/u4.base\n/kaggle/input/movie-dataset/ml-100k/u2.test\n/kaggle/input/movie-dataset/ml-100k/ua.base\n/kaggle/input/movie-dataset/ml-100k/u3.test\n/kaggle/input/movie-dataset/ml-100k/u.genre\n/kaggle/input/movie-dataset/ml-100k/allbut.pl\n/kaggle/input/movie-dataset/ml-100k/u3.base\n/kaggle/input/movie-dataset/ml-100k/u2.base\n/kaggle/input/movie-dataset/ml-100k/ub.test\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"!pip install pandas torch sentence-transformers scikit-learn\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:41.930136Z","iopub.execute_input":"2025-05-13T11:06:41.930340Z","iopub.status.idle":"2025-05-13T11:06:45.209043Z","shell.execute_reply.started":"2025-05-13T11:06:41.930325Z","shell.execute_reply":"2025-05-13T11:06:45.208262Z"}},"outputs":[{"name":"stderr","text":"huggingface/tokenizers: 
The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\nTo disable this warning, you can either:\n\t- Avoid using `tokenizers` before the fork if possible\n\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n","output_type":"stream"},{"name":"stdout","text":"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.3)\nRequirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.6.0+cu124)\nRequirement already satisfied: sentence-transformers in /usr/local/lib/python3.11/dist-packages (3.4.1)\nRequirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.2.2)\nRequirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (1.26.4)\nRequirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.9.0.post0)\nRequirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)\nRequirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)\nRequirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.18.0)\nRequirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.13.2)\nRequirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\nRequirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.6)\nRequirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2025.3.2)\nRequirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\nRequirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from 
torch) (12.4.127)\nRequirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\nRequirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\nRequirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.5.8)\nRequirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch) (11.2.1.3)\nRequirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.5.147)\nRequirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch) (11.6.1.9)\nRequirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch) (12.3.1.170)\nRequirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.11/dist-packages (from torch) (0.6.2)\nRequirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\nRequirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\nRequirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\nRequirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.2.0)\nRequirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\nRequirement already satisfied: transformers<5.0.0,>=4.41.0 in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (4.51.3)\nRequirement already satisfied: tqdm in 
/usr/local/lib/python3.11/dist-packages (from sentence-transformers) (4.67.1)\nRequirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (1.15.2)\nRequirement already satisfied: huggingface-hub>=0.20.0 in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (0.31.1)\nRequirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from sentence-transformers) (11.1.0)\nRequirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.5.0)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.6.0)\nRequirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (25.0)\nRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.2)\nRequirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3)\nRequirement already satisfied: hf-xet<2.0.0,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.0)\nRequirement already satisfied: mkl_fft in /usr/local/lib/python3.11/dist-packages (from numpy>=1.23.2->pandas) (1.3.8)\nRequirement already satisfied: mkl_random in /usr/local/lib/python3.11/dist-packages (from numpy>=1.23.2->pandas) (1.2.4)\nRequirement already satisfied: mkl_umath in /usr/local/lib/python3.11/dist-packages (from numpy>=1.23.2->pandas) (0.1.1)\nRequirement already satisfied: mkl in /usr/local/lib/python3.11/dist-packages (from numpy>=1.23.2->pandas) (2025.1.0)\nRequirement already satisfied: tbb4py in /usr/local/lib/python3.11/dist-packages (from numpy>=1.23.2->pandas) (2022.1.0)\nRequirement already satisfied: mkl-service in /usr/local/lib/python3.11/dist-packages (from 
numpy>=1.23.2->pandas) (2.4.1)\nRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\nRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2024.11.6)\nRequirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.21.1)\nRequirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.3)\nRequirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\nRequirement already satisfied: intel-openmp<2026,>=2024 in /usr/local/lib/python3.11/dist-packages (from mkl->numpy>=1.23.2->pandas) (2024.2.0)\nRequirement already satisfied: tbb==2022.* in /usr/local/lib/python3.11/dist-packages (from mkl->numpy>=1.23.2->pandas) (2022.1.0)\nRequirement already satisfied: tcmlib==1.* in /usr/local/lib/python3.11/dist-packages (from tbb==2022.*->mkl->numpy>=1.23.2->pandas) (1.3.0)\nRequirement already satisfied: intel-cmplr-lib-rt in /usr/local/lib/python3.11/dist-packages (from mkl_umath->numpy>=1.23.2->pandas) (2024.2.0)\nRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.4.2)\nRequirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.4.0)\nRequirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) 
(2025.4.26)\nRequirement already satisfied: intel-cmplr-lib-ur==2024.2.0 in /usr/local/lib/python3.11/dist-packages (from intel-openmp<2026,>=2024->mkl->numpy>=1.23.2->pandas) (2024.2.0)\n","output_type":"stream"}],"execution_count":11},{"cell_type":"code","source":"import pandas as pd\nimport torch\nfrom sentence_transformers import SentenceTransformer, InputExample, losses, util\nfrom torch.utils.data import DataLoader\nfrom sklearn.metrics import precision_score, recall_score\nimport random\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.210129Z","iopub.execute_input":"2025-05-13T11:06:45.210455Z","iopub.status.idle":"2025-05-13T11:06:45.215187Z","shell.execute_reply.started":"2025-05-13T11:06:45.210419Z","shell.execute_reply":"2025-05-13T11:06:45.214336Z"}},"outputs":[],"execution_count":12},{"cell_type":"code","source":"device = 'cuda' if torch.cuda.is_available() else 'cpu'\nprint(\"Using device:\", device)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.216893Z","iopub.execute_input":"2025-05-13T11:06:45.217321Z","iopub.status.idle":"2025-05-13T11:06:45.228457Z","shell.execute_reply.started":"2025-05-13T11:06:45.217304Z","shell.execute_reply":"2025-05-13T11:06:45.227901Z"}},"outputs":[{"name":"stdout","text":"Using device: cuda\n","output_type":"stream"}],"execution_count":13},{"cell_type":"code","source":"ratings = pd.read_csv(\"/kaggle/input/movie-dataset/ml-100k/u.data\", sep=\"\\t\", names=[\"user_id\", \"movie_id\", \"rating\", \"timestamp\"])\nmovies = pd.read_csv(\"/kaggle/input/movie-dataset/ml-100k/u.item\", sep=\"|\", encoding=\"latin-1\", header=None,\n names=[\"movie_id\", \"title\", \"release_date\", \"video_release_date\", \"IMDb_URL\",\n \"unknown\", \"Action\", \"Adventure\", \"Animation\", \"Children's\", \"Comedy\", \n \"Crime\", \"Documentary\", \"Drama\", \"Fantasy\", \"Film-Noir\", \"Horror\", \n \"Musical\", \"Mystery\", \"Romance\", \"Sci-Fi\", \"Thriller\", 
\"War\", \"Western\"])\n\ngenre_cols = [\"Action\", \"Adventure\", \"Animation\", \"Children's\", \"Comedy\", \n \"Crime\", \"Documentary\", \"Drama\", \"Fantasy\", \"Film-Noir\", \n \"Horror\", \"Musical\", \"Mystery\", \"Romance\", \"Sci-Fi\", \"Thriller\", \n \"War\", \"Western\"]\n\nmovies[\"genres\"] = movies[genre_cols].apply(lambda row: ', '.join([genre for genre in genre_cols if row[genre] == 1]), axis=1)\nmovies[\"text\"] = movies[\"title\"] + \" | \" + movies[\"genres\"]\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.229146Z","iopub.execute_input":"2025-05-13T11:06:45.229314Z","iopub.status.idle":"2025-05-13T11:06:45.349883Z","shell.execute_reply.started":"2025-05-13T11:06:45.229301Z","shell.execute_reply":"2025-05-13T11:06:45.349311Z"}},"outputs":[],"execution_count":14},{"cell_type":"code","source":"# Sample 20K ratings and get movie subset\nratings_20k = ratings.sample(n=20000, random_state=42)\nmovie_subset = movies[movies[\"movie_id\"].isin(ratings_20k[\"movie_id\"].unique())].copy().reset_index(drop=True)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.350621Z","iopub.execute_input":"2025-05-13T11:06:45.350842Z","iopub.status.idle":"2025-05-13T11:06:45.362588Z","shell.execute_reply.started":"2025-05-13T11:06:45.350825Z","shell.execute_reply":"2025-05-13T11:06:45.361948Z"}},"outputs":[],"execution_count":15},{"cell_type":"code","source":"# Create a dictionary: genre -> list of indices\ngenre_to_indices = {}\nfor idx, row in movie_subset.iterrows():\n for genre in row['genres'].split(', '):\n genre_to_indices.setdefault(genre, 
[]).append(idx)\n\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.363330Z","iopub.execute_input":"2025-05-13T11:06:45.363612Z","iopub.status.idle":"2025-05-13T11:06:45.424972Z","shell.execute_reply.started":"2025-05-13T11:06:45.363585Z","shell.execute_reply":"2025-05-13T11:06:45.424485Z"}},"outputs":[],"execution_count":16},{"cell_type":"code","source":"triplets = []\n\nfor anchor_idx, anchor_row in movie_subset.iterrows():\n anchor_text = anchor_row[\"text\"]\n anchor_genres = anchor_row[\"genres\"].split(', ')\n \n # Positive: same genre\n possible_positives = set()\n for g in anchor_genres:\n possible_positives.update(genre_to_indices.get(g, []))\n possible_positives.discard(anchor_idx)\n \n if not possible_positives:\n continue\n positive_idx = random.choice(list(possible_positives))\n positive_text = movie_subset.iloc[positive_idx][\"text\"]\n\n # Negative: different genre\n all_indices = set(range(len(movie_subset)))\n same_genre_indices = {i for g in anchor_genres for i in genre_to_indices.get(g, [])}\n possible_negatives = list(all_indices - same_genre_indices)\n if not possible_negatives:\n continue\n negative_idx = random.choice(possible_negatives)\n negative_text = movie_subset.iloc[negative_idx][\"text\"]\n\n triplets.append(InputExample(texts=[anchor_text, positive_text, negative_text]))\n\nprint(f\"Generated {len(triplets)} triplets.\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.425539Z","iopub.execute_input":"2025-05-13T11:06:45.425705Z","iopub.status.idle":"2025-05-13T11:06:45.939253Z","shell.execute_reply.started":"2025-05-13T11:06:45.425692Z","shell.execute_reply":"2025-05-13T11:06:45.938578Z"}},"outputs":[{"name":"stdout","text":"Generated 1410 triplets.\n","output_type":"stream"}],"execution_count":17},{"cell_type":"code","source":"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)\ntrain_dataloader = DataLoader(triplets, shuffle=True, 
batch_size=16)\ntrain_loss = losses.TripletLoss(model)\n\nmodel.fit(\n train_objectives=[(train_dataloader, train_loss)],\n epochs=2,\n warmup_steps=100,\n show_progress_bar=True\n)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:06:45.941007Z","iopub.execute_input":"2025-05-13T11:06:45.941227Z","iopub.status.idle":"2025-05-13T11:07:00.644338Z","shell.execute_reply.started":"2025-05-13T11:06:45.941211Z","shell.execute_reply":"2025-05-13T11:07:00.643633Z"}},"outputs":[{"name":"stderr","text":"Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).\nUsing the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":""}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<IPython.core.display.HTML object>","text/html":"\n <div>\n \n <progress value='90' max='90' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [90/90 00:11, Epoch 2/2]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Step</th>\n <th>Training Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"},"metadata":{}}],"execution_count":18},{"cell_type":"code","source":"movie_embeddings = model.encode(movie_subset[\"text\"].tolist(), convert_to_tensor=True, 
device=device)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:07:15.033833Z","iopub.execute_input":"2025-05-13T11:07:15.034158Z","iopub.status.idle":"2025-05-13T11:07:15.410689Z","shell.execute_reply.started":"2025-05-13T11:07:15.034136Z","shell.execute_reply":"2025-05-13T11:07:15.409893Z"}},"outputs":[{"output_type":"display_data","data":{"text/plain":"Batches: 0%| | 0/45 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7bd11c3ed0104bc584d977e3ed837421"}},"metadata":{}}],"execution_count":19},{"cell_type":"code","source":"from difflib import get_close_matches\n\ndef recommend_by_movie_name(movie_name, top_k=5):\n titles = movie_subset[\"title\"].tolist()\n matches = get_close_matches(movie_name, titles, n=1, cutoff=0.6)\n \n if not matches:\n print(f\"❌ Movie '{movie_name}' not found in dataset.\")\n return\n \n matched_title = matches[0]\n movie_index = movie_subset[movie_subset[\"title\"] == matched_title].index[0]\n \n query_embedding = movie_embeddings[movie_index]\n scores = util.pytorch_cos_sim(query_embedding, movie_embeddings)[0]\n top_results = torch.topk(scores, k=top_k + 1)\n\n print(f\"\\n🎬 Recommendations for: {matched_title}\")\n for score, idx_tensor in zip(top_results[0][1:], top_results[1][1:]): # skip itself\n idx = idx_tensor.item() # ✅ Convert tensor to int\n title = movie_subset.iloc[idx][\"title\"]\n print(f\" {title} (Score: {score:.4f})\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:10:22.535057Z","iopub.execute_input":"2025-05-13T11:10:22.535746Z","iopub.status.idle":"2025-05-13T11:10:22.541199Z","shell.execute_reply.started":"2025-05-13T11:10:22.535721Z","shell.execute_reply":"2025-05-13T11:10:22.540469Z"}},"outputs":[],"execution_count":23},{"cell_type":"code","source":"recommend_by_movie_name(\"Toy 
Story\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:10:43.788134Z","iopub.execute_input":"2025-05-13T11:10:43.788936Z","iopub.status.idle":"2025-05-13T11:10:43.800396Z","shell.execute_reply.started":"2025-05-13T11:10:43.788911Z","shell.execute_reply":"2025-05-13T11:10:43.799304Z"}},"outputs":[{"name":"stdout","text":"\n🎬 Recommendations for: Toy Story (1995)\n Grand Day Out, A (1992) (Score: 0.9964)\n Wrong Trousers, The (1993) (Score: 0.9961)\n Aladdin and the King of Thieves (1996) (Score: 0.9958)\n Santa Clause, The (1994) (Score: 0.9944)\n Beavis and Butt-head Do America (1996) (Score: 0.9944)\n","output_type":"stream"}],"execution_count":24},{"cell_type":"code","source":"recommend_by_movie_name(\"Star Wars\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:11:04.754351Z","iopub.execute_input":"2025-05-13T11:11:04.754941Z","iopub.status.idle":"2025-05-13T11:11:04.765782Z","shell.execute_reply.started":"2025-05-13T11:11:04.754919Z","shell.execute_reply":"2025-05-13T11:11:04.765098Z"}},"outputs":[{"name":"stdout","text":"\n🎬 Recommendations for: Star Wars (1977)\n Return of the Jedi (1983) (Score: 0.9921)\n Starship Troopers (1997) (Score: 0.9697)\n Star Trek: The Wrath of Khan (1982) (Score: 0.9516)\n African Queen, The (1951) (Score: 0.9506)\n Stargate (1994) (Score: 0.9505)\n","output_type":"stream"}],"execution_count":25},{"cell_type":"code","source":"model_save_path = \"fine_tuned_movie_model\"\nmodel.save(model_save_path)\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:12:57.600201Z","iopub.execute_input":"2025-05-13T11:12:57.600968Z","iopub.status.idle":"2025-05-13T11:12:57.887259Z","shell.execute_reply.started":"2025-05-13T11:12:57.600941Z","shell.execute_reply":"2025-05-13T11:12:57.886732Z"}},"outputs":[],"execution_count":26},{"cell_type":"code","source":"quantized_model = model.to(dtype=torch.float16, 
device=device)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:14:23.425667Z","iopub.execute_input":"2025-05-13T11:14:23.425935Z","iopub.status.idle":"2025-05-13T11:14:23.433812Z","shell.execute_reply.started":"2025-05-13T11:14:23.425917Z","shell.execute_reply":"2025-05-13T11:14:23.433212Z"}},"outputs":[],"execution_count":27},{"cell_type":"code","source":"quantized_model.save_pretrained('quantized-model')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:14:41.626374Z","iopub.execute_input":"2025-05-13T11:14:41.626919Z","iopub.status.idle":"2025-05-13T11:14:41.770979Z","shell.execute_reply.started":"2025-05-13T11:14:41.626896Z","shell.execute_reply":"2025-05-13T11:14:41.770242Z"}},"outputs":[],"execution_count":28},{"cell_type":"code","source":"movie_embeddings = quantized_model.encode(movie_subset[\"text\"].tolist(), convert_to_tensor=True, device=device)\nrecommend_by_movie_name(\"Star Wars\")\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-05-13T11:18:08.946660Z","iopub.execute_input":"2025-05-13T11:18:08.947551Z","iopub.status.idle":"2025-05-13T11:18:09.526339Z","shell.execute_reply.started":"2025-05-13T11:18:08.947516Z","shell.execute_reply":"2025-05-13T11:18:09.525652Z"}},"outputs":[{"output_type":"display_data","data":{"text/plain":"Batches: 0%| | 0/45 [00:00<?, ?it/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6b12f59b0a3c4887825cd157de824f0b"}},"metadata":{}},{"name":"stdout","text":"\n🎬 Recommendations for: Star Wars (1977)\n Return of the Jedi (1983) (Score: 0.9927)\n Starship Troopers (1997) (Score: 0.9702)\n Star Trek: The Wrath of Khan (1982) (Score: 0.9521)\n African Queen, The (1951) (Score: 0.9512)\n Stargate (1994) (Score: 0.9507)\n","output_type":"stream"}],"execution_count":29}]}
|
sentence_bert_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"max_seq_length": 256,
|
| 3 |
+
"do_lower_case": false
|
| 4 |
+
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": {
|
| 3 |
+
"content": "[CLS]",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"mask_token": {
|
| 10 |
+
"content": "[MASK]",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "[PAD]",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
},
|
| 23 |
+
"sep_token": {
|
| 24 |
+
"content": "[SEP]",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"unk_token": {
|
| 31 |
+
"content": "[UNK]",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
}
|
| 37 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_basic_tokenize": true,
|
| 47 |
+
"do_lower_case": true,
|
| 48 |
+
"extra_special_tokens": {},
|
| 49 |
+
"mask_token": "[MASK]",
|
| 50 |
+
"max_length": 128,
|
| 51 |
+
"model_max_length": 256,
|
| 52 |
+
"never_split": null,
|
| 53 |
+
"pad_to_multiple_of": null,
|
| 54 |
+
"pad_token": "[PAD]",
|
| 55 |
+
"pad_token_type_id": 0,
|
| 56 |
+
"padding_side": "right",
|
| 57 |
+
"sep_token": "[SEP]",
|
| 58 |
+
"stride": 0,
|
| 59 |
+
"strip_accents": null,
|
| 60 |
+
"tokenize_chinese_chars": true,
|
| 61 |
+
"tokenizer_class": "BertTokenizer",
|
| 62 |
+
"truncation_side": "right",
|
| 63 |
+
"truncation_strategy": "longest_first",
|
| 64 |
+
"unk_token": "[UNK]"
|
| 65 |
+
}
|
vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|