Smiley0707 committed on
Commit
8a794d1
·
verified ·
1 Parent(s): 1e75d5a

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +3 -0
  2. TMDB.csv +3 -0
  3. feature_array.npz +3 -0
  4. meow.ipynb +102 -0
  5. movie.ipynb +1674 -0
  6. movies_df.csv +3 -0
  7. my_index.ann +3 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ movies_df.csv filter=lfs diff=lfs merge=lfs -text
37
+ my_index.ann filter=lfs diff=lfs merge=lfs -text
38
+ TMDB.csv filter=lfs diff=lfs merge=lfs -text
TMDB.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbb74d9630803fa76200511ee5fed3f23193b9ffdcb5999578f735962bcbcb57
3
+ size 570252856
feature_array.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4211dd1420c9a800b52de53d7e82417c2fe223def70ef57b1624a024f7f9fe21
3
+ size 88777756
meow.ipynb ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "3afa097a",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Time and Space\n",
14
+ "A Monster From Space\n",
15
+ "Budhayaan\n",
16
+ "Manhunt in Space\n",
17
+ "Journey to Space\n"
18
+ ]
19
+ }
20
+ ],
21
+ "source": [
22
+ "from annoy import AnnoyIndex\n",
23
+ "import numpy as np\n",
24
+ "import pandas as pd\n",
25
+ "\n",
26
+ "# Load your movie data and feature shape\n",
27
+ "new_movies = pd.read_csv('movies_df.csv')\n",
28
+ "feature_array = np.load('feature_array.npz')['arr_0']\n",
29
+ "f = feature_array.shape[1] # Feature dimension\n",
30
+ "\n",
31
+ "# Create Annoy index and load from file\n",
32
+ "annoy_index = AnnoyIndex(f, 'angular')\n",
33
+ "annoy_index.load('my_index.ann')\n",
34
+ "\n",
35
+ "def get_movie_title(movie_id):\n",
36
+ " return new_movies.iloc[movie_id]['title']\n",
37
+ "\n",
38
+ "def recommend_fast(movie_id, top_k=5):\n",
39
+ " idxs = annoy_index.get_nns_by_item(movie_id, top_k+1)[1:] # skip itself\n",
40
+ " return idxs\n",
41
+ "\n",
42
+ "# Example: Get recommendations for any movie\n",
43
+ "recommended_movies = recommend_fast(1)\n",
44
+ "for mv in recommended_movies:\n",
45
+ " print(get_movie_title(mv))\n"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 3,
51
+ "id": "39068f98",
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "Time and Space\n",
59
+ "A Monster From Space\n",
60
+ "Budhayaan\n",
61
+ "Manhunt in Space\n",
62
+ "Journey to Space\n"
63
+ ]
64
+ }
65
+ ],
66
+ "source": [
67
+ "recommended_movies = recommend_fast(1)\n",
68
+ "for mv in recommended_movies:\n",
69
+ " print(get_movie_title(mv))\n"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "f22a4417",
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": []
79
+ }
80
+ ],
81
+ "metadata": {
82
+ "kernelspec": {
83
+ "display_name": "Python 3",
84
+ "language": "python",
85
+ "name": "python3"
86
+ },
87
+ "language_info": {
88
+ "codemirror_mode": {
89
+ "name": "ipython",
90
+ "version": 3
91
+ },
92
+ "file_extension": ".py",
93
+ "mimetype": "text/x-python",
94
+ "name": "python",
95
+ "nbconvert_exporter": "python",
96
+ "pygments_lexer": "ipython3",
97
+ "version": "3.13.9"
98
+ }
99
+ },
100
+ "nbformat": 4,
101
+ "nbformat_minor": 5
102
+ }
movie.ipynb ADDED
@@ -0,0 +1,1674 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "c9c09288",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Defaulting to user installation because normal site-packages is not writeable\n",
14
+ "Requirement already satisfied: pandas in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (2.3.2)\n",
15
+ "Requirement already satisfied: numpy in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (2.2.4)\n",
16
+ "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from pandas) (2.9.0.post0)\n",
17
+ "Requirement already satisfied: pytz>=2020.1 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from pandas) (2025.2)\n",
18
+ "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from pandas) (2025.2)\n",
19
+ "Requirement already satisfied: six>=1.5 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
20
+ ]
21
+ },
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "\n",
27
+ "[notice] A new release of pip is available: 25.0.1 -> 25.2\n",
28
+ "[notice] To update, run: C:\\Users\\unkno\\AppData\\Local\\Microsoft\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\python.exe -m pip install --upgrade pip\n"
29
+ ]
30
+ }
31
+ ],
32
+ "source": [
33
+ "!pip install pandas numpy"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 2,
39
+ "id": "0f6a3484",
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "import pandas as pd"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 3,
49
+ "id": "ae7debab",
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "data": {
54
+ "text/html": [
55
+ "<div>\n",
56
+ "<style scoped>\n",
57
+ " .dataframe tbody tr th:only-of-type {\n",
58
+ " vertical-align: middle;\n",
59
+ " }\n",
60
+ "\n",
61
+ " .dataframe tbody tr th {\n",
62
+ " vertical-align: top;\n",
63
+ " }\n",
64
+ "\n",
65
+ " .dataframe thead th {\n",
66
+ " text-align: right;\n",
67
+ " }\n",
68
+ "</style>\n",
69
+ "<table border=\"1\" class=\"dataframe\">\n",
70
+ " <thead>\n",
71
+ " <tr style=\"text-align: right;\">\n",
72
+ " <th></th>\n",
73
+ " <th>id</th>\n",
74
+ " <th>title</th>\n",
75
+ " <th>vote_average</th>\n",
76
+ " <th>vote_count</th>\n",
77
+ " <th>status</th>\n",
78
+ " <th>release_date</th>\n",
79
+ " <th>revenue</th>\n",
80
+ " <th>runtime</th>\n",
81
+ " <th>adult</th>\n",
82
+ " <th>backdrop_path</th>\n",
83
+ " <th>...</th>\n",
84
+ " <th>original_title</th>\n",
85
+ " <th>overview</th>\n",
86
+ " <th>popularity</th>\n",
87
+ " <th>poster_path</th>\n",
88
+ " <th>tagline</th>\n",
89
+ " <th>genres</th>\n",
90
+ " <th>production_companies</th>\n",
91
+ " <th>production_countries</th>\n",
92
+ " <th>spoken_languages</th>\n",
93
+ " <th>keywords</th>\n",
94
+ " </tr>\n",
95
+ " </thead>\n",
96
+ " <tbody>\n",
97
+ " <tr>\n",
98
+ " <th>0</th>\n",
99
+ " <td>27205</td>\n",
100
+ " <td>Inception</td>\n",
101
+ " <td>8.364</td>\n",
102
+ " <td>34495</td>\n",
103
+ " <td>Released</td>\n",
104
+ " <td>2010-07-15</td>\n",
105
+ " <td>825532764</td>\n",
106
+ " <td>148</td>\n",
107
+ " <td>False</td>\n",
108
+ " <td>/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg</td>\n",
109
+ " <td>...</td>\n",
110
+ " <td>Inception</td>\n",
111
+ " <td>Cobb, a skilled thief who commits corporate es...</td>\n",
112
+ " <td>83.952</td>\n",
113
+ " <td>/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg</td>\n",
114
+ " <td>Your mind is the scene of the crime.</td>\n",
115
+ " <td>Action, Science Fiction, Adventure</td>\n",
116
+ " <td>Legendary Pictures, Syncopy, Warner Bros. Pict...</td>\n",
117
+ " <td>United Kingdom, United States of America</td>\n",
118
+ " <td>English, French, Japanese, Swahili</td>\n",
119
+ " <td>rescue, mission, dream, airplane, paris, franc...</td>\n",
120
+ " </tr>\n",
121
+ " </tbody>\n",
122
+ "</table>\n",
123
+ "<p>1 rows × 24 columns</p>\n",
124
+ "</div>"
125
+ ],
126
+ "text/plain": [
127
+ " id title vote_average vote_count status release_date \\\n",
128
+ "0 27205 Inception 8.364 34495 Released 2010-07-15 \n",
129
+ "\n",
130
+ " revenue runtime adult backdrop_path ... \\\n",
131
+ "0 825532764 148 False /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg ... \n",
132
+ "\n",
133
+ " original_title overview \\\n",
134
+ "0 Inception Cobb, a skilled thief who commits corporate es... \n",
135
+ "\n",
136
+ " popularity poster_path \\\n",
137
+ "0 83.952 /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg \n",
138
+ "\n",
139
+ " tagline genres \\\n",
140
+ "0 Your mind is the scene of the crime. Action, Science Fiction, Adventure \n",
141
+ "\n",
142
+ " production_companies \\\n",
143
+ "0 Legendary Pictures, Syncopy, Warner Bros. Pict... \n",
144
+ "\n",
145
+ " production_countries \\\n",
146
+ "0 United Kingdom, United States of America \n",
147
+ "\n",
148
+ " spoken_languages \\\n",
149
+ "0 English, French, Japanese, Swahili \n",
150
+ "\n",
151
+ " keywords \n",
152
+ "0 rescue, mission, dream, airplane, paris, franc... \n",
153
+ "\n",
154
+ "[1 rows x 24 columns]"
155
+ ]
156
+ },
157
+ "execution_count": 3,
158
+ "metadata": {},
159
+ "output_type": "execute_result"
160
+ }
161
+ ],
162
+ "source": [
163
+ "movies = pd.read_csv(\"TMDB.csv\")\n",
164
+ "movies.head(1)"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 4,
170
+ "id": "0fece246",
171
+ "metadata": {},
172
+ "outputs": [
173
+ {
174
+ "data": {
175
+ "text/html": [
176
+ "<div>\n",
177
+ "<style scoped>\n",
178
+ " .dataframe tbody tr th:only-of-type {\n",
179
+ " vertical-align: middle;\n",
180
+ " }\n",
181
+ "\n",
182
+ " .dataframe tbody tr th {\n",
183
+ " vertical-align: top;\n",
184
+ " }\n",
185
+ "\n",
186
+ " .dataframe thead th {\n",
187
+ " text-align: right;\n",
188
+ " }\n",
189
+ "</style>\n",
190
+ "<table border=\"1\" class=\"dataframe\">\n",
191
+ " <thead>\n",
192
+ " <tr style=\"text-align: right;\">\n",
193
+ " <th></th>\n",
194
+ " <th>id</th>\n",
195
+ " <th>title</th>\n",
196
+ " <th>overview</th>\n",
197
+ " <th>poster_path</th>\n",
198
+ " <th>backdrop_path</th>\n",
199
+ " <th>tagline</th>\n",
200
+ " <th>genres</th>\n",
201
+ " <th>keywords</th>\n",
202
+ " </tr>\n",
203
+ " </thead>\n",
204
+ " <tbody>\n",
205
+ " <tr>\n",
206
+ " <th>0</th>\n",
207
+ " <td>27205</td>\n",
208
+ " <td>Inception</td>\n",
209
+ " <td>Cobb, a skilled thief who commits corporate es...</td>\n",
210
+ " <td>/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg</td>\n",
211
+ " <td>/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg</td>\n",
212
+ " <td>Your mind is the scene of the crime.</td>\n",
213
+ " <td>Action, Science Fiction, Adventure</td>\n",
214
+ " <td>rescue, mission, dream, airplane, paris, franc...</td>\n",
215
+ " </tr>\n",
216
+ " <tr>\n",
217
+ " <th>1</th>\n",
218
+ " <td>157336</td>\n",
219
+ " <td>Interstellar</td>\n",
220
+ " <td>The adventures of a group of explorers who mak...</td>\n",
221
+ " <td>/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg</td>\n",
222
+ " <td>/pbrkL804c8yAv3zBZR4QPEafpAR.jpg</td>\n",
223
+ " <td>Mankind was born on Earth. It was never meant ...</td>\n",
224
+ " <td>Adventure, Drama, Science Fiction</td>\n",
225
+ " <td>rescue, future, spacecraft, race against time,...</td>\n",
226
+ " </tr>\n",
227
+ " <tr>\n",
228
+ " <th>2</th>\n",
229
+ " <td>155</td>\n",
230
+ " <td>The Dark Knight</td>\n",
231
+ " <td>Batman raises the stakes in his war on crime. ...</td>\n",
232
+ " <td>/qJ2tW6WMUDux911r6m7haRef0WH.jpg</td>\n",
233
+ " <td>/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg</td>\n",
234
+ " <td>Welcome to a world without rules.</td>\n",
235
+ " <td>Drama, Action, Crime, Thriller</td>\n",
236
+ " <td>joker, sadism, chaos, secret identity, crime f...</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>3</th>\n",
240
+ " <td>19995</td>\n",
241
+ " <td>Avatar</td>\n",
242
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
243
+ " <td>/kyeqWdyUXW608qlYkRqosgbbJyK.jpg</td>\n",
244
+ " <td>/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg</td>\n",
245
+ " <td>Enter the world of Pandora.</td>\n",
246
+ " <td>Action, Adventure, Fantasy, Science Fiction</td>\n",
247
+ " <td>future, society, culture clash, space travel, ...</td>\n",
248
+ " </tr>\n",
249
+ " <tr>\n",
250
+ " <th>4</th>\n",
251
+ " <td>24428</td>\n",
252
+ " <td>The Avengers</td>\n",
253
+ " <td>When an unexpected enemy emerges and threatens...</td>\n",
254
+ " <td>/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg</td>\n",
255
+ " <td>/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg</td>\n",
256
+ " <td>Some assembly required.</td>\n",
257
+ " <td>Science Fiction, Action, Adventure</td>\n",
258
+ " <td>new york city, superhero, shield, based on com...</td>\n",
259
+ " </tr>\n",
260
+ " <tr>\n",
261
+ " <th>...</th>\n",
262
+ " <td>...</td>\n",
263
+ " <td>...</td>\n",
264
+ " <td>...</td>\n",
265
+ " <td>...</td>\n",
266
+ " <td>...</td>\n",
267
+ " <td>...</td>\n",
268
+ " <td>...</td>\n",
269
+ " <td>...</td>\n",
270
+ " </tr>\n",
271
+ " <tr>\n",
272
+ " <th>1266860</th>\n",
273
+ " <td>796654</td>\n",
274
+ " <td>Numéro Complémentaire</td>\n",
275
+ " <td>NaN</td>\n",
276
+ " <td>/xR5oBqeGUHszPSuzhf6Xqb74UT2.jpg</td>\n",
277
+ " <td>NaN</td>\n",
278
+ " <td>NaN</td>\n",
279
+ " <td>Comedy</td>\n",
280
+ " <td>NaN</td>\n",
281
+ " </tr>\n",
282
+ " <tr>\n",
283
+ " <th>1266861</th>\n",
284
+ " <td>796656</td>\n",
285
+ " <td>Heimat Tansania - Unter dem Kilimandscharo</td>\n",
286
+ " <td>NaN</td>\n",
287
+ " <td>NaN</td>\n",
288
+ " <td>NaN</td>\n",
289
+ " <td>NaN</td>\n",
290
+ " <td>Documentary</td>\n",
291
+ " <td>NaN</td>\n",
292
+ " </tr>\n",
293
+ " <tr>\n",
294
+ " <th>1266862</th>\n",
295
+ " <td>796657</td>\n",
296
+ " <td>Car Crash: Who's Lying?</td>\n",
297
+ " <td>Documentary exploring the aftermath of a car c...</td>\n",
298
+ " <td>/yatOGyk6OZKtfZQlRIBnHYwfSHD.jpg</td>\n",
299
+ " <td>NaN</td>\n",
300
+ " <td>NaN</td>\n",
301
+ " <td>Documentary</td>\n",
302
+ " <td>NaN</td>\n",
303
+ " </tr>\n",
304
+ " <tr>\n",
305
+ " <th>1266863</th>\n",
306
+ " <td>796658</td>\n",
307
+ " <td>Cowboy</td>\n",
308
+ " <td>Having grown up in a home with an abusive fath...</td>\n",
309
+ " <td>/fhPUn11H9wU0cGWLpo6WMcMXb9a.jpg</td>\n",
310
+ " <td>NaN</td>\n",
311
+ " <td>NaN</td>\n",
312
+ " <td>Documentary, Animation</td>\n",
313
+ " <td>NaN</td>\n",
314
+ " </tr>\n",
315
+ " <tr>\n",
316
+ " <th>1266864</th>\n",
317
+ " <td>1525030</td>\n",
318
+ " <td>Detective Kibbles: Part 2 | Remastered</td>\n",
319
+ " <td>After a brutal attack in his own home, Detecti...</td>\n",
320
+ " <td>/74SKZslJiJrO1nK4LnFa7ztesxZ.jpg</td>\n",
321
+ " <td>/q436vhtfFMJzaqkSgiy9wDhZ3FI.jpg</td>\n",
322
+ " <td>Every hero must have an end.</td>\n",
323
+ " <td>Crime, Comedy, Drama</td>\n",
324
+ " <td>police, detective, drama, kibbles</td>\n",
325
+ " </tr>\n",
326
+ " </tbody>\n",
327
+ "</table>\n",
328
+ "<p>1266865 rows × 8 columns</p>\n",
329
+ "</div>"
330
+ ],
331
+ "text/plain": [
332
+ " id title \\\n",
333
+ "0 27205 Inception \n",
334
+ "1 157336 Interstellar \n",
335
+ "2 155 The Dark Knight \n",
336
+ "3 19995 Avatar \n",
337
+ "4 24428 The Avengers \n",
338
+ "... ... ... \n",
339
+ "1266860 796654 Numéro Complémentaire \n",
340
+ "1266861 796656 Heimat Tansania - Unter dem Kilimandscharo \n",
341
+ "1266862 796657 Car Crash: Who's Lying? \n",
342
+ "1266863 796658 Cowboy \n",
343
+ "1266864 1525030 Detective Kibbles: Part 2 | Remastered \n",
344
+ "\n",
345
+ " overview \\\n",
346
+ "0 Cobb, a skilled thief who commits corporate es... \n",
347
+ "1 The adventures of a group of explorers who mak... \n",
348
+ "2 Batman raises the stakes in his war on crime. ... \n",
349
+ "3 In the 22nd century, a paraplegic Marine is di... \n",
350
+ "4 When an unexpected enemy emerges and threatens... \n",
351
+ "... ... \n",
352
+ "1266860 NaN \n",
353
+ "1266861 NaN \n",
354
+ "1266862 Documentary exploring the aftermath of a car c... \n",
355
+ "1266863 Having grown up in a home with an abusive fath... \n",
356
+ "1266864 After a brutal attack in his own home, Detecti... \n",
357
+ "\n",
358
+ " poster_path backdrop_path \\\n",
359
+ "0 /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg \n",
360
+ "1 /gEU2QniE6E77NI6lCU6MxlNBvIx.jpg /pbrkL804c8yAv3zBZR4QPEafpAR.jpg \n",
361
+ "2 /qJ2tW6WMUDux911r6m7haRef0WH.jpg /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg \n",
362
+ "3 /kyeqWdyUXW608qlYkRqosgbbJyK.jpg /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg \n",
363
+ "4 /RYMX2wcKCBAr24UyPD7xwmjaTn.jpg /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg \n",
364
+ "... ... ... \n",
365
+ "1266860 /xR5oBqeGUHszPSuzhf6Xqb74UT2.jpg NaN \n",
366
+ "1266861 NaN NaN \n",
367
+ "1266862 /yatOGyk6OZKtfZQlRIBnHYwfSHD.jpg NaN \n",
368
+ "1266863 /fhPUn11H9wU0cGWLpo6WMcMXb9a.jpg NaN \n",
369
+ "1266864 /74SKZslJiJrO1nK4LnFa7ztesxZ.jpg /q436vhtfFMJzaqkSgiy9wDhZ3FI.jpg \n",
370
+ "\n",
371
+ " tagline \\\n",
372
+ "0 Your mind is the scene of the crime. \n",
373
+ "1 Mankind was born on Earth. It was never meant ... \n",
374
+ "2 Welcome to a world without rules. \n",
375
+ "3 Enter the world of Pandora. \n",
376
+ "4 Some assembly required. \n",
377
+ "... ... \n",
378
+ "1266860 NaN \n",
379
+ "1266861 NaN \n",
380
+ "1266862 NaN \n",
381
+ "1266863 NaN \n",
382
+ "1266864 Every hero must have an end. \n",
383
+ "\n",
384
+ " genres \\\n",
385
+ "0 Action, Science Fiction, Adventure \n",
386
+ "1 Adventure, Drama, Science Fiction \n",
387
+ "2 Drama, Action, Crime, Thriller \n",
388
+ "3 Action, Adventure, Fantasy, Science Fiction \n",
389
+ "4 Science Fiction, Action, Adventure \n",
390
+ "... ... \n",
391
+ "1266860 Comedy \n",
392
+ "1266861 Documentary \n",
393
+ "1266862 Documentary \n",
394
+ "1266863 Documentary, Animation \n",
395
+ "1266864 Crime, Comedy, Drama \n",
396
+ "\n",
397
+ " keywords \n",
398
+ "0 rescue, mission, dream, airplane, paris, franc... \n",
399
+ "1 rescue, future, spacecraft, race against time,... \n",
400
+ "2 joker, sadism, chaos, secret identity, crime f... \n",
401
+ "3 future, society, culture clash, space travel, ... \n",
402
+ "4 new york city, superhero, shield, based on com... \n",
403
+ "... ... \n",
404
+ "1266860 NaN \n",
405
+ "1266861 NaN \n",
406
+ "1266862 NaN \n",
407
+ "1266863 NaN \n",
408
+ "1266864 police, detective, drama, kibbles \n",
409
+ "\n",
410
+ "[1266865 rows x 8 columns]"
411
+ ]
412
+ },
413
+ "execution_count": 4,
414
+ "metadata": {},
415
+ "output_type": "execute_result"
416
+ }
417
+ ],
418
+ "source": [
419
+ "movies = movies[[\"id\",\"title\",\"overview\",\"poster_path\",\"backdrop_path\",\"tagline\",\"genres\",\"keywords\"]]\n",
420
+ "movies"
421
+ ]
422
+ },
423
+ {
424
+ "cell_type": "code",
425
+ "execution_count": 5,
426
+ "id": "4217a767",
427
+ "metadata": {},
428
+ "outputs": [],
429
+ "source": [
430
+ "import re\n",
431
+ "\n",
432
+ "\n",
433
+ "def convert_string_list(value):\n",
434
+ " value = re.sub(r\"[^\\w\\s]\", \"\", value)\n",
435
+ " return \",\".join(value.split())"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 6,
441
+ "id": "8be31676",
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "movies = movies[movies['overview'].notna()]"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": 7,
451
+ "id": "ea71ec24",
452
+ "metadata": {},
453
+ "outputs": [
454
+ {
455
+ "data": {
456
+ "text/plain": [
457
+ "0 Cobb, a skilled thief who commits corporate es...\n",
458
+ "1 The adventures of a group of explorers who mak...\n",
459
+ "2 Batman raises the stakes in his war on crime. ...\n",
460
+ "3 In the 22nd century, a paraplegic Marine is di...\n",
461
+ "4 When an unexpected enemy emerges and threatens...\n",
462
+ " ... \n",
463
+ "1266856 A warrior is transported 1,000 years into the ...\n",
464
+ "1266857 The utilization of numerical oscillation model...\n",
465
+ "1266862 Documentary exploring the aftermath of a car c...\n",
466
+ "1266863 Having grown up in a home with an abusive fath...\n",
467
+ "1266864 After a brutal attack in his own home, Detecti...\n",
468
+ "Name: overview, Length: 991491, dtype: object"
469
+ ]
470
+ },
471
+ "execution_count": 7,
472
+ "metadata": {},
473
+ "output_type": "execute_result"
474
+ }
475
+ ],
476
+ "source": [
477
+ "movies['overview']"
478
+ ]
479
+ },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": 8,
483
+ "id": "cc7e4bc4",
484
+ "metadata": {},
485
+ "outputs": [
486
+ {
487
+ "name": "stderr",
488
+ "output_type": "stream",
489
+ "text": [
490
+ "C:\\Users\\unkno\\AppData\\Local\\Temp\\ipykernel_5356\\1915643281.py:1: SettingWithCopyWarning: \n",
491
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
492
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
493
+ "\n",
494
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
495
+ " movies['overview']=movies['overview'].apply(convert_string_list)\n"
496
+ ]
497
+ }
498
+ ],
499
+ "source": [
500
+ "movies['overview']=movies['overview'].apply(convert_string_list)\n"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "code",
505
+ "execution_count": 9,
506
+ "id": "2428a165",
507
+ "metadata": {},
508
+ "outputs": [
509
+ {
510
+ "name": "stderr",
511
+ "output_type": "stream",
512
+ "text": [
513
+ "C:\\Users\\unkno\\AppData\\Local\\Temp\\ipykernel_5356\\42895326.py:1: SettingWithCopyWarning: \n",
514
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
515
+ "\n",
516
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
517
+ " movies.drop(columns=[\"backdrop_path\"], inplace=True)\n"
518
+ ]
519
+ }
520
+ ],
521
+ "source": [
522
+ "movies.drop(columns=[\"backdrop_path\"], inplace=True)"
523
+ ]
524
+ },
525
+ {
526
+ "cell_type": "code",
527
+ "execution_count": 10,
528
+ "id": "e7e4288e",
529
+ "metadata": {},
530
+ "outputs": [
531
+ {
532
+ "data": {
533
+ "text/html": [
534
+ "<div>\n",
535
+ "<style scoped>\n",
536
+ " .dataframe tbody tr th:only-of-type {\n",
537
+ " vertical-align: middle;\n",
538
+ " }\n",
539
+ "\n",
540
+ " .dataframe tbody tr th {\n",
541
+ " vertical-align: top;\n",
542
+ " }\n",
543
+ "\n",
544
+ " .dataframe thead th {\n",
545
+ " text-align: right;\n",
546
+ " }\n",
547
+ "</style>\n",
548
+ "<table border=\"1\" class=\"dataframe\">\n",
549
+ " <thead>\n",
550
+ " <tr style=\"text-align: right;\">\n",
551
+ " <th></th>\n",
552
+ " <th>id</th>\n",
553
+ " <th>title</th>\n",
554
+ " <th>overview</th>\n",
555
+ " <th>poster_path</th>\n",
556
+ " <th>tagline</th>\n",
557
+ " <th>genres</th>\n",
558
+ " <th>keywords</th>\n",
559
+ " </tr>\n",
560
+ " </thead>\n",
561
+ " <tbody>\n",
562
+ " <tr>\n",
563
+ " <th>0</th>\n",
564
+ " <td>27205</td>\n",
565
+ " <td>Inception</td>\n",
566
+ " <td>Cobb,a,skilled,thief,who,commits,corporate,esp...</td>\n",
567
+ " <td>/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg</td>\n",
568
+ " <td>Your mind is the scene of the crime.</td>\n",
569
+ " <td>Action, Science Fiction, Adventure</td>\n",
570
+ " <td>rescue, mission, dream, airplane, paris, franc...</td>\n",
571
+ " </tr>\n",
572
+ " <tr>\n",
573
+ " <th>1</th>\n",
574
+ " <td>157336</td>\n",
575
+ " <td>Interstellar</td>\n",
576
+ " <td>The,adventures,of,a,group,of,explorers,who,mak...</td>\n",
577
+ " <td>/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg</td>\n",
578
+ " <td>Mankind was born on Earth. It was never meant ...</td>\n",
579
+ " <td>Adventure, Drama, Science Fiction</td>\n",
580
+ " <td>rescue, future, spacecraft, race against time,...</td>\n",
581
+ " </tr>\n",
582
+ " <tr>\n",
583
+ " <th>2</th>\n",
584
+ " <td>155</td>\n",
585
+ " <td>The Dark Knight</td>\n",
586
+ " <td>Batman,raises,the,stakes,in,his,war,on,crime,W...</td>\n",
587
+ " <td>/qJ2tW6WMUDux911r6m7haRef0WH.jpg</td>\n",
588
+ " <td>Welcome to a world without rules.</td>\n",
589
+ " <td>Drama, Action, Crime, Thriller</td>\n",
590
+ " <td>joker, sadism, chaos, secret identity, crime f...</td>\n",
591
+ " </tr>\n",
592
+ " <tr>\n",
593
+ " <th>3</th>\n",
594
+ " <td>19995</td>\n",
595
+ " <td>Avatar</td>\n",
596
+ " <td>In,the,22nd,century,a,paraplegic,Marine,is,dis...</td>\n",
597
+ " <td>/kyeqWdyUXW608qlYkRqosgbbJyK.jpg</td>\n",
598
+ " <td>Enter the world of Pandora.</td>\n",
599
+ " <td>Action, Adventure, Fantasy, Science Fiction</td>\n",
600
+ " <td>future, society, culture clash, space travel, ...</td>\n",
601
+ " </tr>\n",
602
+ " <tr>\n",
603
+ " <th>4</th>\n",
604
+ " <td>24428</td>\n",
605
+ " <td>The Avengers</td>\n",
606
+ " <td>When,an,unexpected,enemy,emerges,and,threatens...</td>\n",
607
+ " <td>/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg</td>\n",
608
+ " <td>Some assembly required.</td>\n",
609
+ " <td>Science Fiction, Action, Adventure</td>\n",
610
+ " <td>new york city, superhero, shield, based on com...</td>\n",
611
+ " </tr>\n",
612
+ " <tr>\n",
613
+ " <th>...</th>\n",
614
+ " <td>...</td>\n",
615
+ " <td>...</td>\n",
616
+ " <td>...</td>\n",
617
+ " <td>...</td>\n",
618
+ " <td>...</td>\n",
619
+ " <td>...</td>\n",
620
+ " <td>...</td>\n",
621
+ " </tr>\n",
622
+ " <tr>\n",
623
+ " <th>1266856</th>\n",
624
+ " <td>796650</td>\n",
625
+ " <td>Survival of a Dragon</td>\n",
626
+ " <td>A,warrior,is,transported,1000,years,into,the,p...</td>\n",
627
+ " <td>/qX69cmtnyc9EfE3WN2UF3UI5Ta8.jpg</td>\n",
628
+ " <td>NaN</td>\n",
629
+ " <td>Adventure, Action, Fantasy</td>\n",
630
+ " <td>time travel</td>\n",
631
+ " </tr>\n",
632
+ " <tr>\n",
633
+ " <th>1266857</th>\n",
634
+ " <td>796651</td>\n",
635
+ " <td>A Model for the Motion of a Spring</td>\n",
636
+ " <td>The,utilization,of,numerical,oscillation,model...</td>\n",
637
+ " <td>/3s2vSewmxpdnzrWwPamkBbAM7mo.jpg</td>\n",
638
+ " <td>NaN</td>\n",
639
+ " <td>Fantasy</td>\n",
640
+ " <td>NaN</td>\n",
641
+ " </tr>\n",
642
+ " <tr>\n",
643
+ " <th>1266862</th>\n",
644
+ " <td>796657</td>\n",
645
+ " <td>Car Crash: Who's Lying?</td>\n",
646
+ " <td>Documentary,exploring,the,aftermath,of,a,car,c...</td>\n",
647
+ " <td>/yatOGyk6OZKtfZQlRIBnHYwfSHD.jpg</td>\n",
648
+ " <td>NaN</td>\n",
649
+ " <td>Documentary</td>\n",
650
+ " <td>NaN</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <th>1266863</th>\n",
654
+ " <td>796658</td>\n",
655
+ " <td>Cowboy</td>\n",
656
+ " <td>Having,grown,up,in,a,home,with,an,abusive,fath...</td>\n",
657
+ " <td>/fhPUn11H9wU0cGWLpo6WMcMXb9a.jpg</td>\n",
658
+ " <td>NaN</td>\n",
659
+ " <td>Documentary, Animation</td>\n",
660
+ " <td>NaN</td>\n",
661
+ " </tr>\n",
662
+ " <tr>\n",
663
+ " <th>1266864</th>\n",
664
+ " <td>1525030</td>\n",
665
+ " <td>Detective Kibbles: Part 2 | Remastered</td>\n",
666
+ " <td>After,a,brutal,attack,in,his,own,home,Detectiv...</td>\n",
667
+ " <td>/74SKZslJiJrO1nK4LnFa7ztesxZ.jpg</td>\n",
668
+ " <td>Every hero must have an end.</td>\n",
669
+ " <td>Crime, Comedy, Drama</td>\n",
670
+ " <td>police, detective, drama, kibbles</td>\n",
671
+ " </tr>\n",
672
+ " </tbody>\n",
673
+ "</table>\n",
674
+ "<p>991491 rows × 7 columns</p>\n",
675
+ "</div>"
676
+ ],
677
+ "text/plain": [
678
+ " id title \\\n",
679
+ "0 27205 Inception \n",
680
+ "1 157336 Interstellar \n",
681
+ "2 155 The Dark Knight \n",
682
+ "3 19995 Avatar \n",
683
+ "4 24428 The Avengers \n",
684
+ "... ... ... \n",
685
+ "1266856 796650 Survival of a Dragon \n",
686
+ "1266857 796651 A Model for the Motion of a Spring \n",
687
+ "1266862 796657 Car Crash: Who's Lying? \n",
688
+ "1266863 796658 Cowboy \n",
689
+ "1266864 1525030 Detective Kibbles: Part 2 | Remastered \n",
690
+ "\n",
691
+ " overview \\\n",
692
+ "0 Cobb,a,skilled,thief,who,commits,corporate,esp... \n",
693
+ "1 The,adventures,of,a,group,of,explorers,who,mak... \n",
694
+ "2 Batman,raises,the,stakes,in,his,war,on,crime,W... \n",
695
+ "3 In,the,22nd,century,a,paraplegic,Marine,is,dis... \n",
696
+ "4 When,an,unexpected,enemy,emerges,and,threatens... \n",
697
+ "... ... \n",
698
+ "1266856 A,warrior,is,transported,1000,years,into,the,p... \n",
699
+ "1266857 The,utilization,of,numerical,oscillation,model... \n",
700
+ "1266862 Documentary,exploring,the,aftermath,of,a,car,c... \n",
701
+ "1266863 Having,grown,up,in,a,home,with,an,abusive,fath... \n",
702
+ "1266864 After,a,brutal,attack,in,his,own,home,Detectiv... \n",
703
+ "\n",
704
+ " poster_path \\\n",
705
+ "0 /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg \n",
706
+ "1 /gEU2QniE6E77NI6lCU6MxlNBvIx.jpg \n",
707
+ "2 /qJ2tW6WMUDux911r6m7haRef0WH.jpg \n",
708
+ "3 /kyeqWdyUXW608qlYkRqosgbbJyK.jpg \n",
709
+ "4 /RYMX2wcKCBAr24UyPD7xwmjaTn.jpg \n",
710
+ "... ... \n",
711
+ "1266856 /qX69cmtnyc9EfE3WN2UF3UI5Ta8.jpg \n",
712
+ "1266857 /3s2vSewmxpdnzrWwPamkBbAM7mo.jpg \n",
713
+ "1266862 /yatOGyk6OZKtfZQlRIBnHYwfSHD.jpg \n",
714
+ "1266863 /fhPUn11H9wU0cGWLpo6WMcMXb9a.jpg \n",
715
+ "1266864 /74SKZslJiJrO1nK4LnFa7ztesxZ.jpg \n",
716
+ "\n",
717
+ " tagline \\\n",
718
+ "0 Your mind is the scene of the crime. \n",
719
+ "1 Mankind was born on Earth. It was never meant ... \n",
720
+ "2 Welcome to a world without rules. \n",
721
+ "3 Enter the world of Pandora. \n",
722
+ "4 Some assembly required. \n",
723
+ "... ... \n",
724
+ "1266856 NaN \n",
725
+ "1266857 NaN \n",
726
+ "1266862 NaN \n",
727
+ "1266863 NaN \n",
728
+ "1266864 Every hero must have an end. \n",
729
+ "\n",
730
+ " genres \\\n",
731
+ "0 Action, Science Fiction, Adventure \n",
732
+ "1 Adventure, Drama, Science Fiction \n",
733
+ "2 Drama, Action, Crime, Thriller \n",
734
+ "3 Action, Adventure, Fantasy, Science Fiction \n",
735
+ "4 Science Fiction, Action, Adventure \n",
736
+ "... ... \n",
737
+ "1266856 Adventure, Action, Fantasy \n",
738
+ "1266857 Fantasy \n",
739
+ "1266862 Documentary \n",
740
+ "1266863 Documentary, Animation \n",
741
+ "1266864 Crime, Comedy, Drama \n",
742
+ "\n",
743
+ " keywords \n",
744
+ "0 rescue, mission, dream, airplane, paris, franc... \n",
745
+ "1 rescue, future, spacecraft, race against time,... \n",
746
+ "2 joker, sadism, chaos, secret identity, crime f... \n",
747
+ "3 future, society, culture clash, space travel, ... \n",
748
+ "4 new york city, superhero, shield, based on com... \n",
749
+ "... ... \n",
750
+ "1266856 time travel \n",
751
+ "1266857 NaN \n",
752
+ "1266862 NaN \n",
753
+ "1266863 NaN \n",
754
+ "1266864 police, detective, drama, kibbles \n",
755
+ "\n",
756
+ "[991491 rows x 7 columns]"
757
+ ]
758
+ },
759
+ "execution_count": 10,
760
+ "metadata": {},
761
+ "output_type": "execute_result"
762
+ }
763
+ ],
764
+ "source": [
765
+ "movies"
766
+ ]
767
+ },
768
+ {
769
+ "cell_type": "code",
770
+ "execution_count": 11,
771
+ "id": "d9704c70",
772
+ "metadata": {},
773
+ "outputs": [
774
+ {
775
+ "name": "stderr",
776
+ "output_type": "stream",
777
+ "text": [
778
+ "C:\\Users\\unkno\\AppData\\Local\\Temp\\ipykernel_5356\\1367133233.py:1: SettingWithCopyWarning: \n",
779
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
780
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
781
+ "\n",
782
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
783
+ " movies['tagline'] = movies['tagline'].fillna(\"\")\n",
784
+ "C:\\Users\\unkno\\AppData\\Local\\Temp\\ipykernel_5356\\1367133233.py:2: SettingWithCopyWarning: \n",
785
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
786
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
787
+ "\n",
788
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
789
+ " movies['keywords'] = movies['keywords'].fillna(\"\")\n"
790
+ ]
791
+ }
792
+ ],
793
+ "source": [
794
+ "movies['tagline'] = movies['tagline'].fillna(\"\") \n",
795
+ "movies['keywords'] = movies['keywords'].fillna(\"\") "
796
+ ]
797
+ },
798
+ {
799
+ "cell_type": "code",
800
+ "execution_count": 12,
801
+ "id": "c4b3977b",
802
+ "metadata": {},
803
+ "outputs": [
804
+ {
805
+ "data": {
806
+ "text/html": [
807
+ "<div>\n",
808
+ "<style scoped>\n",
809
+ " .dataframe tbody tr th:only-of-type {\n",
810
+ " vertical-align: middle;\n",
811
+ " }\n",
812
+ "\n",
813
+ " .dataframe tbody tr th {\n",
814
+ " vertical-align: top;\n",
815
+ " }\n",
816
+ "\n",
817
+ " .dataframe thead th {\n",
818
+ " text-align: right;\n",
819
+ " }\n",
820
+ "</style>\n",
821
+ "<table border=\"1\" class=\"dataframe\">\n",
822
+ " <thead>\n",
823
+ " <tr style=\"text-align: right;\">\n",
824
+ " <th></th>\n",
825
+ " <th>id</th>\n",
826
+ " <th>title</th>\n",
827
+ " <th>overview</th>\n",
828
+ " <th>poster_path</th>\n",
829
+ " <th>tagline</th>\n",
830
+ " <th>genres</th>\n",
831
+ " <th>keywords</th>\n",
832
+ " </tr>\n",
833
+ " </thead>\n",
834
+ " <tbody>\n",
835
+ " <tr>\n",
836
+ " <th>0</th>\n",
837
+ " <td>27205</td>\n",
838
+ " <td>Inception</td>\n",
839
+ " <td>Cobb,a,skilled,thief,who,commits,corporate,esp...</td>\n",
840
+ " <td>/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg</td>\n",
841
+ " <td>Your mind is the scene of the crime.</td>\n",
842
+ " <td>Action, Science Fiction, Adventure</td>\n",
843
+ " <td>rescue, mission, dream, airplane, paris, franc...</td>\n",
844
+ " </tr>\n",
845
+ " <tr>\n",
846
+ " <th>1</th>\n",
847
+ " <td>157336</td>\n",
848
+ " <td>Interstellar</td>\n",
849
+ " <td>The,adventures,of,a,group,of,explorers,who,mak...</td>\n",
850
+ " <td>/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg</td>\n",
851
+ " <td>Mankind was born on Earth. It was never meant ...</td>\n",
852
+ " <td>Adventure, Drama, Science Fiction</td>\n",
853
+ " <td>rescue, future, spacecraft, race against time,...</td>\n",
854
+ " </tr>\n",
855
+ " <tr>\n",
856
+ " <th>2</th>\n",
857
+ " <td>155</td>\n",
858
+ " <td>The Dark Knight</td>\n",
859
+ " <td>Batman,raises,the,stakes,in,his,war,on,crime,W...</td>\n",
860
+ " <td>/qJ2tW6WMUDux911r6m7haRef0WH.jpg</td>\n",
861
+ " <td>Welcome to a world without rules.</td>\n",
862
+ " <td>Drama, Action, Crime, Thriller</td>\n",
863
+ " <td>joker, sadism, chaos, secret identity, crime f...</td>\n",
864
+ " </tr>\n",
865
+ " <tr>\n",
866
+ " <th>3</th>\n",
867
+ " <td>19995</td>\n",
868
+ " <td>Avatar</td>\n",
869
+ " <td>In,the,22nd,century,a,paraplegic,Marine,is,dis...</td>\n",
870
+ " <td>/kyeqWdyUXW608qlYkRqosgbbJyK.jpg</td>\n",
871
+ " <td>Enter the world of Pandora.</td>\n",
872
+ " <td>Action, Adventure, Fantasy, Science Fiction</td>\n",
873
+ " <td>future, society, culture clash, space travel, ...</td>\n",
874
+ " </tr>\n",
875
+ " <tr>\n",
876
+ " <th>4</th>\n",
877
+ " <td>24428</td>\n",
878
+ " <td>The Avengers</td>\n",
879
+ " <td>When,an,unexpected,enemy,emerges,and,threatens...</td>\n",
880
+ " <td>/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg</td>\n",
881
+ " <td>Some assembly required.</td>\n",
882
+ " <td>Science Fiction, Action, Adventure</td>\n",
883
+ " <td>new york city, superhero, shield, based on com...</td>\n",
884
+ " </tr>\n",
885
+ " <tr>\n",
886
+ " <th>...</th>\n",
887
+ " <td>...</td>\n",
888
+ " <td>...</td>\n",
889
+ " <td>...</td>\n",
890
+ " <td>...</td>\n",
891
+ " <td>...</td>\n",
892
+ " <td>...</td>\n",
893
+ " <td>...</td>\n",
894
+ " </tr>\n",
895
+ " <tr>\n",
896
+ " <th>1266856</th>\n",
897
+ " <td>796650</td>\n",
898
+ " <td>Survival of a Dragon</td>\n",
899
+ " <td>A,warrior,is,transported,1000,years,into,the,p...</td>\n",
900
+ " <td>/qX69cmtnyc9EfE3WN2UF3UI5Ta8.jpg</td>\n",
901
+ " <td></td>\n",
902
+ " <td>Adventure, Action, Fantasy</td>\n",
903
+ " <td>time travel</td>\n",
904
+ " </tr>\n",
905
+ " <tr>\n",
906
+ " <th>1266857</th>\n",
907
+ " <td>796651</td>\n",
908
+ " <td>A Model for the Motion of a Spring</td>\n",
909
+ " <td>The,utilization,of,numerical,oscillation,model...</td>\n",
910
+ " <td>/3s2vSewmxpdnzrWwPamkBbAM7mo.jpg</td>\n",
911
+ " <td></td>\n",
912
+ " <td>Fantasy</td>\n",
913
+ " <td></td>\n",
914
+ " </tr>\n",
915
+ " <tr>\n",
916
+ " <th>1266862</th>\n",
917
+ " <td>796657</td>\n",
918
+ " <td>Car Crash: Who's Lying?</td>\n",
919
+ " <td>Documentary,exploring,the,aftermath,of,a,car,c...</td>\n",
920
+ " <td>/yatOGyk6OZKtfZQlRIBnHYwfSHD.jpg</td>\n",
921
+ " <td></td>\n",
922
+ " <td>Documentary</td>\n",
923
+ " <td></td>\n",
924
+ " </tr>\n",
925
+ " <tr>\n",
926
+ " <th>1266863</th>\n",
927
+ " <td>796658</td>\n",
928
+ " <td>Cowboy</td>\n",
929
+ " <td>Having,grown,up,in,a,home,with,an,abusive,fath...</td>\n",
930
+ " <td>/fhPUn11H9wU0cGWLpo6WMcMXb9a.jpg</td>\n",
931
+ " <td></td>\n",
932
+ " <td>Documentary, Animation</td>\n",
933
+ " <td></td>\n",
934
+ " </tr>\n",
935
+ " <tr>\n",
936
+ " <th>1266864</th>\n",
937
+ " <td>1525030</td>\n",
938
+ " <td>Detective Kibbles: Part 2 | Remastered</td>\n",
939
+ " <td>After,a,brutal,attack,in,his,own,home,Detectiv...</td>\n",
940
+ " <td>/74SKZslJiJrO1nK4LnFa7ztesxZ.jpg</td>\n",
941
+ " <td>Every hero must have an end.</td>\n",
942
+ " <td>Crime, Comedy, Drama</td>\n",
943
+ " <td>police, detective, drama, kibbles</td>\n",
944
+ " </tr>\n",
945
+ " </tbody>\n",
946
+ "</table>\n",
947
+ "<p>991491 rows × 7 columns</p>\n",
948
+ "</div>"
949
+ ],
950
+ "text/plain": [
951
+ " id title \\\n",
952
+ "0 27205 Inception \n",
953
+ "1 157336 Interstellar \n",
954
+ "2 155 The Dark Knight \n",
955
+ "3 19995 Avatar \n",
956
+ "4 24428 The Avengers \n",
957
+ "... ... ... \n",
958
+ "1266856 796650 Survival of a Dragon \n",
959
+ "1266857 796651 A Model for the Motion of a Spring \n",
960
+ "1266862 796657 Car Crash: Who's Lying? \n",
961
+ "1266863 796658 Cowboy \n",
962
+ "1266864 1525030 Detective Kibbles: Part 2 | Remastered \n",
963
+ "\n",
964
+ " overview \\\n",
965
+ "0 Cobb,a,skilled,thief,who,commits,corporate,esp... \n",
966
+ "1 The,adventures,of,a,group,of,explorers,who,mak... \n",
967
+ "2 Batman,raises,the,stakes,in,his,war,on,crime,W... \n",
968
+ "3 In,the,22nd,century,a,paraplegic,Marine,is,dis... \n",
969
+ "4 When,an,unexpected,enemy,emerges,and,threatens... \n",
970
+ "... ... \n",
971
+ "1266856 A,warrior,is,transported,1000,years,into,the,p... \n",
972
+ "1266857 The,utilization,of,numerical,oscillation,model... \n",
973
+ "1266862 Documentary,exploring,the,aftermath,of,a,car,c... \n",
974
+ "1266863 Having,grown,up,in,a,home,with,an,abusive,fath... \n",
975
+ "1266864 After,a,brutal,attack,in,his,own,home,Detectiv... \n",
976
+ "\n",
977
+ " poster_path \\\n",
978
+ "0 /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg \n",
979
+ "1 /gEU2QniE6E77NI6lCU6MxlNBvIx.jpg \n",
980
+ "2 /qJ2tW6WMUDux911r6m7haRef0WH.jpg \n",
981
+ "3 /kyeqWdyUXW608qlYkRqosgbbJyK.jpg \n",
982
+ "4 /RYMX2wcKCBAr24UyPD7xwmjaTn.jpg \n",
983
+ "... ... \n",
984
+ "1266856 /qX69cmtnyc9EfE3WN2UF3UI5Ta8.jpg \n",
985
+ "1266857 /3s2vSewmxpdnzrWwPamkBbAM7mo.jpg \n",
986
+ "1266862 /yatOGyk6OZKtfZQlRIBnHYwfSHD.jpg \n",
987
+ "1266863 /fhPUn11H9wU0cGWLpo6WMcMXb9a.jpg \n",
988
+ "1266864 /74SKZslJiJrO1nK4LnFa7ztesxZ.jpg \n",
989
+ "\n",
990
+ " tagline \\\n",
991
+ "0 Your mind is the scene of the crime. \n",
992
+ "1 Mankind was born on Earth. It was never meant ... \n",
993
+ "2 Welcome to a world without rules. \n",
994
+ "3 Enter the world of Pandora. \n",
995
+ "4 Some assembly required. \n",
996
+ "... ... \n",
997
+ "1266856 \n",
998
+ "1266857 \n",
999
+ "1266862 \n",
1000
+ "1266863 \n",
1001
+ "1266864 Every hero must have an end. \n",
1002
+ "\n",
1003
+ " genres \\\n",
1004
+ "0 Action, Science Fiction, Adventure \n",
1005
+ "1 Adventure, Drama, Science Fiction \n",
1006
+ "2 Drama, Action, Crime, Thriller \n",
1007
+ "3 Action, Adventure, Fantasy, Science Fiction \n",
1008
+ "4 Science Fiction, Action, Adventure \n",
1009
+ "... ... \n",
1010
+ "1266856 Adventure, Action, Fantasy \n",
1011
+ "1266857 Fantasy \n",
1012
+ "1266862 Documentary \n",
1013
+ "1266863 Documentary, Animation \n",
1014
+ "1266864 Crime, Comedy, Drama \n",
1015
+ "\n",
1016
+ " keywords \n",
1017
+ "0 rescue, mission, dream, airplane, paris, franc... \n",
1018
+ "1 rescue, future, spacecraft, race against time,... \n",
1019
+ "2 joker, sadism, chaos, secret identity, crime f... \n",
1020
+ "3 future, society, culture clash, space travel, ... \n",
1021
+ "4 new york city, superhero, shield, based on com... \n",
1022
+ "... ... \n",
1023
+ "1266856 time travel \n",
1024
+ "1266857 \n",
1025
+ "1266862 \n",
1026
+ "1266863 \n",
1027
+ "1266864 police, detective, drama, kibbles \n",
1028
+ "\n",
1029
+ "[991491 rows x 7 columns]"
1030
+ ]
1031
+ },
1032
+ "execution_count": 12,
1033
+ "metadata": {},
1034
+ "output_type": "execute_result"
1035
+ }
1036
+ ],
1037
+ "source": [
1038
+ "movies"
1039
+ ]
1040
+ },
1041
+ {
1042
+ "cell_type": "code",
1043
+ "execution_count": 13,
1044
+ "id": "d68c1ec7",
1045
+ "metadata": {},
1046
+ "outputs": [
1047
+ {
1048
+ "name": "stderr",
1049
+ "output_type": "stream",
1050
+ "text": [
1051
+ "C:\\Users\\unkno\\AppData\\Local\\Temp\\ipykernel_5356\\3940781081.py:1: SettingWithCopyWarning: \n",
1052
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1053
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
1054
+ "\n",
1055
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
1056
+ " movies['encode_list'] = (\n"
1057
+ ]
1058
+ }
1059
+ ],
1060
+ "source": [
1061
+ "movies['encode_list'] = (\n",
1062
+ " movies['overview'].fillna('') + \" \" +\n",
1063
+ " movies['genres'].fillna('') + \" \" +\n",
1064
+ " movies['keywords'].fillna('')\n",
1065
+ ")\n"
1066
+ ]
1067
+ },
1068
+ {
1069
+ "cell_type": "code",
1070
+ "execution_count": 14,
1071
+ "id": "fa8d3290",
1072
+ "metadata": {},
1073
+ "outputs": [
1074
+ {
1075
+ "data": {
1076
+ "text/plain": [
1077
+ "0 Cobb,a,skilled,thief,who,commits,corporate,esp...\n",
1078
+ "1 The,adventures,of,a,group,of,explorers,who,mak...\n",
1079
+ "2 Batman,raises,the,stakes,in,his,war,on,crime,W...\n",
1080
+ "3 In,the,22nd,century,a,paraplegic,Marine,is,dis...\n",
1081
+ "4 When,an,unexpected,enemy,emerges,and,threatens...\n",
1082
+ " ... \n",
1083
+ "1266856 A,warrior,is,transported,1000,years,into,the,p...\n",
1084
+ "1266857 The,utilization,of,numerical,oscillation,model...\n",
1085
+ "1266862 Documentary,exploring,the,aftermath,of,a,car,c...\n",
1086
+ "1266863 Having,grown,up,in,a,home,with,an,abusive,fath...\n",
1087
+ "1266864 After,a,brutal,attack,in,his,own,home,Detectiv...\n",
1088
+ "Name: encode_list, Length: 991491, dtype: object"
1089
+ ]
1090
+ },
1091
+ "execution_count": 14,
1092
+ "metadata": {},
1093
+ "output_type": "execute_result"
1094
+ }
1095
+ ],
1096
+ "source": [
1097
+ "movies['encode_list']"
1098
+ ]
1099
+ },
1100
+ {
1101
+ "cell_type": "code",
1102
+ "execution_count": 15,
1103
+ "id": "3c2cc44f",
1104
+ "metadata": {},
1105
+ "outputs": [
1106
+ {
1107
+ "data": {
1108
+ "text/plain": [
1109
+ "0 Cobb,a,skilled,thief,who,commits,corporate,esp...\n",
1110
+ "Name: encode_list, dtype: object"
1111
+ ]
1112
+ },
1113
+ "execution_count": 15,
1114
+ "metadata": {},
1115
+ "output_type": "execute_result"
1116
+ }
1117
+ ],
1118
+ "source": [
1119
+ "movies['encode_list'].head(1)"
1120
+ ]
1121
+ },
1122
+ {
1123
+ "cell_type": "code",
1124
+ "execution_count": 16,
1125
+ "id": "5685e307",
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "name": "stdout",
1130
+ "output_type": "stream",
1131
+ "text": [
1132
+ "Defaulting to user installation because normal site-packages is not writeable\n",
1133
+ "Requirement already satisfied: nltk in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (3.9.1)\n",
1134
+ "Requirement already satisfied: click in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from nltk) (8.3.0)\n",
1135
+ "Requirement already satisfied: joblib in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from nltk) (1.5.2)\n",
1136
+ "Requirement already satisfied: regex>=2021.8.3 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from nltk) (2024.11.6)\n",
1137
+ "Requirement already satisfied: tqdm in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from nltk) (4.67.1)\n",
1138
+ "Requirement already satisfied: colorama in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from click->nltk) (0.4.6)\n"
1139
+ ]
1140
+ },
1141
+ {
1142
+ "name": "stderr",
1143
+ "output_type": "stream",
1144
+ "text": [
1145
+ "\n",
1146
+ "[notice] A new release of pip is available: 25.0.1 -> 25.2\n",
1147
+ "[notice] To update, run: C:\\Users\\unkno\\AppData\\Local\\Microsoft\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\python.exe -m pip install --upgrade pip\n"
1148
+ ]
1149
+ }
1150
+ ],
1151
+ "source": [
1152
+ "!pip install nltk"
1153
+ ]
1154
+ },
1155
+ {
1156
+ "cell_type": "code",
1157
+ "execution_count": 17,
1158
+ "id": "e7182aa4",
1159
+ "metadata": {},
1160
+ "outputs": [
1161
+ {
1162
+ "name": "stderr",
1163
+ "output_type": "stream",
1164
+ "text": [
1165
+ "C:\\Users\\unkno\\AppData\\Local\\Temp\\ipykernel_5356\\3196361095.py:9: SettingWithCopyWarning: \n",
1166
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1167
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
1168
+ "\n",
1169
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
1170
+ " movies['encode_list'] = movies['encode_list'].map(stem)\n"
1171
+ ]
1172
+ }
1173
+ ],
1174
+ "source": [
1175
+ "from nltk.stem.porter import PorterStemmer\n",
1176
+ "\n",
1177
+ "ps = PorterStemmer()\n",
1178
+ "\n",
1179
+ "def stem(text):\n",
1180
+ " return \" \".join(ps.stem(word) for word in text.split(\",\"))\n",
1181
+ " \n",
1182
+ "# Use 'map' instead of 'apply' for faster operation\n",
1183
+ "movies['encode_list'] = movies['encode_list'].map(stem)"
1184
+ ]
1185
+ },
1186
+ {
1187
+ "cell_type": "code",
1188
+ "execution_count": 18,
1189
+ "id": "eeca71ce",
1190
+ "metadata": {},
1191
+ "outputs": [],
1192
+ "source": [
1193
+ "new_movies = movies[['id', 'title', 'poster_path', 'encode_list']]"
1194
+ ]
1195
+ },
1196
+ {
1197
+ "cell_type": "code",
1198
+ "execution_count": 19,
1199
+ "id": "82157fde",
1200
+ "metadata": {},
1201
+ "outputs": [
1202
+ {
1203
+ "data": {
1204
+ "text/html": [
1205
+ "<div>\n",
1206
+ "<style scoped>\n",
1207
+ " .dataframe tbody tr th:only-of-type {\n",
1208
+ " vertical-align: middle;\n",
1209
+ " }\n",
1210
+ "\n",
1211
+ " .dataframe tbody tr th {\n",
1212
+ " vertical-align: top;\n",
1213
+ " }\n",
1214
+ "\n",
1215
+ " .dataframe thead th {\n",
1216
+ " text-align: right;\n",
1217
+ " }\n",
1218
+ "</style>\n",
1219
+ "<table border=\"1\" class=\"dataframe\">\n",
1220
+ " <thead>\n",
1221
+ " <tr style=\"text-align: right;\">\n",
1222
+ " <th></th>\n",
1223
+ " <th>id</th>\n",
1224
+ " <th>title</th>\n",
1225
+ " <th>poster_path</th>\n",
1226
+ " <th>encode_list</th>\n",
1227
+ " </tr>\n",
1228
+ " </thead>\n",
1229
+ " <tbody>\n",
1230
+ " <tr>\n",
1231
+ " <th>0</th>\n",
1232
+ " <td>27205</td>\n",
1233
+ " <td>Inception</td>\n",
1234
+ " <td>/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg</td>\n",
1235
+ " <td>cobb a skill thief who commit corpor espionag ...</td>\n",
1236
+ " </tr>\n",
1237
+ " <tr>\n",
1238
+ " <th>1</th>\n",
1239
+ " <td>157336</td>\n",
1240
+ " <td>Interstellar</td>\n",
1241
+ " <td>/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg</td>\n",
1242
+ " <td>the adventur of a group of explor who make use...</td>\n",
1243
+ " </tr>\n",
1244
+ " <tr>\n",
1245
+ " <th>2</th>\n",
1246
+ " <td>155</td>\n",
1247
+ " <td>The Dark Knight</td>\n",
1248
+ " <td>/qJ2tW6WMUDux911r6m7haRef0WH.jpg</td>\n",
1249
+ " <td>batman rais the stake in hi war on crime with ...</td>\n",
1250
+ " </tr>\n",
1251
+ " <tr>\n",
1252
+ " <th>3</th>\n",
1253
+ " <td>19995</td>\n",
1254
+ " <td>Avatar</td>\n",
1255
+ " <td>/kyeqWdyUXW608qlYkRqosgbbJyK.jpg</td>\n",
1256
+ " <td>in the 22nd centuri a parapleg marin is dispat...</td>\n",
1257
+ " </tr>\n",
1258
+ " <tr>\n",
1259
+ " <th>4</th>\n",
1260
+ " <td>24428</td>\n",
1261
+ " <td>The Avengers</td>\n",
1262
+ " <td>/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg</td>\n",
1263
+ " <td>when an unexpect enemi emerg and threaten glob...</td>\n",
1264
+ " </tr>\n",
1265
+ " </tbody>\n",
1266
+ "</table>\n",
1267
+ "</div>"
1268
+ ],
1269
+ "text/plain": [
1270
+ " id title poster_path \\\n",
1271
+ "0 27205 Inception /oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg \n",
1272
+ "1 157336 Interstellar /gEU2QniE6E77NI6lCU6MxlNBvIx.jpg \n",
1273
+ "2 155 The Dark Knight /qJ2tW6WMUDux911r6m7haRef0WH.jpg \n",
1274
+ "3 19995 Avatar /kyeqWdyUXW608qlYkRqosgbbJyK.jpg \n",
1275
+ "4 24428 The Avengers /RYMX2wcKCBAr24UyPD7xwmjaTn.jpg \n",
1276
+ "\n",
1277
+ " encode_list \n",
1278
+ "0 cobb a skill thief who commit corpor espionag ... \n",
1279
+ "1 the adventur of a group of explor who make use... \n",
1280
+ "2 batman rais the stake in hi war on crime with ... \n",
1281
+ "3 in the 22nd centuri a parapleg marin is dispat... \n",
1282
+ "4 when an unexpect enemi emerg and threaten glob... "
1283
+ ]
1284
+ },
1285
+ "execution_count": 19,
1286
+ "metadata": {},
1287
+ "output_type": "execute_result"
1288
+ }
1289
+ ],
1290
+ "source": [
1291
+ "new_movies.head(5)"
1292
+ ]
1293
+ },
1294
+ {
1295
+ "cell_type": "code",
1296
+ "execution_count": 20,
1297
+ "id": "e15ed18b",
1298
+ "metadata": {},
1299
+ "outputs": [
1300
+ {
1301
+ "name": "stdout",
1302
+ "output_type": "stream",
1303
+ "text": [
1304
+ "Defaulting to user installation because normal site-packages is not writeable\n",
1305
+ "Requirement already satisfied: scikit-learn in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (1.7.2)\n",
1306
+ "Requirement already satisfied: numpy>=1.22.0 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from scikit-learn) (2.2.4)\n",
1307
+ "Requirement already satisfied: scipy>=1.8.0 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from scikit-learn) (1.16.2)\n",
1308
+ "Requirement already satisfied: joblib>=1.2.0 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from scikit-learn) (1.5.2)\n",
1309
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (from scikit-learn) (3.6.0)\n"
1310
+ ]
1311
+ },
1312
+ {
1313
+ "name": "stderr",
1314
+ "output_type": "stream",
1315
+ "text": [
1316
+ "\n",
1317
+ "[notice] A new release of pip is available: 25.0.1 -> 25.2\n",
1318
+ "[notice] To update, run: C:\\Users\\unkno\\AppData\\Local\\Microsoft\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\python.exe -m pip install --upgrade pip\n"
1319
+ ]
1320
+ }
1321
+ ],
1322
+ "source": [
1323
+ "!pip install scikit-learn\n",
1324
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
1325
+ "vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')\n"
1326
+ ]
1327
+ },
1328
+ {
1329
+ "cell_type": "code",
1330
+ "execution_count": 21,
1331
+ "id": "634f446d",
1332
+ "metadata": {},
1333
+ "outputs": [],
1334
+ "source": [
1335
+ "features = vectorizer.fit_transform(new_movies['encode_list']).toarray()"
1336
+ ]
1337
+ },
1338
+ {
1339
+ "cell_type": "code",
1340
+ "execution_count": 22,
1341
+ "id": "d1f0fdac",
1342
+ "metadata": {},
1343
+ "outputs": [
1344
+ {
1345
+ "data": {
1346
+ "text/plain": [
1347
+ "array([[0., 0., 0., ..., 0., 0., 0.],\n",
1348
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
1349
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
1350
+ " ...,\n",
1351
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
1352
+ " [0., 0., 0., ..., 0., 0., 0.],\n",
1353
+ " [0., 0., 0., ..., 0., 0., 0.]], shape=(991491, 1000))"
1354
+ ]
1355
+ },
1356
+ "execution_count": 22,
1357
+ "metadata": {},
1358
+ "output_type": "execute_result"
1359
+ }
1360
+ ],
1361
+ "source": [
1362
+ "features"
1363
+ ]
1364
+ },
1365
+ {
1366
+ "cell_type": "code",
1367
+ "execution_count": 23,
1368
+ "id": "1ca6a500",
1369
+ "metadata": {},
1370
+ "outputs": [],
1371
+ "source": [
1372
+ "# Skip FAISS - use sklearn cosine similarity, one query row against the full matrix\n",
1373
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1374
+ "\n",
1375
+ "# features is already a dense numpy array - only cast it to float32\n",
1376
+ "feature_array = features.astype('float32')\n",
1377
+ "\n",
1378
+ "def get_movie_title(movie_id):\n",
1379
+ " return new_movies.iloc[movie_id]['title']\n",
1380
+ "\n",
1381
+ "def recommend(movie_id, top_k=5):\n",
1382
+ " # Get the feature vector for the input movie\n",
1383
+ " movie_vector = feature_array[movie_id].reshape(1, -1)\n",
1384
+ " \n",
1385
+ " # Calculate similarity only for this movie vs all others\n",
1386
+ " similarity_scores = cosine_similarity(movie_vector, feature_array).flatten()\n",
1387
+ " \n",
1388
+ " # Get the top_k most similar movies (drops the single highest score, assumed to be the movie itself)\n",
1389
+ " similar_indices = similarity_scores.argsort()[-top_k-1:-1][::-1]\n",
1390
+ " \n",
1391
+ " return similar_indices\n"
1392
+ ]
1393
+ },
1394
+ {
1395
+ "cell_type": "code",
1396
+ "execution_count": 31,
1397
+ "id": "c425f84f",
1398
+ "metadata": {},
1399
+ "outputs": [
1400
+ {
1401
+ "name": "stdout",
1402
+ "output_type": "stream",
1403
+ "text": [
1404
+ "Interstellar\n",
1405
+ "Time and Space\n",
1406
+ "A Monster From Space\n",
1407
+ "Budhayaan\n",
1408
+ "Manhunt in Space\n",
1409
+ "Journey to Space\n"
1410
+ ]
1411
+ }
1412
+ ],
1413
+ "source": [
1414
+ "print(get_movie_title(1))\n",
1415
+ "\n",
1416
+ "recommended_movies = recommend(1) # Get recommendations for the movie at index 1\n",
1417
+ "for mv in recommended_movies:\n",
1418
+ " print(get_movie_title(mv))"
1419
+ ]
1420
+ },
1421
+ {
1422
+ "cell_type": "code",
1423
+ "execution_count": 26,
1424
+ "id": "65432cd8",
1425
+ "metadata": {},
1426
+ "outputs": [],
1427
+ "source": [
1428
+ "import numpy as np\n",
1429
+ "import pandas as pd\n",
1430
+ "\n",
1431
+ "# Save compressed numpy array\n",
1432
+ "np.savez_compressed('feature_array.npz', feature_array)\n",
1433
+ "\n"
1434
+ ]
1435
+ },
1436
+ {
1437
+ "cell_type": "code",
1438
+ "execution_count": 28,
1439
+ "id": "2139293d",
1440
+ "metadata": {},
1441
+ "outputs": [],
1442
+ "source": [
1443
+ "import pandas as pd\n",
1444
+ "\n",
1445
+ "# Remove any extension type columns\n",
1446
+ "safe_new_movies = new_movies.select_dtypes(include=['int64', 'float64', 'object']).copy()\n",
1447
+ "\n",
1448
+ "# Convert all columns to string as a fallback if extension issues persist\n",
1449
+ "for col in safe_new_movies.columns:\n",
1450
+ " safe_new_movies[col] = safe_new_movies[col].astype(str)\n",
1451
+ "\n",
1452
+ "safe_new_movies.reset_index(drop=True, inplace=True)\n",
1453
+ "\n",
1454
+ "# Now save as CSV\n",
1455
+ "safe_new_movies.to_csv('movies_df.csv', index=False)\n"
1456
+ ]
1457
+ },
1458
+ {
1459
+ "cell_type": "code",
1460
+ "execution_count": 33,
1461
+ "id": "d0eb30ec",
1462
+ "metadata": {},
1463
+ "outputs": [],
1464
+ "source": [
1465
+ "import numpy as np\n",
1466
+ "import pandas as pd\n",
1467
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
1468
+ "\n",
1469
+ "# Load compressed numpy feature array and movie data\n",
1470
+ "feature_array = np.load('feature_array.npz')['arr_0']\n",
1471
+ "new_movies = pd.read_csv('movies_df.csv')\n",
1472
+ "\n",
1473
+ "def get_movie_title(movie_id):\n",
1474
+ " return new_movies.iloc[movie_id]['title']\n",
1475
+ "\n",
1476
+ "def recommend(movie_id, top_k=5):\n",
1477
+ " movie_vector = feature_array[movie_id].reshape(1, -1)\n",
1478
+ " similarity_scores = cosine_similarity(movie_vector, feature_array).flatten()\n",
1479
+ " similar_indices = similarity_scores.argsort()[-top_k-1:-1][::-1] # Exclude itself\n",
1480
+ " return similar_indices\n"
1481
+ ]
1482
+ },
1483
+ {
1484
+ "cell_type": "code",
1485
+ "execution_count": 34,
1486
+ "id": "c6e82e65",
1487
+ "metadata": {},
1488
+ "outputs": [
1489
+ {
1490
+ "name": "stdout",
1491
+ "output_type": "stream",
1492
+ "text": [
1493
+ "Time and Space\n",
1494
+ "A Monster From Space\n",
1495
+ "Budhayaan\n",
1496
+ "Manhunt in Space\n",
1497
+ "Journey to Space\n"
1498
+ ]
1499
+ }
1500
+ ],
1501
+ "source": [
1502
+ "recommended_movies = recommend(1)\n",
1503
+ "for mv in recommended_movies:\n",
1504
+ " print(get_movie_title(mv))"
1505
+ ]
1506
+ },
1507
+ {
1508
+ "cell_type": "code",
1509
+ "execution_count": 1,
1510
+ "id": "a1aa9f21",
1511
+ "metadata": {},
1512
+ "outputs": [
1513
+ {
1514
+ "name": "stdout",
1515
+ "output_type": "stream",
1516
+ "text": [
1517
+ "Defaulting to user installation because normal site-packages is not writeable\n",
1518
+ "Requirement already satisfied: annoy in c:\\users\\unkno\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.13_qbz5n2kfra8p0\\localcache\\local-packages\\python313\\site-packages (1.17.3)\n"
1519
+ ]
1520
+ },
1521
+ {
1522
+ "name": "stderr",
1523
+ "output_type": "stream",
1524
+ "text": [
1525
+ "\n",
1526
+ "[notice] A new release of pip is available: 25.0.1 -> 25.2\n",
1527
+ "[notice] To update, run: C:\\Users\\unkno\\AppData\\Local\\Microsoft\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\python.exe -m pip install --upgrade pip\n"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "name": "stdout",
1532
+ "output_type": "stream",
1533
+ "text": [
1534
+ "Time and Space\n",
1535
+ "A Monster From Space\n",
1536
+ "Budhayaan\n",
1537
+ "Manhunt in Space\n",
1538
+ "Journey to Space\n"
1539
+ ]
1540
+ }
1541
+ ],
1542
+ "source": [
1543
+ "!pip install annoy\n",
1544
+ "\n",
1545
+ "from annoy import AnnoyIndex\n",
1546
+ "import numpy as np\n",
1547
+ "import pandas as pd\n",
1548
+ "\n",
1549
+ "# Load data\n",
1550
+ "feature_array = np.load('feature_array.npz')['arr_0']\n",
1551
+ "new_movies = pd.read_csv('movies_df.csv')\n",
1552
+ "f = feature_array.shape[1]\n",
1553
+ "annoy_index = AnnoyIndex(f, 'angular')\n",
1554
+ "\n",
1555
+ "# Build Annoy index (run this only once, it takes some time—then it's super fast)\n",
1556
+ "for i in range(feature_array.shape[0]):\n",
1557
+ " annoy_index.add_item(i, feature_array[i])\n",
1558
+ "annoy_index.build(10)\n",
1559
+ "\n",
1560
+ "def get_movie_title(movie_id):\n",
1561
+ " return new_movies.iloc[movie_id]['title']\n",
1562
+ "\n",
1563
+ "def recommend_fast(movie_id, top_k=5):\n",
1564
+ " idxs = annoy_index.get_nns_by_item(movie_id, top_k+1)[1:] # skip itself\n",
1565
+ " return idxs\n",
1566
+ "\n",
1567
+ "# Example usage\n",
1568
+ "recommended_movies = recommend_fast(1)\n",
1569
+ "for mv in recommended_movies:\n",
1570
+ " print(get_movie_title(mv))\n"
1571
+ ]
1572
+ },
1573
+ {
1574
+ "cell_type": "code",
1575
+ "execution_count": 2,
1576
+ "id": "26664759",
1577
+ "metadata": {},
1578
+ "outputs": [
1579
+ {
1580
+ "name": "stdout",
1581
+ "output_type": "stream",
1582
+ "text": [
1583
+ "Time and Space\n",
1584
+ "A Monster From Space\n",
1585
+ "Budhayaan\n",
1586
+ "Manhunt in Space\n",
1587
+ "Journey to Space\n"
1588
+ ]
1589
+ }
1590
+ ],
1591
+ "source": [
1592
+ "recommended_movies = recommend_fast(1)\n",
1593
+ "for mv in recommended_movies:\n",
1594
+ " print(get_movie_title(mv))"
1595
+ ]
1596
+ },
1597
+ {
1598
+ "cell_type": "code",
1599
+ "execution_count": 3,
1600
+ "id": "f7be3f08",
1601
+ "metadata": {},
1602
+ "outputs": [
1603
+ {
1604
+ "name": "stdout",
1605
+ "output_type": "stream",
1606
+ "text": [
1607
+ "Hunger Games: Katniss & Rue\n",
1608
+ "Offside - an Unequal Game\n",
1609
+ "bottomofthe9th\n",
1610
+ "Ádám Wilds - The Movie\n",
1611
+ "Pi Pong\n"
1612
+ ]
1613
+ }
1614
+ ],
1615
+ "source": [
1616
+ "recommended_movies = recommend_fast(200)\n",
1617
+ "for mv in recommended_movies:\n",
1618
+ " print(get_movie_title(mv))"
1619
+ ]
1620
+ },
1621
+ {
1622
+ "cell_type": "code",
1623
+ "execution_count": 4,
1624
+ "id": "adda9d00",
1625
+ "metadata": {},
1626
+ "outputs": [
1627
+ {
1628
+ "name": "stdout",
1629
+ "output_type": "stream",
1630
+ "text": [
1631
+ "3.851896818727255 GB\n"
1632
+ ]
1633
+ }
1634
+ ],
1635
+ "source": [
1636
+ "annoy_index.save('my_index.ann')\n",
1637
+ "\n",
1638
+ "import os\n",
1639
+ "print(os.path.getsize('my_index.ann') / (1024**3), \"GB\")\n"
1640
+ ]
1641
+ },
1642
+ {
1643
+ "cell_type": "code",
1644
+ "execution_count": null,
1645
+ "id": "1f34ea24",
1646
+ "metadata": {},
1647
+ "outputs": [],
1648
+ "source": [
1649
+ "\n"
1650
+ ]
1651
+ }
1652
+ ],
1653
+ "metadata": {
1654
+ "kernelspec": {
1655
+ "display_name": "Python 3",
1656
+ "language": "python",
1657
+ "name": "python3"
1658
+ },
1659
+ "language_info": {
1660
+ "codemirror_mode": {
1661
+ "name": "ipython",
1662
+ "version": 3
1663
+ },
1664
+ "file_extension": ".py",
1665
+ "mimetype": "text/x-python",
1666
+ "name": "python",
1667
+ "nbconvert_exporter": "python",
1668
+ "pygments_lexer": "ipython3",
1669
+ "version": "3.13.9"
1670
+ }
1671
+ },
1672
+ "nbformat": 4,
1673
+ "nbformat_minor": 5
1674
+ }
movies_df.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ed8378723b509ba7c779ce6c495a32a3d6605a12d51fb9a1821c55aea2a808
3
+ size 309075548
my_index.ann ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51716a2ec84add79327be429a40daed697255f555ec3e1b1193c959dfbdf214f
3
+ size 4135942716