NishantD committed on
Commit
86f930d
·
verified ·
1 Parent(s): e8c0f75

Upload 6 files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. app.py +68 -0
  3. movies.pkl +3 -0
  4. similarity.pkl +3 -0
  5. tmdb_5000_credits.csv +3 -0
  6. tmdb_5000_movies.csv +0 -0
  7. train.ipynb +878 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tmdb_5000_credits.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pickle
3
+ import pandas as pd
4
+ import requests
5
+
6
# SECURITY NOTE(review): this TMDB API key is committed in plain text and is
# therefore public. It should be rotated and supplied through configuration
# (environment variable / secrets store) instead of living in the source.
api_key = 'd9f397a605c439a3b316dc3492e286c2'


# movies.pkl is written by train.ipynb as `df.to_dict()`, a dict of columns
# (movie_id, title, tags) that is turned back into a DataFrame here.
# NOTE(review): pickle executes arbitrary code on load; only load files that
# were produced by this project's own training notebook.
with open('movies.pkl', 'rb') as movies_file:  # binary read mode
    movies_dict = pickle.load(movies_file)
movies = pd.DataFrame(movies_dict)

st.title('Movie Recommender System')
selected_movie_name = st.selectbox('Select a movie:', movies['title'].values)

# similarity.pkl is the square cosine-similarity matrix from train.ipynb;
# row/column i refers to the i-th row (by position) of the movie table.
with open('similarity.pkl', 'rb') as similarity_file:
    similarity = pickle.load(similarity_file)
16
+
17
def fetch_poster(movie_id):
    """Return the TMDB poster image URL for ``movie_id``.

    Requests the movie details from the TMDB ``/3/movie/{movie_id}`` endpoint
    (authenticated with the module-level ``api_key``) and joins the returned
    ``poster_path`` onto the ``w500`` image base URL.

    Args:
        movie_id: TMDB identifier of the movie.

    Returns:
        str: Absolute URL of the poster image.

    Raises:
        requests.RequestException: If the request times out, fails, or TMDB
            answers with a non-2xx status.
        ValueError: If TMDB returns no ``poster_path`` for the movie.
    """
    response = requests.get(
        f'https://api.themoviedb.org/3/movie/{movie_id}',
        params={'api_key': api_key},
        timeout=10,  # never let a stalled connection hang the app
    )
    # Surface HTTP errors (bad key, unknown id, rate limit) directly instead
    # of failing later with a KeyError on the error payload.
    response.raise_for_status()

    poster_path = response.json().get('poster_path')
    if not poster_path:
        # The field can be missing or null; concatenating it would raise an
        # unhelpful TypeError.
        raise ValueError(f'TMDB returned no poster for movie id {movie_id}')

    # NOTE(review): TMDB poster paths normally start with "/"; strip it so the
    # resulting URL does not contain "//" (harmless if the slash is absent).
    return "https://image.tmdb.org/t/p/w500/" + poster_path.lstrip('/')
21
+
22
def recommend(movie):
    """Return the five movies most similar to ``movie`` with their posters.

    Args:
        movie: Title of the selected movie; must occur in ``movies['title']``.

    Returns:
        tuple[list, list]: ``(titles, poster_urls)`` for the five most similar
        movies, best match first.

    Raises:
        IndexError: If ``movie`` is not present in the movie table.
    """
    # Use the row *position*, not the index label. train.ipynb drops rows with
    # missing values without resetting the index and pickles `df.to_dict()`,
    # so the labels have gaps while `similarity` is indexed by position.
    # Indexing the matrix with a label picks the wrong row or overflows.
    positions = (movies['title'] == movie).to_numpy().nonzero()[0]
    movie_index = positions[0]  # IndexError for an unknown title, as before
    distances = similarity[movie_index]

    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Drop the selected movie explicitly instead of assuming it sorts first
    # (a movie with identical tags can tie with it).
    top_positions = [pos for pos, _ in ranked if pos != movie_index][:5]

    recommend_movies = []
    recommended_movies_posters = []
    for pos in top_positions:
        row = movies.iloc[pos]
        recommend_movies.append(row['title'])
        # Poster URL is looked up from the TMDB API by movie id.
        recommended_movies_posters.append(fetch_poster(row['movie_id']))

    return recommend_movies, recommended_movies_posters
38
+
39
+
40
# Show recommendations only once the user presses the button.
if st.button('Recommend'):
    st.write('You have selected:', selected_movie_name)
    recommendations, posters = recommend(selected_movie_name)
    st.write('Recommendations are : ')

    # Five side-by-side columns, each with a title above its poster.
    for slot, column in enumerate(st.columns(5)):
        with column:
            st.text(recommendations[slot])
            st.image(posters[slot])
65
+
66
+
67
+
68
+
movies.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b55dc3dafba86e695ca5c37287039ab38c547b993bb337304812c1cfc80ac3b
3
+ size 2216684
similarity.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab55d4e4c93cf300f38c25699e36ff1c694add725f3940279b4775e941f42e98
3
+ size 184781251
tmdb_5000_credits.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d0050599ff88d40366c4841204b1489862bca346bfa46c20b05a65d14508435
3
+ size 40044293
tmdb_5000_movies.csv ADDED
The diff for this file is too large to render. See raw diff
 
train.ipynb ADDED
@@ -0,0 +1,878 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 46,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import pandas as pd"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 47,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "movies = pd.read_csv('tmdb_5000_movies.csv')\n",
20
+ "credits = pd.read_csv('tmdb_5000_credits.csv')"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 48,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/plain": [
31
+ "(Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',\n",
32
+ " 'original_title', 'overview', 'popularity', 'production_companies',\n",
33
+ " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
34
+ " 'spoken_languages', 'status', 'tagline', 'title', 'vote_average',\n",
35
+ " 'vote_count'],\n",
36
+ " dtype='object'),\n",
37
+ " Index(['movie_id', 'title', 'cast', 'crew'], dtype='object'))"
38
+ ]
39
+ },
40
+ "execution_count": 48,
41
+ "metadata": {},
42
+ "output_type": "execute_result"
43
+ }
44
+ ],
45
+ "source": [
46
+ "movies.columns, credits.columns"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 49,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# title is same in both datasets\n",
56
+ "movies = movies.merge(credits, on='title')"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 50,
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "data": {
66
+ "text/html": [
67
+ "<div>\n",
68
+ "<style scoped>\n",
69
+ " .dataframe tbody tr th:only-of-type {\n",
70
+ " vertical-align: middle;\n",
71
+ " }\n",
72
+ "\n",
73
+ " .dataframe tbody tr th {\n",
74
+ " vertical-align: top;\n",
75
+ " }\n",
76
+ "\n",
77
+ " .dataframe thead th {\n",
78
+ " text-align: right;\n",
79
+ " }\n",
80
+ "</style>\n",
81
+ "<table border=\"1\" class=\"dataframe\">\n",
82
+ " <thead>\n",
83
+ " <tr style=\"text-align: right;\">\n",
84
+ " <th></th>\n",
85
+ " <th>budget</th>\n",
86
+ " <th>genres</th>\n",
87
+ " <th>homepage</th>\n",
88
+ " <th>id</th>\n",
89
+ " <th>keywords</th>\n",
90
+ " <th>original_language</th>\n",
91
+ " <th>original_title</th>\n",
92
+ " <th>overview</th>\n",
93
+ " <th>popularity</th>\n",
94
+ " <th>production_companies</th>\n",
95
+ " <th>...</th>\n",
96
+ " <th>runtime</th>\n",
97
+ " <th>spoken_languages</th>\n",
98
+ " <th>status</th>\n",
99
+ " <th>tagline</th>\n",
100
+ " <th>title</th>\n",
101
+ " <th>vote_average</th>\n",
102
+ " <th>vote_count</th>\n",
103
+ " <th>movie_id</th>\n",
104
+ " <th>cast</th>\n",
105
+ " <th>crew</th>\n",
106
+ " </tr>\n",
107
+ " </thead>\n",
108
+ " <tbody>\n",
109
+ " <tr>\n",
110
+ " <th>0</th>\n",
111
+ " <td>237000000</td>\n",
112
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
113
+ " <td>http://www.avatarmovie.com/</td>\n",
114
+ " <td>19995</td>\n",
115
+ " <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
116
+ " <td>en</td>\n",
117
+ " <td>Avatar</td>\n",
118
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
119
+ " <td>150.437577</td>\n",
120
+ " <td>[{\"name\": \"Ingenious Film Partners\", \"id\": 289...</td>\n",
121
+ " <td>...</td>\n",
122
+ " <td>162.0</td>\n",
123
+ " <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...</td>\n",
124
+ " <td>Released</td>\n",
125
+ " <td>Enter the World of Pandora.</td>\n",
126
+ " <td>Avatar</td>\n",
127
+ " <td>7.2</td>\n",
128
+ " <td>11800</td>\n",
129
+ " <td>19995</td>\n",
130
+ " <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
131
+ " <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
132
+ " </tr>\n",
133
+ " </tbody>\n",
134
+ "</table>\n",
135
+ "<p>1 rows × 23 columns</p>\n",
136
+ "</div>"
137
+ ],
138
+ "text/plain": [
139
+ " budget genres \\\n",
140
+ "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n",
141
+ "\n",
142
+ " homepage id \\\n",
143
+ "0 http://www.avatarmovie.com/ 19995 \n",
144
+ "\n",
145
+ " keywords original_language \\\n",
146
+ "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n",
147
+ "\n",
148
+ " original_title overview \\\n",
149
+ "0 Avatar In the 22nd century, a paraplegic Marine is di... \n",
150
+ "\n",
151
+ " popularity production_companies ... runtime \\\n",
152
+ "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... 162.0 \n",
153
+ "\n",
154
+ " spoken_languages status \\\n",
155
+ "0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n",
156
+ "\n",
157
+ " tagline title vote_average vote_count movie_id \\\n",
158
+ "0 Enter the World of Pandora. Avatar 7.2 11800 19995 \n",
159
+ "\n",
160
+ " cast \\\n",
161
+ "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \n",
162
+ "\n",
163
+ " crew \n",
164
+ "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n",
165
+ "\n",
166
+ "[1 rows x 23 columns]"
167
+ ]
168
+ },
169
+ "execution_count": 50,
170
+ "metadata": {},
171
+ "output_type": "execute_result"
172
+ }
173
+ ],
174
+ "source": [
175
+ "movies.head(1)"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 51,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "# selecting columns\n",
185
+ "\n",
186
+ "movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 52,
192
+ "metadata": {},
193
+ "outputs": [
194
+ {
195
+ "name": "stdout",
196
+ "output_type": "stream",
197
+ "text": [
198
+ "<class 'pandas.core.frame.DataFrame'>\n",
199
+ "RangeIndex: 4809 entries, 0 to 4808\n",
200
+ "Data columns (total 7 columns):\n",
201
+ " # Column Non-Null Count Dtype \n",
202
+ "--- ------ -------------- ----- \n",
203
+ " 0 movie_id 4809 non-null int64 \n",
204
+ " 1 title 4809 non-null object\n",
205
+ " 2 overview 4806 non-null object\n",
206
+ " 3 genres 4809 non-null object\n",
207
+ " 4 keywords 4809 non-null object\n",
208
+ " 5 cast 4809 non-null object\n",
209
+ " 6 crew 4809 non-null object\n",
210
+ "dtypes: int64(1), object(6)\n",
211
+ "memory usage: 263.1+ KB\n"
212
+ ]
213
+ }
214
+ ],
215
+ "source": [
216
+ "movies.info()"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 53,
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "movies.dropna(inplace=True)"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 54,
231
+ "metadata": {},
232
+ "outputs": [
233
+ {
234
+ "data": {
235
+ "text/plain": [
236
+ "0"
237
+ ]
238
+ },
239
+ "execution_count": 54,
240
+ "metadata": {},
241
+ "output_type": "execute_result"
242
+ }
243
+ ],
244
+ "source": [
245
+ "movies.duplicated().sum()"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 55,
251
+ "metadata": {},
252
+ "outputs": [
253
+ {
254
+ "data": {
255
+ "text/plain": [
256
+ "'[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"name\": \"Fantasy\"}, {\"id\": 878, \"name\": \"Science Fiction\"}]'"
257
+ ]
258
+ },
259
+ "execution_count": 55,
260
+ "metadata": {},
261
+ "output_type": "execute_result"
262
+ }
263
+ ],
264
+ "source": [
265
+ "movies.iloc[0].genres"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 56,
271
+ "metadata": {},
272
+ "outputs": [],
273
+ "source": [
274
+ "import ast \n",
275
+ "def conversion(obj):\n",
276
+ " l = []\n",
277
+ " for i in ast.literal_eval(obj):\n",
278
+ " l.append(i['name'])\n",
279
+ " return l\n"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": 57,
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "movies['genres'] = movies['genres'].apply(conversion)\n",
289
+ "movies['keywords'] = movies['keywords'].apply(conversion)"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 58,
295
+ "metadata": {},
296
+ "outputs": [],
297
+ "source": [
298
+ "# for cast, we need to get top 3 actors\n",
299
+ "def conversion(obj):\n",
300
+ " l = []\n",
301
+ " counter = 0\n",
302
+ " for i in ast.literal_eval(obj):\n",
303
+ " if counter != 3:\n",
304
+ " l.append(i['name'])\n",
305
+ " counter += 1\n",
306
+ " else:\n",
307
+ " break\n",
308
+ " return l"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": 59,
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "movies['cast'] = movies['cast'].apply(conversion)"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 60,
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "def fetch_director(obj):\n",
327
+ " l = []\n",
328
+ " for i in ast.literal_eval(obj):\n",
329
+ " if i['job'] == 'Director': \n",
330
+ " l.append(i['name'])\n",
331
+ " break\n",
332
+ " return l"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 61,
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": [
341
+ "movies['crew'] = movies['crew'].apply(fetch_director)"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 63,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "# splitting overview into a list of words\n",
351
+ "movies['overview'] = movies['overview'].apply(lambda x: x.split())"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 64,
357
+ "metadata": {},
358
+ "outputs": [
359
+ {
360
+ "data": {
361
+ "text/html": [
362
+ "<div>\n",
363
+ "<style scoped>\n",
364
+ " .dataframe tbody tr th:only-of-type {\n",
365
+ " vertical-align: middle;\n",
366
+ " }\n",
367
+ "\n",
368
+ " .dataframe tbody tr th {\n",
369
+ " vertical-align: top;\n",
370
+ " }\n",
371
+ "\n",
372
+ " .dataframe thead th {\n",
373
+ " text-align: right;\n",
374
+ " }\n",
375
+ "</style>\n",
376
+ "<table border=\"1\" class=\"dataframe\">\n",
377
+ " <thead>\n",
378
+ " <tr style=\"text-align: right;\">\n",
379
+ " <th></th>\n",
380
+ " <th>movie_id</th>\n",
381
+ " <th>title</th>\n",
382
+ " <th>overview</th>\n",
383
+ " <th>genres</th>\n",
384
+ " <th>keywords</th>\n",
385
+ " <th>cast</th>\n",
386
+ " <th>crew</th>\n",
387
+ " </tr>\n",
388
+ " </thead>\n",
389
+ " <tbody>\n",
390
+ " <tr>\n",
391
+ " <th>0</th>\n",
392
+ " <td>19995</td>\n",
393
+ " <td>Avatar</td>\n",
394
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
395
+ " <td>[Action, Adventure, Fantasy, Science Fiction]</td>\n",
396
+ " <td>[culture clash, future, space war, space colon...</td>\n",
397
+ " <td>[Sam Worthington, Zoe Saldana, Sigourney Weaver]</td>\n",
398
+ " <td>[James Cameron]</td>\n",
399
+ " </tr>\n",
400
+ " </tbody>\n",
401
+ "</table>\n",
402
+ "</div>"
403
+ ],
404
+ "text/plain": [
405
+ " movie_id title overview \\\n",
406
+ "0 19995 Avatar [In, the, 22nd, century,, a, paraplegic, Marin... \n",
407
+ "\n",
408
+ " genres \\\n",
409
+ "0 [Action, Adventure, Fantasy, Science Fiction] \n",
410
+ "\n",
411
+ " keywords \\\n",
412
+ "0 [culture clash, future, space war, space colon... \n",
413
+ "\n",
414
+ " cast crew \n",
415
+ "0 [Sam Worthington, Zoe Saldana, Sigourney Weaver] [James Cameron] "
416
+ ]
417
+ },
418
+ "execution_count": 64,
419
+ "metadata": {},
420
+ "output_type": "execute_result"
421
+ }
422
+ ],
423
+ "source": [
424
+ "movies.head(1)"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": 65,
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "movies['genres'] = movies['genres'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n",
434
+ "movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n",
435
+ "movies['cast'] = movies['cast'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n",
436
+ "movies['crew'] = movies['crew'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": 66,
442
+ "metadata": {},
443
+ "outputs": [
444
+ {
445
+ "data": {
446
+ "text/html": [
447
+ "<div>\n",
448
+ "<style scoped>\n",
449
+ " .dataframe tbody tr th:only-of-type {\n",
450
+ " vertical-align: middle;\n",
451
+ " }\n",
452
+ "\n",
453
+ " .dataframe tbody tr th {\n",
454
+ " vertical-align: top;\n",
455
+ " }\n",
456
+ "\n",
457
+ " .dataframe thead th {\n",
458
+ " text-align: right;\n",
459
+ " }\n",
460
+ "</style>\n",
461
+ "<table border=\"1\" class=\"dataframe\">\n",
462
+ " <thead>\n",
463
+ " <tr style=\"text-align: right;\">\n",
464
+ " <th></th>\n",
465
+ " <th>movie_id</th>\n",
466
+ " <th>title</th>\n",
467
+ " <th>overview</th>\n",
468
+ " <th>genres</th>\n",
469
+ " <th>keywords</th>\n",
470
+ " <th>cast</th>\n",
471
+ " <th>crew</th>\n",
472
+ " </tr>\n",
473
+ " </thead>\n",
474
+ " <tbody>\n",
475
+ " <tr>\n",
476
+ " <th>0</th>\n",
477
+ " <td>19995</td>\n",
478
+ " <td>Avatar</td>\n",
479
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
480
+ " <td>[Action, Adventure, Fantasy, ScienceFiction]</td>\n",
481
+ " <td>[cultureclash, future, spacewar, spacecolony, ...</td>\n",
482
+ " <td>[SamWorthington, ZoeSaldana, SigourneyWeaver]</td>\n",
483
+ " <td>[JamesCameron]</td>\n",
484
+ " </tr>\n",
485
+ " </tbody>\n",
486
+ "</table>\n",
487
+ "</div>"
488
+ ],
489
+ "text/plain": [
490
+ " movie_id title overview \\\n",
491
+ "0 19995 Avatar [In, the, 22nd, century,, a, paraplegic, Marin... \n",
492
+ "\n",
493
+ " genres \\\n",
494
+ "0 [Action, Adventure, Fantasy, ScienceFiction] \n",
495
+ "\n",
496
+ " keywords \\\n",
497
+ "0 [cultureclash, future, spacewar, spacecolony, ... \n",
498
+ "\n",
499
+ " cast crew \n",
500
+ "0 [SamWorthington, ZoeSaldana, SigourneyWeaver] [JamesCameron] "
501
+ ]
502
+ },
503
+ "execution_count": 66,
504
+ "metadata": {},
505
+ "output_type": "execute_result"
506
+ }
507
+ ],
508
+ "source": [
509
+ "movies.head(1)"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": 67,
515
+ "metadata": {},
516
+ "outputs": [],
517
+ "source": [
518
+ "movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']\n",
519
+ "df = movies[['movie_id', 'title', 'tags']]"
520
+ ]
521
+ },
522
+ {
523
+ "cell_type": "code",
524
+ "execution_count": 68,
525
+ "metadata": {},
526
+ "outputs": [
527
+ {
528
+ "data": {
529
+ "text/html": [
530
+ "<div>\n",
531
+ "<style scoped>\n",
532
+ " .dataframe tbody tr th:only-of-type {\n",
533
+ " vertical-align: middle;\n",
534
+ " }\n",
535
+ "\n",
536
+ " .dataframe tbody tr th {\n",
537
+ " vertical-align: top;\n",
538
+ " }\n",
539
+ "\n",
540
+ " .dataframe thead th {\n",
541
+ " text-align: right;\n",
542
+ " }\n",
543
+ "</style>\n",
544
+ "<table border=\"1\" class=\"dataframe\">\n",
545
+ " <thead>\n",
546
+ " <tr style=\"text-align: right;\">\n",
547
+ " <th></th>\n",
548
+ " <th>movie_id</th>\n",
549
+ " <th>title</th>\n",
550
+ " <th>tags</th>\n",
551
+ " </tr>\n",
552
+ " </thead>\n",
553
+ " <tbody>\n",
554
+ " <tr>\n",
555
+ " <th>0</th>\n",
556
+ " <td>19995</td>\n",
557
+ " <td>Avatar</td>\n",
558
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
559
+ " </tr>\n",
560
+ " </tbody>\n",
561
+ "</table>\n",
562
+ "</div>"
563
+ ],
564
+ "text/plain": [
565
+ " movie_id title tags\n",
566
+ "0 19995 Avatar [In, the, 22nd, century,, a, paraplegic, Marin..."
567
+ ]
568
+ },
569
+ "execution_count": 68,
570
+ "metadata": {},
571
+ "output_type": "execute_result"
572
+ }
573
+ ],
574
+ "source": [
575
+ "df.head(1)"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 69,
581
+ "metadata": {},
582
+ "outputs": [
583
+ {
584
+ "name": "stderr",
585
+ "output_type": "stream",
586
+ "text": [
587
+ "/var/folders/62/rv55r95d3xx3npwdjhz_3nbh0000gn/T/ipykernel_1863/949442192.py:1: SettingWithCopyWarning: \n",
588
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
589
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
590
+ "\n",
591
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
592
+ " df['tags'] = df['tags'].apply(lambda x: \" \".join(x))\n"
593
+ ]
594
+ }
595
+ ],
596
+ "source": [
597
+ "df['tags'] = df['tags'].apply(lambda x: \" \".join(x))"
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": 70,
603
+ "metadata": {},
604
+ "outputs": [
605
+ {
606
+ "data": {
607
+ "text/plain": [
608
+ "'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'"
609
+ ]
610
+ },
611
+ "execution_count": 70,
612
+ "metadata": {},
613
+ "output_type": "execute_result"
614
+ }
615
+ ],
616
+ "source": [
617
+ "df.iloc[0].tags"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "execution_count": 71,
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "name": "stderr",
627
+ "output_type": "stream",
628
+ "text": [
629
+ "/var/folders/62/rv55r95d3xx3npwdjhz_3nbh0000gn/T/ipykernel_1863/670192424.py:2: SettingWithCopyWarning: \n",
630
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
631
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
632
+ "\n",
633
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
634
+ " df['tags'] = df['tags'].apply(lambda x: x.lower())\n"
635
+ ]
636
+ }
637
+ ],
638
+ "source": [
639
+ "# Converting into lower case\n",
640
+ "df['tags'] = df['tags'].apply(lambda x: x.lower())"
641
+ ]
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "execution_count": 89,
646
+ "metadata": {},
647
+ "outputs": [],
648
+ "source": [
649
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
650
+ "cv = CountVectorizer(max_features=5000, stop_words='english')"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "code",
655
+ "execution_count": 90,
656
+ "metadata": {},
657
+ "outputs": [
658
+ {
659
+ "data": {
660
+ "text/plain": [
661
+ "(4806, 5000)"
662
+ ]
663
+ },
664
+ "execution_count": 90,
665
+ "metadata": {},
666
+ "output_type": "execute_result"
667
+ }
668
+ ],
669
+ "source": [
670
+ "cv.fit_transform(df['tags']).toarray().shape"
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "code",
675
+ "execution_count": 91,
676
+ "metadata": {},
677
+ "outputs": [],
678
+ "source": [
679
+ "vectors = cv.fit_transform(df['tags']).toarray()"
680
+ ]
681
+ },
682
+ {
683
+ "cell_type": "code",
684
+ "execution_count": 92,
685
+ "metadata": {},
686
+ "outputs": [
687
+ {
688
+ "data": {
689
+ "text/plain": [
690
+ "array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],\n",
691
+ " dtype=object)"
692
+ ]
693
+ },
694
+ "execution_count": 92,
695
+ "metadata": {},
696
+ "output_type": "execute_result"
697
+ }
698
+ ],
699
+ "source": [
700
+ "cv.get_feature_names_out()"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": 86,
706
+ "metadata": {},
707
+ "outputs": [],
708
+ "source": [
709
+ "# changing the verbs to their root form\n",
710
+ "from nltk.stem.porter import PorterStemmer\n",
711
+ "ps = PorterStemmer()\n"
712
+ ]
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "execution_count": 87,
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": [
720
+ "def stem(text):\n",
721
+ " y = []\n",
722
+ " for i in text.split():\n",
723
+ " y.append(ps.stem(i)) # ps.stem('loved') -> love\n",
724
+ " return \" \".join(y)"
725
+ ]
726
+ },
727
+ {
728
+ "cell_type": "code",
729
+ "execution_count": 88,
730
+ "metadata": {},
731
+ "outputs": [
732
+ {
733
+ "name": "stderr",
734
+ "output_type": "stream",
735
+ "text": [
736
+ "/var/folders/62/rv55r95d3xx3npwdjhz_3nbh0000gn/T/ipykernel_1863/866399325.py:1: SettingWithCopyWarning: \n",
737
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
738
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
739
+ "\n",
740
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
741
+ " df['tags'] = df['tags'].apply(stem)\n"
742
+ ]
743
+ }
744
+ ],
745
+ "source": [
746
+ "df['tags'] = df['tags'].apply(stem)"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": 93,
752
+ "metadata": {},
753
+ "outputs": [],
754
+ "source": [
755
+ "from sklearn.metrics.pairwise import cosine_similarity"
756
+ ]
757
+ },
758
+ {
759
+ "cell_type": "code",
760
+ "execution_count": 94,
761
+ "metadata": {},
762
+ "outputs": [
763
+ {
764
+ "data": {
765
+ "text/plain": [
766
+ "(4806, 4806)"
767
+ ]
768
+ },
769
+ "execution_count": 94,
770
+ "metadata": {},
771
+ "output_type": "execute_result"
772
+ }
773
+ ],
774
+ "source": [
775
+ "cosine_similarity(vectors).shape"
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "code",
780
+ "execution_count": 95,
781
+ "metadata": {},
782
+ "outputs": [],
783
+ "source": [
784
+ "similarity = cosine_similarity(vectors)"
785
+ ]
786
+ },
787
+ {
788
+ "cell_type": "code",
789
+ "execution_count": 96,
790
+ "metadata": {},
791
+ "outputs": [],
792
+ "source": [
793
+ "def recommend(movie):\n",
794
+ " movie_index = df[df['title'] == movie].index[0]\n",
795
+ " distances = similarity[movie_index]\n",
796
+ " movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6] # enumerate gives preserves index and lambda is used to sort the list based on 2nd element\n",
797
+ " for i in movies_list:\n",
798
+ " print(df.iloc[i[0]].title) # i[0] is the index of the movie"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": 109,
804
+ "metadata": {},
805
+ "outputs": [
806
+ {
807
+ "name": "stdout",
808
+ "output_type": "stream",
809
+ "text": [
810
+ "Despicable Me 2\n",
811
+ "The Croods\n",
812
+ "Penguins of Madagascar\n",
813
+ "Batman\n",
814
+ "Cars 2\n"
815
+ ]
816
+ }
817
+ ],
818
+ "source": [
819
+ "recommend('Minions')"
820
+ ]
821
+ },
822
+ {
823
+ "cell_type": "code",
824
+ "execution_count": 110,
825
+ "metadata": {},
826
+ "outputs": [],
827
+ "source": [
828
+ "import pickle"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": 117,
834
+ "metadata": {},
835
+ "outputs": [],
836
+ "source": [
837
+ "pickle.dump(df.to_dict(), open('movies.pkl', 'wb')) # open in write binary mode"
838
+ ]
839
+ },
840
+ {
841
+ "cell_type": "code",
842
+ "execution_count": 119,
843
+ "metadata": {},
844
+ "outputs": [],
845
+ "source": [
846
+ "pickle.dump(similarity, open('similarity.pkl', 'wb'))"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": null,
852
+ "metadata": {},
853
+ "outputs": [],
854
+ "source": []
855
+ }
856
+ ],
857
+ "metadata": {
858
+ "kernelspec": {
859
+ "display_name": "ML",
860
+ "language": "python",
861
+ "name": "python3"
862
+ },
863
+ "language_info": {
864
+ "codemirror_mode": {
865
+ "name": "ipython",
866
+ "version": 3
867
+ },
868
+ "file_extension": ".py",
869
+ "mimetype": "text/x-python",
870
+ "name": "python",
871
+ "nbconvert_exporter": "python",
872
+ "pygments_lexer": "ipython3",
873
+ "version": "3.12.3"
874
+ }
875
+ },
876
+ "nbformat": 4,
877
+ "nbformat_minor": 2
878
+ }