SURESHBEEKHANI commited on
Commit
509f367
·
verified ·
1 Parent(s): 7f3671b

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ notebooks/data/movies.csv filter=lfs diff=lfs merge=lfs -text
notebooks/data/data_preprocessing.csv ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/data/movies.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5156dc49042d83a42e58e98526e8e0b46aa1f67a40b0e0b26b09428d8f327122
3
+ size 45718781
notebooks/data_exploration.ipynb .ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/model_training.ipynb ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "#### **Import Pandas and Load the Dataset**\n",
8
+ "We're using a tool called **pandas** to help organize and work with data more easily.\n",
9
+ "\n"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 5,
15
+ "metadata": {},
16
+ "outputs": [
17
+ {
18
+ "name": "stderr",
19
+ "output_type": "stream",
20
+ "text": [
21
+ "<>:6: SyntaxWarning: invalid escape sequence '\\d'\n",
22
+ "<>:6: SyntaxWarning: invalid escape sequence '\\d'\n",
23
+ "C:\\Users\\SURESH BEEKHANI\\AppData\\Local\\Temp\\ipykernel_2400\\1582688377.py:6: SyntaxWarning: invalid escape sequence '\\d'\n",
24
+ " df = pd.read_csv('data\\data_preprocessing.csv') # Note: We use forward slashes for better compatibility on different systems.\n"
25
+ ]
26
+ },
27
+ {
28
+ "data": {
29
+ "text/html": [
30
+ "<div>\n",
31
+ "<style scoped>\n",
32
+ " .dataframe tbody tr th:only-of-type {\n",
33
+ " vertical-align: middle;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe tbody tr th {\n",
37
+ " vertical-align: top;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe thead th {\n",
41
+ " text-align: right;\n",
42
+ " }\n",
43
+ "</style>\n",
44
+ "<table border=\"1\" class=\"dataframe\">\n",
45
+ " <thead>\n",
46
+ " <tr style=\"text-align: right;\">\n",
47
+ " <th></th>\n",
48
+ " <th>movie_id</th>\n",
49
+ " <th>title</th>\n",
50
+ " <th>tags</th>\n",
51
+ " </tr>\n",
52
+ " </thead>\n",
53
+ " <tbody>\n",
54
+ " <tr>\n",
55
+ " <th>0</th>\n",
56
+ " <td>19995</td>\n",
57
+ " <td>Avatar</td>\n",
58
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
59
+ " </tr>\n",
60
+ " <tr>\n",
61
+ " <th>1</th>\n",
62
+ " <td>285</td>\n",
63
+ " <td>Pirates of the Caribbean: At World's End</td>\n",
64
+ " <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
65
+ " </tr>\n",
66
+ " <tr>\n",
67
+ " <th>2</th>\n",
68
+ " <td>206647</td>\n",
69
+ " <td>Spectre</td>\n",
70
+ " <td>A cryptic message from Bond’s past sends him o...</td>\n",
71
+ " </tr>\n",
72
+ " <tr>\n",
73
+ " <th>3</th>\n",
74
+ " <td>49026</td>\n",
75
+ " <td>The Dark Knight Rises</td>\n",
76
+ " <td>Following the death of District Attorney Harve...</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>4</th>\n",
80
+ " <td>49529</td>\n",
81
+ " <td>John Carter</td>\n",
82
+ " <td>John Carter is a war-weary, former military ca...</td>\n",
83
+ " </tr>\n",
84
+ " </tbody>\n",
85
+ "</table>\n",
86
+ "</div>"
87
+ ],
88
+ "text/plain": [
89
+ " movie_id title \\\n",
90
+ "0 19995 Avatar \n",
91
+ "1 285 Pirates of the Caribbean: At World's End \n",
92
+ "2 206647 Spectre \n",
93
+ "3 49026 The Dark Knight Rises \n",
94
+ "4 49529 John Carter \n",
95
+ "\n",
96
+ " tags \n",
97
+ "0 In the 22nd century, a paraplegic Marine is di... \n",
98
+ "1 Captain Barbossa, long believed to be dead, ha... \n",
99
+ "2 A cryptic message from Bond’s past sends him o... \n",
100
+ "3 Following the death of District Attorney Harve... \n",
101
+ "4 John Carter is a war-weary, former military ca... "
102
+ ]
103
+ },
104
+ "execution_count": 5,
105
+ "metadata": {},
106
+ "output_type": "execute_result"
107
+ }
108
+ ],
109
+ "source": [
110
+ "# We're using a tool called \"pandas\" that helps us organize and work with data more easily.\n",
111
+ "import pandas as pd # type: ignore\n",
112
+ "\n",
113
+ "# Here, we're opening a file called 'data_preprocessing.csv' from a folder named 'data'.\n",
114
+ "# This file contains the preprocessed movie data: a movie_id, a title, and a text 'tags' column for each movie.\n",
115
+ "df = pd.read_csv('data/data_preprocessing.csv') # Note: We use forward slashes for better compatibility on different systems.\n",
116
+ "\n",
117
+ "# Now, we'll take a quick look at the first five rows of the data to understand what it looks like.\n",
118
+ "df.head()"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "markdown",
123
+ "metadata": {},
124
+ "source": [
125
+ "#### **Import CountVectorizer for Text Feature Extraction**\n",
126
+ "We will use **CountVectorizer** from the `sklearn` library to convert text data into a matrix of token counts. This helps in preparing the text data for machine learning models."
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 6,
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": [
135
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
136
+ "cv = CountVectorizer(max_features=5000,stop_words='english')\n",
137
+ " "
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 7,
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "vector = cv.fit_transform(df['tags']).toarray()"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 8,
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "text/plain": [
157
+ "(4809, 5000)"
158
+ ]
159
+ },
160
+ "execution_count": 8,
161
+ "metadata": {},
162
+ "output_type": "execute_result"
163
+ }
164
+ ],
165
+ "source": [
166
+ "vector.shape"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "metadata": {},
172
+ "source": [
173
+ "#### **Import Cosine Similarity for Measuring Similarity**\n",
174
+ "We will use **cosine_similarity** from the `sklearn` library to compute the pairwise similarity between the movies' tag-count vectors.\n"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 9,
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "from sklearn.metrics.pairwise import cosine_similarity"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 10,
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "similarity = cosine_similarity(vector)"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 11,
198
+ "metadata": {},
199
+ "outputs": [
200
+ {
201
+ "data": {
202
+ "text/plain": [
203
+ "np.int64(744)"
204
+ ]
205
+ },
206
+ "execution_count": 11,
207
+ "metadata": {},
208
+ "output_type": "execute_result"
209
+ }
210
+ ],
211
+ "source": [
212
+ "df[df['title'] == 'The Lego Movie'].index[0]"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "markdown",
217
+ "metadata": {},
218
+ "source": [
219
+ "#### **Define a Function to Recommend Movies**\n",
220
+ "The following function `recommend` takes a movie title as input and prints the titles of the five most similar movies, based on the precomputed similarity scores.\n"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 12,
226
+ "metadata": {},
227
+ "outputs": [],
228
+ "source": [
229
+ "def recommend(movie):\n",
230
+ " index = df[df['title'] == movie].index[0]\n",
231
+ " distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])\n",
232
+ " for i in distances[1:6]:\n",
233
+ " print(df.iloc[i[0]].title)\n",
234
+ " "
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 13,
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "name": "stdout",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "The Dark Knight\n",
247
+ "Batman Begins\n",
248
+ "Batman\n",
249
+ "Batman Returns\n",
250
+ "Batman Forever\n"
251
+ ]
252
+ }
253
+ ],
254
+ "source": [
255
+ "recommend('The Dark Knight Rises')\n"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": 14,
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stdout",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "The Wind That Shakes the Barley\n",
268
+ "A Passage to India\n",
269
+ "Ramanujan\n",
270
+ "Guiana 1838\n",
271
+ "Chariots of Fire\n"
272
+ ]
273
+ }
274
+ ],
275
+ "source": [
276
+ "recommend('Gandhi')"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": 15,
282
+ "metadata": {},
283
+ "outputs": [],
284
+ "source": [
285
+ "import pickle\n",
286
+ "pickle.dump(df,open('artifacts/model.pkl','wb'))\n"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": 16,
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "pickle.dump(similarity,open('artifacts/similarity.pkl','wb'))\n"
296
+ ]
297
+ }
298
+ ],
299
+ "metadata": {
300
+ "kernelspec": {
301
+ "display_name": "Python 3",
302
+ "language": "python",
303
+ "name": "python3"
304
+ },
305
+ "language_info": {
306
+ "codemirror_mode": {
307
+ "name": "ipython",
308
+ "version": 3
309
+ },
310
+ "file_extension": ".py",
311
+ "mimetype": "text/x-python",
312
+ "name": "python",
313
+ "nbconvert_exporter": "python",
314
+ "pygments_lexer": "ipython3",
315
+ "version": "3.12.0"
316
+ }
317
+ },
318
+ "nbformat": 4,
319
+ "nbformat_minor": 2
320
+ }