Spaces:

SURESHBEEKHANI
/

Movie-Recommender-System

Sleeping

App Files Files Community

SURESHBEEKHANI committed on Oct 25, 2024

Commit

509f367

verified ·

1 Parent(s): 7f3671b

Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
notebooks/data/data_preprocessing.csv +0 -0
notebooks/data/movies.csv +3 -0
notebooks/data_exploration.ipynb .ipynb +0 -0
notebooks/model_training.ipynb +320 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+notebooks/data/movies.csv filter=lfs diff=lfs merge=lfs -text

notebooks/data/data_preprocessing.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/data/movies.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5156dc49042d83a42e58e98526e8e0b46aa1f67a40b0e0b26b09428d8f327122
+size 45718781

notebooks/data_exploration.ipynb .ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/model_training.ipynb ADDED Viewed

	@@ -0,0 +1,320 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### **Import Pandas and Load the Dataset**\n",
+    "We're using a tool called **pandas** to help organize and work with data more easily.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:6: SyntaxWarning: invalid escape sequence '\\d'\n",
+      "<>:6: SyntaxWarning: invalid escape sequence '\\d'\n",
+      "C:\\Users\\SURESH BEEKHANI\\AppData\\Local\\Temp\\ipykernel_2400\\1582688377.py:6: SyntaxWarning: invalid escape sequence '\\d'\n",
+      "  df = pd.read_csv('data\\data_preprocessing.csv')  # Note: We use forward slashes for better compatibility on different systems.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>movie_id</th>\n",
+       "      <th>title</th>\n",
+       "      <th>tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>19995</td>\n",
+       "      <td>Avatar</td>\n",
+       "      <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>285</td>\n",
+       "      <td>Pirates of the Caribbean: At World's End</td>\n",
+       "      <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>206647</td>\n",
+       "      <td>Spectre</td>\n",
+       "      <td>A cryptic message from Bond’s past sends him o...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>49026</td>\n",
+       "      <td>The Dark Knight Rises</td>\n",
+       "      <td>Following the death of District Attorney Harve...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>49529</td>\n",
+       "      <td>John Carter</td>\n",
+       "      <td>John Carter is a war-weary, former military ca...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   movie_id                                     title  \\\n",
+       "0     19995                                    Avatar   \n",
+       "1       285  Pirates of the Caribbean: At World's End   \n",
+       "2    206647                                   Spectre   \n",
+       "3     49026                     The Dark Knight Rises   \n",
+       "4     49529                               John Carter   \n",
+       "\n",
+       "                                                tags  \n",
+       "0  In the 22nd century, a paraplegic Marine is di...  \n",
+       "1  Captain Barbossa, long believed to be dead, ha...  \n",
+       "2  A cryptic message from Bond’s past sends him o...  \n",
+       "3  Following the death of District Attorney Harve...  \n",
+       "4  John Carter is a war-weary, former military ca...  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# We're using a tool called \"pandas\" that helps us organize and work with data more easily.\n",
+    "import pandas as pd  # type: ignore\n",
+    "\n",
+    "# Here, we're opening a file called 'data_preprocessing.csv' from a folder named 'data'.\n",
+    "# As the preview below shows, it contains the columns movie_id, title and tags for each movie.\n",
+    "df = pd.read_csv('data\data_preprocessing.csv')  # NOTE: this path uses a Windows-style backslash, which triggers the SyntaxWarning shown in this cell's output; 'data/data_preprocessing.csv' is the portable form.\n",
+    "\n",
+    "# Now, we'll take a quick look at the first five rows of the data to understand what it looks like.\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### **Import CountVectorizer for Text Feature Extraction**\n",
+    "We will use **CountVectorizer** from the `sklearn` library to convert text data into a matrix of token counts. This helps in preparing the text data for machine learning models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "cv = CountVectorizer(max_features=5000,stop_words='english')\n",
+    "   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vector = cv.fit_transform(df['tags']).toarray()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4809, 5000)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vector.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### **Import Cosine Similarity for Measuring Similarity**\n",
+    "We will use **cosine_similarity** from the `sklearn` library to compute the pairwise similarity between the movies' token-count vectors.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics.pairwise import cosine_similarity"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "similarity = cosine_similarity(vector)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "np.int64(744)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df['title'] == 'The Lego Movie'].index[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### **Define a Function to Recommend Movies**\n",
+    "The following function `recommend` takes a movie title as input and prints the titles of the five closest matches, ranked by the precomputed cosine similarity scores.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def recommend(movie):\n",
+    "    index = df[df['title'] == movie].index[0]\n",
+    "    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])\n",
+    "    for i in distances[1:6]:\n",
+    "        print(df.iloc[i[0]].title)\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The Dark Knight\n",
+      "Batman Begins\n",
+      "Batman\n",
+      "Batman Returns\n",
+      "Batman Forever\n"
+     ]
+    }
+   ],
+   "source": [
+    "recommend('The Dark Knight Rises')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The Wind That Shakes the Barley\n",
+      "A Passage to India\n",
+      "Ramanujan\n",
+      "Guiana 1838\n",
+      "Chariots of Fire\n"
+     ]
+    }
+   ],
+   "source": [
+    "recommend('Gandhi')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "pickle.dump(df,open('artifacts/model.pkl','wb'))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle.dump(similarity,open('artifacts/similarity.pkl','wb'))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}