Spaces:

NishantD
/

Movie_Recommender

Sleeping

App Files Files Community

NishantD committed on May 22, 2024

Commit

86f930d

verified ·

1 Parent(s): e8c0f75

Upload 6 files

Browse files

Files changed (7) hide show

.gitattributes +1 -0
app.py +68 -0
movies.pkl +3 -0
similarity.pkl +3 -0
tmdb_5000_credits.csv +3 -0
tmdb_5000_movies.csv +0 -0
train.ipynb +878 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tmdb_5000_credits.csv filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import os
+import pickle
+
+import pandas as pd
+import requests
+import streamlit as st
+api_key = 'd9f397a605c439a3b316dc3492e286c2'
+movies_dict = pickle.load(open('movies.pkl', 'rb')) # open the file in read mode
+movies = pd.DataFrame(movies_dict)
+st.title('Movie Recommender System')
+selected_movie_name = st.selectbox('Select a movie:', movies['title'].values)
+similarity = pickle.load(open('similarity.pkl', 'rb'))
+def fetch_poster(movie_id):
+    response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}')
+    data = response.json()
+    return "https://image.tmdb.org/t/p/w500/" + data['poster_path']
+def recommend(movie):
+    movie_index = movies[movies['title'] == movie].index[0]
+    distances = similarity[movie_index]
+    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
+    recommend_movies = []
+    recomended_movies_posters = []
+    for i in movies_list:
+        movie_id = movies.iloc[i[0]].movie_id
+        recommend_movies.append(movies.iloc[i[0]].title)
+        #fetch poster from API
+        poster = fetch_poster(movie_id)
+        recomended_movies_posters.append(poster)
+    return recommend_movies, recomended_movies_posters
+if st.button('Recommend'):
+    st.write('You have selected:', selected_movie_name)
+    recommendations, posters = recommend(selected_movie_name)
+    st.write('Recommendations are : ')
+    col1, col2, col3, col4, col5 = st.columns(5)
+    with col1:
+        st.text(recommendations[0])
+        st.image(posters[0])
+    with col2:
+        st.text(recommendations[1])
+        st.image(posters[1])
+    with col3:
+        st.text(recommendations[2])
+        st.image(posters[2])
+    with col4:
+        st.text(recommendations[3])
+        st.image(posters[3])
+    with col5:
+        st.text(recommendations[4])
+        st.image(posters[4])

movies.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b55dc3dafba86e695ca5c37287039ab38c547b993bb337304812c1cfc80ac3b
+size 2216684

similarity.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab55d4e4c93cf300f38c25699e36ff1c694add725f3940279b4775e941f42e98
+size 184781251

tmdb_5000_credits.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d0050599ff88d40366c4841204b1489862bca346bfa46c20b05a65d14508435
+size 40044293

tmdb_5000_movies.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

train.ipynb ADDED Viewed

	@@ -0,0 +1,878 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies = pd.read_csv('tmdb_5000_movies.csv')\n",
+    "credits = pd.read_csv('tmdb_5000_credits.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',\n",
+       "        'original_title', 'overview', 'popularity', 'production_companies',\n",
+       "        'production_countries', 'release_date', 'revenue', 'runtime',\n",
+       "        'spoken_languages', 'status', 'tagline', 'title', 'vote_average',\n",
+       "        'vote_count'],\n",
+       "       dtype='object'),\n",
+       " Index(['movie_id', 'title', 'cast', 'crew'], dtype='object'))"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "movies.columns, credits.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# title is same in both datasets\n",
+    "movies = movies.merge(credits, on='title')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>budget</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>homepage</th>\n",
+       "      <th>id</th>\n",
+       "      <th>keywords</th>\n",
+       "      <th>original_language</th>\n",
+       "      <th>original_title</th>\n",
+       "      <th>overview</th>\n",
+       "      <th>popularity</th>\n",
+       "      <th>production_companies</th>\n",
+       "      <th>...</th>\n",
+       "      <th>runtime</th>\n",
+       "      <th>spoken_languages</th>\n",
+       "      <th>status</th>\n",
+       "      <th>tagline</th>\n",
+       "      <th>title</th>\n",
+       "      <th>vote_average</th>\n",
+       "      <th>vote_count</th>\n",
+       "      <th>movie_id</th>\n",
+       "      <th>cast</th>\n",
+       "      <th>crew</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>237000000</td>\n",
+       "      <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
+       "      <td>http://www.avatarmovie.com/</td>\n",
+       "      <td>19995</td>\n",
+       "      <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
+       "      <td>en</td>\n",
+       "      <td>Avatar</td>\n",
+       "      <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
+       "      <td>150.437577</td>\n",
+       "      <td>[{\"name\": \"Ingenious Film Partners\", \"id\": 289...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>162.0</td>\n",
+       "      <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...</td>\n",
+       "      <td>Released</td>\n",
+       "      <td>Enter the World of Pandora.</td>\n",
+       "      <td>Avatar</td>\n",
+       "      <td>7.2</td>\n",
+       "      <td>11800</td>\n",
+       "      <td>19995</td>\n",
+       "      <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
+       "      <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1 rows × 23 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      budget                                             genres  \\\n",
+       "0  237000000  [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...   \n",
+       "\n",
+       "                      homepage     id  \\\n",
+       "0  http://www.avatarmovie.com/  19995   \n",
+       "\n",
+       "                                            keywords original_language  \\\n",
+       "0  [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...                en   \n",
+       "\n",
+       "  original_title                                           overview  \\\n",
+       "0         Avatar  In the 22nd century, a paraplegic Marine is di...   \n",
+       "\n",
+       "   popularity                               production_companies  ... runtime  \\\n",
+       "0  150.437577  [{\"name\": \"Ingenious Film Partners\", \"id\": 289...  ...   162.0   \n",
+       "\n",
+       "                                    spoken_languages    status  \\\n",
+       "0  [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...  Released   \n",
+       "\n",
+       "                       tagline   title vote_average vote_count movie_id  \\\n",
+       "0  Enter the World of Pandora.  Avatar          7.2      11800    19995   \n",
+       "\n",
+       "                                                cast  \\\n",
+       "0  [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...   \n",
+       "\n",
+       "                                                crew  \n",
+       "0  [{\"credit_id\": \"52fe48009251416c750aca23\", \"de...  \n",
+       "\n",
+       "[1 rows x 23 columns]"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "movies.head(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# selecting columns\n",
+    "\n",
+    "movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 4809 entries, 0 to 4808\n",
+      "Data columns (total 7 columns):\n",
+      " #   Column    Non-Null Count  Dtype \n",
+      "---  ------    --------------  ----- \n",
+      " 0   movie_id  4809 non-null   int64 \n",
+      " 1   title     4809 non-null   object\n",
+      " 2   overview  4806 non-null   object\n",
+      " 3   genres    4809 non-null   object\n",
+      " 4   keywords  4809 non-null   object\n",
+      " 5   cast      4809 non-null   object\n",
+      " 6   crew      4809 non-null   object\n",
+      "dtypes: int64(1), object(6)\n",
+      "memory usage: 263.1+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "movies.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies.dropna(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 54,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "movies.duplicated().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"name\": \"Fantasy\"}, {\"id\": 878, \"name\": \"Science Fiction\"}]'"
+      ]
+     },
+     "execution_count": 55,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "movies.iloc[0].genres"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast \n",
+    "def conversion(obj):\n",
+    "    l = []\n",
+    "    for i in ast.literal_eval(obj):\n",
+    "        l.append(i['name'])\n",
+    "    return l\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies['genres'] =  movies['genres'].apply(conversion)\n",
+    "movies['keywords'] =  movies['keywords'].apply(conversion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for cast, we need to get top 3 actors\n",
+    "def conversion(obj):\n",
+    "    l = []\n",
+    "    counter = 0\n",
+    "    for i in ast.literal_eval(obj):\n",
+    "        if counter != 3:\n",
+    "            l.append(i['name'])\n",
+    "            counter += 1\n",
+    "        else:\n",
+    "            break\n",
+    "    return l"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies['cast'] =  movies['cast'].apply(conversion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fetch_director(obj):\n",
+    "    l = []\n",
+    "    for i in ast.literal_eval(obj):\n",
+    "        if i['job'] == 'Director':    \n",
+    "            l.append(i['name'])\n",
+    "            break\n",
+    "    return l"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies['crew'] = movies['crew'].apply(fetch_director)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# splitting overview into a list of words\n",
+    "movies['overview'] = movies['overview'].apply(lambda x: x.split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>movie_id</th>\n",
+       "      <th>title</th>\n",
+       "      <th>overview</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>keywords</th>\n",
+       "      <th>cast</th>\n",
+       "      <th>crew</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>19995</td>\n",
+       "      <td>Avatar</td>\n",
+       "      <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
+       "      <td>[Action, Adventure, Fantasy, Science Fiction]</td>\n",
+       "      <td>[culture clash, future, space war, space colon...</td>\n",
+       "      <td>[Sam Worthington, Zoe Saldana, Sigourney Weaver]</td>\n",
+       "      <td>[James Cameron]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   movie_id   title                                           overview  \\\n",
+       "0     19995  Avatar  [In, the, 22nd, century,, a, paraplegic, Marin...   \n",
+       "\n",
+       "                                          genres  \\\n",
+       "0  [Action, Adventure, Fantasy, Science Fiction]   \n",
+       "\n",
+       "                                            keywords  \\\n",
+       "0  [culture clash, future, space war, space colon...   \n",
+       "\n",
+       "                                               cast             crew  \n",
+       "0  [Sam Worthington, Zoe Saldana, Sigourney Weaver]  [James Cameron]  "
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "movies.head(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies['genres'] = movies['genres'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n",
+    "movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n",
+    "movies['cast'] = movies['cast'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n",
+    "movies['crew'] = movies['crew'].apply(lambda x: [i.replace(\" \", \"\") for i in x])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>movie_id</th>\n",
+       "      <th>title</th>\n",
+       "      <th>overview</th>\n",
+       "      <th>genres</th>\n",
+       "      <th>keywords</th>\n",
+       "      <th>cast</th>\n",
+       "      <th>crew</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>19995</td>\n",
+       "      <td>Avatar</td>\n",
+       "      <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
+       "      <td>[Action, Adventure, Fantasy, ScienceFiction]</td>\n",
+       "      <td>[cultureclash, future, spacewar, spacecolony, ...</td>\n",
+       "      <td>[SamWorthington, ZoeSaldana, SigourneyWeaver]</td>\n",
+       "      <td>[JamesCameron]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   movie_id   title                                           overview  \\\n",
+       "0     19995  Avatar  [In, the, 22nd, century,, a, paraplegic, Marin...   \n",
+       "\n",
+       "                                         genres  \\\n",
+       "0  [Action, Adventure, Fantasy, ScienceFiction]   \n",
+       "\n",
+       "                                            keywords  \\\n",
+       "0  [cultureclash, future, spacewar, spacecolony, ...   \n",
+       "\n",
+       "                                            cast            crew  \n",
+       "0  [SamWorthington, ZoeSaldana, SigourneyWeaver]  [JamesCameron]  "
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "movies.head(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']\n",
+    "df = movies[['movie_id', 'title', 'tags']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>movie_id</th>\n",
+       "      <th>title</th>\n",
+       "      <th>tags</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>19995</td>\n",
+       "      <td>Avatar</td>\n",
+       "      <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   movie_id   title                                               tags\n",
+       "0     19995  Avatar  [In, the, 22nd, century,, a, paraplegic, Marin..."
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/62/rv55r95d3xx3npwdjhz_3nbh0000gn/T/ipykernel_1863/949442192.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df['tags'] = df['tags'].apply(lambda x: \" \".join(x))\n"
+     ]
+    }
+   ],
+   "source": [
+    "df['tags'] = df['tags'].apply(lambda x: \" \".join(x))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.iloc[0].tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/62/rv55r95d3xx3npwdjhz_3nbh0000gn/T/ipykernel_1863/670192424.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df['tags'] = df['tags'].apply(lambda x: x.lower())\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Converting into lower case\n",
+    "df['tags'] = df['tags'].apply(lambda x: x.lower())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "cv = CountVectorizer(max_features=5000, stop_words='english')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4806, 5000)"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cv.fit_transform(df['tags']).toarray().shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectors = cv.fit_transform(df['tags']).toarray()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],\n",
+       "      dtype=object)"
+      ]
+     },
+     "execution_count": 92,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cv.get_feature_names_out()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# changing the verbs to their root form\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "ps = PorterStemmer()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def stem(text):\n",
+    "    y = []\n",
+    "    for i in text.split():\n",
+    "        y.append(ps.stem(i))  # ps.stem('loved') -> love\n",
+    "    return \" \".join(y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/62/rv55r95d3xx3npwdjhz_3nbh0000gn/T/ipykernel_1863/866399325.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df['tags'] = df['tags'].apply(stem)\n"
+     ]
+    }
+   ],
+   "source": [
+    "df['tags'] = df['tags'].apply(stem)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics.pairwise import cosine_similarity"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4806, 4806)"
+      ]
+     },
+     "execution_count": 94,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cosine_similarity(vectors).shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "similarity = cosine_similarity(vectors)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def recommend(movie):\n",
+    "    movie_index = df[df['title'] == movie].index[0]\n",
+    "    distances = similarity[movie_index]\n",
+    "    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6] # enumerate preserves each movie's index; the lambda sorts the list by the 2nd element (the similarity score)\n",
+    "    for i in movies_list:\n",
+    "        print(df.iloc[i[0]].title)  # i[0] is the index of the movie"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Despicable Me 2\n",
+      "The Croods\n",
+      "Penguins of Madagascar\n",
+      "Batman\n",
+      "Cars 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "recommend('Minions')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle.dump(df.to_dict(), open('movies.pkl', 'wb')) # open in write binary mode"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 119,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pickle.dump(similarity, open('similarity.pkl', 'wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ML",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}