Spaces:
Sleeping
Sleeping
movie recommendation
Browse files- Home.py +70 -0
- README.md +2 -2
- data/movies.csv +0 -0
- data/ratings.csv +0 -0
- pages/1_Dataset_Analysis.py +59 -0
- requirements.txt +59 -0
- utils.py +250 -0
Home.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st

from utils import Collaborative_filtering, Content_based_filtering

st.title("MovieLens Recommender System")
st.caption(
    "MovieLens is a recommender system that was developed by GroupLens, a computer science research lab at the University of Minnesota. It recommends movies to its users based on their movie ratings. It is also a dataset that is widely used in research and teaching contexts. [link](https://grouplens.org/datasets/movielens/)")


# One recommender instance per strategy; both load the same MovieLens data.
rec_1 = Collaborative_filtering()
rec_2 = Content_based_filtering()

movie_title = st.selectbox("Select a movie you like 💖",
                           options=rec_1.movies.title.to_list())

tab_1, tab_2, tab_3, tab_4 = st.tabs(
    ["Collaborative Filtering", "Content-based Filtering", "Matrix Factorization", "Content-based Filtering from Feedback"])


with tab_1:
    st.caption("""I'm going to use a technique called collaborative filtering to generate recommendations for you. This technique is based on the premise that similar people like similar things. The beauty of collaborative filtering is that it doesn't require any information about the users or the movies to generate recommendations.""")

    st.caption(
        f"🎬 You selected `{movie_title}` so I would recommend the following")

    for movie in rec_1.find_similar_movies(movie_title):
        st.code("🍿 " + movie)


with tab_2:
    st.caption("""The **cold start problem** is when there are new users and movies in our matrix that do not have any ratings. How do we handle the cold-start problem.

Collaborative filtering relies solely on user-item interactions within the utility matrix. The issue with this approach is that brand new users or items with no interactions get excluded from the recommendation system. This is called the **cold start problem**. Content-based filtering is a way to handle this problem by generating recommendations based on user and item features.""")

    st.caption(
        f"🎬 You selected `{movie_title}` so I would recommend the following")

    for movie in rec_2.find_similar_movies(movie_title):
        st.code("🍿 " + movie)


with tab_3:
    st.caption("""Matrix factorization (MF) is a linear algebra technique that can help us discover latent features underlying the interactions between users and movies. These latent features give a more compact representation of user tastes and item descriptions. MF is particularly useful for very sparse data and can enhance the quality of recommendations. The algorithm works by factorizing the original user-item matrix into two factor matrices:

- user-factor matrix (n_users, k)
- item-factor matrix (k, n_items)""")
    st.caption(
        f"🎬 You selected `{movie_title}` so I would recommend the following")
    for movie in rec_1.find_similar_movies(movie_title, use_matrix_factorization=True):
        st.code("🍿 " + movie)


with tab_4:
    st.caption("The **cold start problem** is when there are new users and movies in our matrix that do not have any ratings. Content-based filtering uses user-item features to recommend other items similar to what the user likes, based on their previous actions or explicit feedback.")
    col_1, col_2 = st.columns(spec=[25, 75])

    with col_1:
        # Pre-fill the genre checkboxes from the selected movie's feature row.
        movie_id = rec_2.movie_id_title_invmap[movie_title]
        movie_index = rec_2.movie_id_index_map[movie_id]
        row = rec_2.user_feature_matrix.iloc[movie_index]
        options = []
        st.caption("**Feedback**")
        for k, v in row.items():
            options.append(st.checkbox(label=k, value=bool(v)))

    with col_2:
        # The checkbox states form the user's genre-preference vector.
        vector = list(options)
        st.caption("**Movie Recommendation based on feedback**")
        for movie in rec_2.find_similar_movies_based_on_feedback(vector):
            st.code("🍿 " + movie)
|
README.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
---
|
| 2 |
title: Movie Recommender System
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: purple
|
| 6 |
sdk: streamlit
|
| 7 |
sdk_version: 1.25.0
|
| 8 |
-
app_file:
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
|
|
|
| 1 |
---
|
| 2 |
title: Movie Recommender System
|
| 3 |
+
emoji: 🍿🎬
|
| 4 |
colorFrom: gray
|
| 5 |
colorTo: purple
|
| 6 |
sdk: streamlit
|
| 7 |
sdk_version: 1.25.0
|
| 8 |
+
app_file: Home.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
data/movies.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/ratings.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pages/1_Dataset_Analysis.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st

from utils import DataAnalysis


analysis = DataAnalysis()


st.title('MovieLens Dataset Analysis')

# Headline dataset statistics.
st.caption(
    f'👉 Number of unique movies in the dataset # :orange[{analysis.ratings.movieId.nunique()}] | 👉 Number of unique users in the dataset # :orange[{analysis.ratings.userId.nunique()}]')
st.caption(
    f'👉 Average Number of ratings per user # :orange[{analysis.ratings.shape[0] / analysis.ratings.userId.nunique():0.2f}] | 👉 Average Number of ratings per movie # :orange[{analysis.ratings.shape[0] / analysis.ratings.movieId.nunique():0.2f}]')


st.header('Distribution of Ratings in the dataset')
rating_count, rating_kde, rating_ecdf, rating_scatter = st.tabs(
    ['Ratings countplot', 'Ratings kdeplot', 'Ratings ecdfplot', 'Rating scatterplot'])
with rating_count:
    st.pyplot(fig=analysis.ratings_countplot(), use_container_width=True)
with rating_kde:
    st.pyplot(fig=analysis.ratings_kdeplot(), use_container_width=True)
with rating_ecdf:
    st.pyplot(fig=analysis.ratings_ecdfplot(), use_container_width=True)
with rating_scatter:
    st.pyplot(fig=analysis.rating_scatterplot(), use_container_width=True)


st.header("Which movies are most frequently rated?")
top_k_rated_movies = st.number_input(
    label="Top k rated movies", min_value=10, max_value=50, value=15, step=5)
st.bar_chart(data=analysis.most_rated_movie(top_k=top_k_rated_movies),
             x='Number of Ratings', y='Movie Title')


st.header("Which movie has the lowest and highest average rating?")
st.caption('Click the header for sorting')
st.write(analysis.rating_stats())


st.markdown('''
`Gypsy` is one of the movies with the lowest average rating, but only one person rated it.

Similarly `Lamerica` may be one of the "highest" rated movies, but it only has 2 ratings.

A better approach for evaluating movie popularity is to look at the [Bayesian average](https://en.wikipedia.org/wiki/Bayesian_average).
''')

st.write(analysis.ratings_bayesian_avg())
st.markdown("Using the Bayesian average, we see that `Shawshank Redemption`, `The Godfather`, and `The Usual Suspects` are the most highly rated movies. This result makes much more sense since these movies are critically acclaimed films.")


st.header("How many movie genres are there?")
st.pyplot(fig=analysis.genres_count())
|
requirements.txt
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair==5.0.1
|
| 2 |
+
attrs==23.1.0
|
| 3 |
+
blinker==1.6.2
|
| 4 |
+
cachetools==5.3.1
|
| 5 |
+
certifi==2023.7.22
|
| 6 |
+
charset-normalizer==3.2.0
|
| 7 |
+
click==8.1.7
|
| 8 |
+
colorama==0.4.6
|
| 9 |
+
contourpy==1.1.0
|
| 10 |
+
cycler==0.11.0
|
| 11 |
+
fonttools==4.42.1
|
| 12 |
+
gitdb==4.0.10
|
| 13 |
+
GitPython==3.1.32
|
| 14 |
+
idna==3.4
|
| 15 |
+
importlib-metadata==6.8.0
|
| 16 |
+
Jinja2==3.1.2
|
| 17 |
+
joblib==1.3.2
|
| 18 |
+
jsonschema==4.19.0
|
| 19 |
+
jsonschema-specifications==2023.7.1
|
| 20 |
+
kiwisolver==1.4.4
|
| 21 |
+
markdown-it-py==3.0.0
|
| 22 |
+
MarkupSafe==2.1.3
|
| 23 |
+
matplotlib==3.7.2
|
| 24 |
+
mdurl==0.1.2
|
| 25 |
+
numpy==1.25.2
|
| 26 |
+
packaging==23.1
|
| 27 |
+
pandas==2.0.3
|
| 28 |
+
Pillow==9.5.0
|
| 29 |
+
protobuf==4.24.1
|
| 30 |
+
pyarrow==12.0.1
|
| 31 |
+
pydeck==0.8.0
|
| 32 |
+
Pygments==2.16.1
|
| 33 |
+
Pympler==1.0.1
|
| 34 |
+
pyparsing==3.0.9
|
| 35 |
+
python-dateutil==2.8.2
|
| 36 |
+
pytz==2023.3
|
| 37 |
+
pytz-deprecation-shim==0.1.0.post0
|
| 38 |
+
referencing==0.30.2
|
| 39 |
+
requests==2.31.0
|
| 40 |
+
rich==13.5.2
|
| 41 |
+
rpds-py==0.9.2
|
| 42 |
+
scikit-learn==1.3.0
|
| 43 |
+
scipy==1.11.2
|
| 44 |
+
seaborn==0.12.2
|
| 45 |
+
six==1.16.0
|
| 46 |
+
smmap==5.0.0
|
| 47 |
+
streamlit==1.25.0
|
| 48 |
+
tenacity==8.2.3
|
| 49 |
+
threadpoolctl==3.2.0
|
| 50 |
+
toml==0.10.2
|
| 51 |
+
toolz==0.12.0
|
| 52 |
+
tornado==6.3.3
|
| 53 |
+
typing_extensions==4.7.1
|
| 54 |
+
tzdata==2023.3
|
| 55 |
+
tzlocal==4.3.1
|
| 56 |
+
urllib3==2.0.4
|
| 57 |
+
validators==0.21.2
|
| 58 |
+
watchdog==3.0.0
|
| 59 |
+
zipp==3.16.2
|
utils.py
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
|
| 6 |
+
from sklearn.decomposition import TruncatedSVD
|
| 7 |
+
from sklearn.neighbors import NearestNeighbors
|
| 8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
from scipy.sparse import csr_matrix
|
| 10 |
+
|
| 11 |
+
from collections import Counter
|
| 12 |
+
from functools import cached_property
|
| 13 |
+
|
| 14 |
+
plt.style.use("fivethirtyeight")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class DataAnalysis:
    """Exploratory analysis helpers over the MovieLens movies/ratings CSVs.

    Each plotting method builds a fresh matplotlib figure and returns it so
    the caller (a Streamlit page) can render it with ``st.pyplot``.
    """

    def __init__(self) -> None:
        # Raw MovieLens tables, loaded relative to the app's working directory.
        self.movies: pd.DataFrame = pd.read_csv(
            './data/movies.csv')
        self.ratings: pd.DataFrame = pd.read_csv(
            './data/ratings.csv')

    def ratings_countplot(self,):
        """Return a countplot figure of the rating values."""
        fig, ax = plt.subplots(nrows=1, ncols=1)
        sns.countplot(data=self.ratings, x='rating', ax=ax)
        return fig

    def ratings_kdeplot(self,):
        """Return a kernel-density figure of the rating values."""
        fig, ax = plt.subplots(nrows=1, ncols=1)
        sns.kdeplot(data=self.ratings, x='rating', ax=ax)
        return fig

    def ratings_ecdfplot(self,):
        """Return an empirical-CDF figure of the rating values."""
        fig, ax = plt.subplots(nrows=1, ncols=1)
        sns.ecdfplot(data=self.ratings, x='rating', ax=ax)
        return fig

    def rating_scatterplot(self,):
        """Return a scatter of each user's mean rating plus a global-mean line."""
        fig, ax = plt.subplots(nrows=1, ncols=1)
        self.ratings[['userId', 'rating']].groupby(
            'userId').mean().plot(ls='', marker='.', ax=ax)
        # Horizontal reference line: the mean of the per-user mean ratings.
        ax.axhline(
            y=self.ratings[['userId', 'rating']].groupby(
                'userId').mean().mean().values.item(),
            color='red', alpha=0.5
        )
        ax.legend(['Mean user rating', 'Mean rating across users'])
        return fig

    def most_rated_movie(self, top_k=10):
        """Return the top_k movies by rating count as a two-column frame.

        Relies on pandas >= 2.0 (pinned in requirements.txt) naming the
        value_counts column 'count'.
        """
        data = (self.ratings.movieId.value_counts()
                .reset_index()
                .merge(right=self.movies[['movieId', 'title']], on='movieId')[['title', 'count']]
                .rename({'count': 'Number of Ratings', 'title': 'Movie Title'}, axis=1))
        return data.head(top_k)

    def rating_stats(self,):
        """Return per-movie average rating and rating count, with titles."""
        avg_movie_rating = (self.ratings[['movieId', 'rating']]
                            .groupby('movieId').agg(['mean', 'count'])
                            .droplevel(axis=1, level=0)
                            .reset_index(level=0))
        avg_movie_rating = avg_movie_rating.merge(
            self.movies[['movieId', 'title']], on='movieId')
        # The merge above only brings in movieId and title, so the 'genres'
        # mapper entry below is effectively unused here.
        avg_movie_rating = (avg_movie_rating
                            .rename(axis=1,
                                    mapper={'mean': 'Average Rating',
                                            'count': "Number of Rating",
                                            'title': 'Movie Title',
                                            'genres': 'Genres'}
                                    ))
        avg_movie_rating = avg_movie_rating.drop(columns='movieId')
        return avg_movie_rating

    def bayesian_avg(self, C: float, m: float):
        """Return an aggregator computing the Bayesian average of ratings.

        C is the prior weight (here the mean number of ratings per movie)
        and m the prior mean rating; the returned callable is passed to a
        groupby ``.agg()``.
        """
        return lambda rating: (C*m + rating.sum()) / (C + rating.count())

    def ratings_bayesian_avg(self,):
        """Return per-movie Bayesian-average ratings, merged with titles."""
        rating_agg = (self.ratings[['rating', 'movieId']]
                      .groupby('movieId').agg(['mean', 'count'])
                      .droplevel(axis=1, level=0)
                      .reset_index()
                      )
        # Prior weight and prior mean derived from the dataset itself.
        C = rating_agg['count'].mean()
        m = rating_agg['mean'].mean()
        bay_avg_fn = self.bayesian_avg(C=C, m=m)
        rating_bay_avg = (self.ratings[['rating', 'movieId']]
                          .groupby('movieId').agg([bay_avg_fn, 'count'])
                          ).droplevel(level=0, axis=1).reset_index(level=0)
        rating_bay_avg = rating_bay_avg.merge(
            self.movies[['title', 'movieId']], on='movieId')

        # pandas labels the lambda aggregation column '<lambda_0>'.
        rating_bay_avg = rating_bay_avg.rename({'<lambda_0>': 'Bayesian Average',
                                                'count': 'Number of ratings', 'title': 'Movie Title'}, axis=1)
        return rating_bay_avg.drop(columns=['movieId'])

    def genres_count(self,):
        """Return a barplot figure of how many movies carry each genre."""
        movie_genres = self.movies.copy()
        # Genres come pipe-separated, e.g. "Comedy|Romance".
        movie_genres.genres = self.movies.genres.str.split(pat='|')
        genre_counter = Counter(
            [genre for genres in movie_genres.genres for genre in genres])
        genre_counter_df = pd.DataFrame(
            data=dict(genre_counter.most_common()), index=['Count'])
        genre_counter_df.columns.name = "Genres"
        genre_counter_df = genre_counter_df.T.reset_index()

        fig, ax = plt.subplots(nrows=1, ncols=1)
        sns.barplot(data=genre_counter_df, x='Count', y='Genres', ax=ax)
        return fig
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class Recommender:
|
| 113 |
+
def __init__(self) -> None:
|
| 114 |
+
self.ratings: pd.DataFrame = pd.read_csv(
|
| 115 |
+
'./data/ratings.csv')
|
| 116 |
+
self.movies: pd.DataFrame = pd.read_csv(
|
| 117 |
+
'./data/movies.csv')
|
| 118 |
+
|
| 119 |
+
self.M: int = self.ratings.userId.nunique()
|
| 120 |
+
self.N: int = self.ratings.movieId.nunique()
|
| 121 |
+
|
| 122 |
+
self.ratings_userid_index_map = dict(
|
| 123 |
+
zip(self.ratings.userId.unique(), range(self.M)))
|
| 124 |
+
self.ratings_movieid_index_map = dict(
|
| 125 |
+
zip(self.ratings.movieId.unique(), range(self.N)))
|
| 126 |
+
self.ratings_userid_index_invmap = dict(
|
| 127 |
+
zip(range(self.M), self.ratings.userId.unique()))
|
| 128 |
+
self.ratings_movieid_index_invmap = dict(
|
| 129 |
+
zip(range(self.N), self.ratings.movieId.unique()))
|
| 130 |
+
|
| 131 |
+
self.movie_id_title_map = dict(
|
| 132 |
+
zip(self.movies.movieId, self.movies.title))
|
| 133 |
+
self.movie_id_title_invmap = dict(
|
| 134 |
+
zip(self.movies.title, self.movies.movieId))
|
| 135 |
+
self.movie_id_index_map = dict(
|
| 136 |
+
zip(self.movies.movieId, self.movies.index))
|
| 137 |
+
self.movie_id_index_invmap = dict(
|
| 138 |
+
zip(self.movies.index, self.movies.movieId))
|
| 139 |
+
|
| 140 |
+
def nearest_neighbors(self, matrix: np.ndarray | csr_matrix):
|
| 141 |
+
knn = NearestNeighbors(
|
| 142 |
+
n_neighbors=10, algorithm="brute", metric="cosine")
|
| 143 |
+
knn.fit(matrix)
|
| 144 |
+
return knn
|
| 145 |
+
|
| 146 |
+
def output_recommendation(self, search_id: int, similar_movies: np.ndarray, mapper_index_id: dict):
|
| 147 |
+
response = []
|
| 148 |
+
for i in similar_movies:
|
| 149 |
+
movie_id = mapper_index_id[i]
|
| 150 |
+
if movie_id != search_id:
|
| 151 |
+
response.append(self.movie_id_title_map[movie_id])
|
| 152 |
+
return response
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class Collaborative_filtering(Recommender):
    """Item-item collaborative filtering on the MovieLens utility matrix.

    Recommendations come from cosine similarity between the movie columns
    of the sparse user-item matrix, optionally compressed with truncated
    SVD (matrix factorization).
    """

    def __init__(self) -> None:
        # All state (dataframes and lookup tables) comes from Recommender;
        # the dead trailing `pass` was removed.
        super(Collaborative_filtering, self).__init__()

    @cached_property
    def user_item_matrix(self,) -> csr_matrix:
        """Sparse (n_users, n_movies) ratings matrix, built once and cached."""
        user_index = [self.ratings_userid_index_map[id]
                      for id in self.ratings.userId]
        movie_index = [self.ratings_movieid_index_map[id]
                       for id in self.ratings.movieId]

        user_item_matrix = csr_matrix(
            (self.ratings.rating, (user_index, movie_index)), shape=(self.M, self.N))
        return user_item_matrix

    @cached_property
    def matrix_factorization(self,) -> np.ndarray:
        """Dense (n_movies, 20) item-factor matrix from truncated SVD."""
        svd = TruncatedSVD(n_components=20, n_iter=10, random_state=42)
        Q = svd.fit_transform(self.user_item_matrix.T)
        return Q

    def find_similar_movies(self, title: str, k: int = 11, use_matrix_factorization=False) -> list[str]:
        """Return titles of movies most similar to *title*.

        Args:
            title: exact movie title as it appears in the movies table.
            k: number of neighbors fetched; the queried movie itself is
                filtered out afterwards, so up to k-1 titles are returned.
            use_matrix_factorization: query the SVD item factors instead of
                the raw sparse matrix.
        """
        search_id: int = self.movie_id_title_invmap[title]
        movie_index: int = self.ratings_movieid_index_map[search_id]

        if use_matrix_factorization:
            matrix: np.ndarray = self.matrix_factorization
        else:
            matrix: csr_matrix = self.user_item_matrix.T

        movie_vector = matrix[movie_index]

        # A dense row comes back 1-D; KNN expects a 2-D (1, n_features) query.
        if isinstance(movie_vector, np.ndarray):
            movie_vector = movie_vector.reshape((1, -1))

        # NOTE(review): the KNN index is refit on every call; caching one
        # index per matrix choice would speed up repeated queries.
        knn = self.nearest_neighbors(matrix=matrix)
        neighbors: np.ndarray = knn.kneighbors(
            movie_vector, n_neighbors=k, return_distance=False)

        response = self.output_recommendation(
            search_id=search_id,
            similar_movies=neighbors[0],
            mapper_index_id=self.ratings_movieid_index_invmap)
        return response
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
class Content_based_filtering(Recommender):
    """Content-based recommender built from one-hot encoded movie genres.

    Addresses the cold-start case: similarity only needs item features,
    not prior user-item interactions.
    """

    def __init__(self) -> None:
        super(Content_based_filtering, self).__init__()

    @cached_property
    def user_feature_matrix(self,):
        """One-hot (n_movies, n_genres) genre matrix, built once and cached."""
        movie_genres = self.movies.copy()
        # Genres come pipe-separated, e.g. "Comedy|Romance".
        movie_genres.genres = self.movies.genres.str.split(pat='|')
        genres = {
            genre_ for genres_ in movie_genres.genres for genre_ in genres_}
        for genre in genres:
            # 1 if the movie carries this genre, else 0.
            movie_genres[genre] = movie_genres.genres.transform(
                lambda x: int(genre in x))
        user_feature_matrix = movie_genres.drop(
            columns=['movieId', 'title', 'genres'])
        return user_feature_matrix

    @cached_property
    def cosine_similarity(self):
        """Pairwise (n_movies, n_movies) cosine similarity over genre vectors.

        Note: this property deliberately reuses the name of the sklearn
        function it calls; inside the method the bare name still resolves
        to the module-level import.
        """
        user_feature_matrix = self.user_feature_matrix
        similarity_matrix = cosine_similarity(
            user_feature_matrix, user_feature_matrix)
        return similarity_matrix

    def find_similar_movies(self, title: str, k: int = 11) -> list[str]:
        """Return titles of up to k-1 movies whose genres best match *title*.

        The movie ranks first in its own similarity row, so it is fetched
        and then filtered out by output_recommendation.
        """
        search_id: int = self.movie_id_title_invmap[title]
        search_index: int = self.movie_id_index_map[search_id]

        scores: np.ndarray = self.cosine_similarity[search_index]
        ranked: list[tuple[int, float]] = sorted(
            zip(self.movies.index, scores), key=lambda x: x[1], reverse=True)
        neighbors: list[int] = [item[0] for item in ranked[:k]]
        response = self.output_recommendation(
            search_id=search_id,
            similar_movies=neighbors,
            mapper_index_id=self.movie_id_index_invmap)
        return response

    def find_similar_movies_based_on_feedback(self, vector: list[bool], k: int = 11) -> list[str]:
        """Return titles matching a user-supplied genre-preference vector.

        Args:
            vector: checkbox states, one per genre column of
                user_feature_matrix, in the same column order.
            k: number of nearest neighbors to fetch.
        """
        feedback_vector = np.array(vector, dtype=int).reshape((1, -1))
        # NOTE(review): the KNN index is refit on every call; it could be
        # cached since user_feature_matrix is fixed.
        knn = self.nearest_neighbors(matrix=self.user_feature_matrix)
        neighbors: np.ndarray = knn.kneighbors(
            feedback_vector, n_neighbors=k, return_distance=False)
        # search_id=-1 is a sentinel matching no movie id, so no neighbor
        # is filtered out.
        response = self.output_recommendation(
            search_id=-1,
            similar_movies=neighbors[0],
            mapper_index_id=self.movie_id_index_invmap)
        return response
|