Movie-Recommender-System / pages /1_Dataset_Analysis.py
Ajay-user's picture
movie recommendation
f12e569
import streamlit as st
from utils import DataAnalysis
# Build the analysis helper for this page run; the code below reads its
# `ratings` table and calls its plotting/statistics methods.
# NOTE(review): Streamlit re-executes the script on every widget interaction,
# so DataAnalysis() is rebuilt each time -- consider caching it if its
# construction turns out to be expensive.
analysis = DataAnalysis()

st.title('MovieLens Dataset Analysis')

# U+1F449 (backhand index pointing right), used as the bullet in the captions.
# Written as an escape so it survives tools that mis-decode the file; the
# literal character had degraded into mojibake in the caption text.
POINTER = '\U0001F449'

# Headline numbers, computed once and shared by both captions.
n_ratings = analysis.ratings.shape[0]
n_movies = analysis.ratings.movieId.nunique()
n_users = analysis.ratings.userId.nunique()

# Guard the averages so an empty ratings table shows 0.00 instead of failing
# on a division by zero.
ratings_per_user = n_ratings / n_users if n_users else 0.0
ratings_per_movie = n_ratings / n_movies if n_movies else 0.0

st.caption(
    f'{POINTER} Number of unique movies in the dataset # :orange[{n_movies}]'
    f' | {POINTER} Number of unique users in the dataset # :orange[{n_users}]')
st.caption(
    f'{POINTER} Average Number of ratings per user # :orange[{ratings_per_user:0.2f}]'
    f' | {POINTER} Average Number of ratings per movie # :orange[{ratings_per_movie:0.2f}]')
st.header('Distribution of Ratings in the dataset')

# One tab per view of the rating distribution.
rating_count, rating_kde, rating_ecdf, rating_scatter = st.tabs([
    'Ratings countplot',
    'Ratings kdeplot',
    'Ratings ecdfplot',
    'Rating scatterplot',
])

# Pair each tab with the DataAnalysis method that produces its figure. The
# method is only called once its tab context has been entered, in tab order.
rating_views = (
    (rating_count, analysis.ratings_countplot),
    (rating_kde, analysis.ratings_kdeplot),
    (rating_ecdf, analysis.ratings_ecdfplot),
    (rating_scatter, analysis.rating_scatterplot),
)
for tab, make_figure in rating_views:
    with tab:
        st.pyplot(fig=make_figure(), use_container_width=True)
st.header("Which movies are most frequently rated?")

# Let the user choose how many movies to show: 10 to 50 in steps of 5,
# starting at 15.
top_k_rated_movies = st.number_input(
    label="Top k rated movies",
    min_value=10,
    max_value=50,
    value=15,
    step=5,
)

# The chart reads the 'Number of Ratings' and 'Movie Title' columns of the
# data returned by most_rated_movie().
most_rated = analysis.most_rated_movie(top_k=top_k_rated_movies)
st.bar_chart(data=most_rated, x='Number of Ratings', y='Movie Title')
st.header("Which movie has the lowest and highest average rating?")
st.caption('Click the header for sorting')

# NOTE(review): neither column is populated in the visible part of this
# script; the layout row is kept so the page renders exactly as before --
# confirm whether it can be dropped.
rating_config, rating_data = st.columns([25, 75])

# Presumably a per-movie table of rating statistics (the caption above
# suggests it is sortable by column) -- verify against DataAnalysis.
st.write(analysis.rating_stats())

# Explain why a plain average misleads when a movie has very few ratings.
st.markdown('''
`Gypsy` is one of the movies with the lowest average rating, but only one person rated it.
Similarly, `Lamerica` may be one of the "highest" rated movies, but it only has 2 ratings.
A better approach for evaluating movie popularity is to look at the [Bayesian average](https://en.wikipedia.org/wiki/Bayesian_average).
''')

st.write(analysis.ratings_bayesian_avg())
st.markdown("Using the Bayesian average, we see that `Shawshank Redemption`, `The Godfather`, and `The Usual Suspects` are the most highly rated movies. This result makes much more sense since these movies are critically acclaimed films.")
st.header("How many movie genres are there?")

# Render the figure returned by genres_count() at its default width.
genres_figure = analysis.genres_count()
st.pyplot(fig=genres_figure)