"""Streamlit page showing exploratory analysis of the MovieLens dataset.

The page renders, top to bottom: headline dataset counts, the rating
distribution (four tabbed plots), the most frequently rated movies, raw and
Bayesian-average rating tables, and a genre count plot. All figures and
tables are produced by ``utils.DataAnalysis``; this script only lays them out.
"""
import streamlit as st

from utils import DataAnalysis

analysis = DataAnalysis()

st.title('MovieLens Dataset Analysis')

# --- Headline numbers -------------------------------------------------------
# Computed once and reused by both captions below.
# Assumes `analysis.ratings` is a pandas DataFrame with `movieId` and `userId`
# columns -- TODO confirm against utils.DataAnalysis.
n_ratings = analysis.ratings.shape[0]
n_movies = analysis.ratings.movieId.nunique()
n_users = analysis.ratings.userId.nunique()

st.caption(
    f'👉 Number of unique movies in the dataset # :orange[{n_movies}] | '
    f'👉 Number of unique users in the dataset # :orange[{n_users}]'
)
st.caption(
    f'👉 Average Number of ratings per user # '
    f':orange[{n_ratings / n_users:0.2f}] | '
    f'👉 Average Number of ratings per movie # '
    f':orange[{n_ratings / n_movies:0.2f}]'
)

# --- Rating distribution ----------------------------------------------------
st.header('Distribution of Ratings in the dataset')

# Tab label -> method that builds the figure shown in that tab. Insertion
# order is the tab order; each figure is built inside its own tab context.
rating_plots = {
    'Ratings countplot': analysis.ratings_countplot,
    'Ratings kdeplot': analysis.ratings_kdeplot,
    'Ratings ecdfplot': analysis.ratings_ecdfplot,
    'Rating scatterplot': analysis.rating_scatterplot,
}
for tab, make_figure in zip(st.tabs(list(rating_plots)),
                            rating_plots.values()):
    with tab:
        st.pyplot(fig=make_figure(), use_container_width=True)

# --- Most frequently rated movies -------------------------------------------
st.header("Which movies are most frequently rated?")
top_k_rated_movies = st.number_input(
    label="Top k rated movies", min_value=10, max_value=50, value=15, step=5)
# The column names are expected to exist in the frame returned by
# `most_rated_movie` (defined in utils, not visible here).
st.bar_chart(
    data=analysis.most_rated_movie(top_k=top_k_rated_movies),
    x='Number of Ratings',
    y='Movie Title',
)

# --- Lowest / highest average rating ----------------------------------------
st.header("Which movie has the lowest and highest average rating?")
st.caption('Click the header for sorting')
# NOTE(review): neither column is populated anywhere below. The call is kept
# so the rendered layout stays exactly as before; remove it (or move the
# table into `rating_data`) once the intended layout is decided.
rating_config, rating_data = st.columns([25, 75])
st.write(analysis.rating_stats())
st.markdown('''
`Gypsy` is one of the movies with the lowest average rating, but only one person rated it.
Similarly, `Lamerica` may be one of the "highest" rated movies, but it only has 2 ratings.
A better approach for evaluating movie popularity is to look at the
[Bayesian average](https://en.wikipedia.org/wiki/Bayesian_average).
''')
st.write(analysis.ratings_bayesian_avg())
st.markdown(
    "Using the Bayesian average, we see that `Shawshank Redemption`, "
    "`The Godfather`, and `The Usual Suspects` are the most highly rated "
    "movies. This result makes much more sense since these movies are "
    "critically acclaimed films."
)

# --- Genres -----------------------------------------------------------------
st.header("How many movie genres are there?")
st.pyplot(fig=analysis.genres_count())