import streamlit as st
from utils import DataAnalysis


# Single analysis helper shared by every section of this page.
analysis = DataAnalysis()

st.title('MovieLens Dataset Analysis')

# Headline numbers, computed once and reused by both captions below.
ratings = analysis.ratings
n_ratings = ratings.shape[0]
n_movies = ratings.movieId.nunique()
n_users = ratings.userId.nunique()
ratings_per_user = n_ratings / n_users
ratings_per_movie = n_ratings / n_movies

# Dataset size: distinct movies and distinct users.
st.caption(
    f'👉 Number of unique movies in the dataset # :orange[{n_movies}] |'
    f'               👉 Number of unique users in the dataset # :orange[{n_users}]'
)
# Rating density: mean number of ratings per user and per movie (2 decimals).
st.caption(
    f'👉 Average Number of ratings per user # :orange[{ratings_per_user:0.2f}] | '
    f'👉 Average Number of ratings per movie # :orange[{ratings_per_movie:0.2f}]'
)


st.header('Distribution of Ratings in the dataset')

# Tab label -> bound method that builds the figure shown inside that tab.
# Insertion order fixes both the tab order and the order the plots are built.
rating_plots = {
    'Ratings countplot': analysis.ratings_countplot,
    'Ratings kdeplot': analysis.ratings_kdeplot,
    'Ratings ecdfplot': analysis.ratings_ecdfplot,
    'Rating scatterplot': analysis.rating_scatterplot,
}
rating_tabs = st.tabs(list(rating_plots))
for tab, build_figure in zip(rating_tabs, rating_plots.values()):
    with tab:
        st.pyplot(fig=build_figure(), use_container_width=True)


st.header("Which movies are most frequently rated?")

# Viewer picks k between 10 and 50 in steps of 5; the page starts at 15.
top_k_rated_movies = st.number_input(
    label="Top k rated movies",
    min_value=10,
    max_value=50,
    value=15,
    step=5,
)

# Fetch the top-k data, then chart it using the two named columns.
most_rated = analysis.most_rated_movie(top_k=top_k_rated_movies)
st.bar_chart(
    data=most_rated,
    x='Number of Ratings',
    y='Movie Title',
)


st.header("Which movie has the lowest and highest average rating?")
st.caption('Click the header for sorting')
# Render the result of rating_stats() directly in the main page flow.
# No st.columns layout is created here: nothing is placed inside columns in
# this section, so a column pair would only add an empty row to the page.
st.write(analysis.rating_stats())


# Explain why raw average ratings mislead when a movie has very few ratings,
# and point the reader to the Bayesian average as the alternative.
st.markdown('''
`Gypsy` is one of the movies with the lowest average rating, but only one person rated it.

Similarly, `Lamerica` may be one of the "highest" rated movies, but it only has 2 ratings.

A better approach for evaluating movie popularity is to look at the [Bayesian average](https://en.wikipedia.org/wiki/Bayesian_average).
''')

# Show the Bayesian-average results, followed by a short interpretation.
st.write(analysis.ratings_bayesian_avg())
st.markdown(
    "Using the Bayesian average, we see that `Shawshank Redemption`, "
    "`The Godfather`, and `The Usual Suspects` are the most highly rated movies. "
    "This result makes much more sense since these movies are critically acclaimed films."
)


st.header("How many movie genres are there?")
# Build the genre-count figure first, then hand it to Streamlit for rendering.
genres_fig = analysis.genres_count()
st.pyplot(fig=genres_fig)