# Spaces: notbeekay / Milestone2_Deployment - Sleeping - File size: 10,779 Bytes - 047a1eb

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image

def _add_spacing(lines=3):
    """Write `lines` empty Streamlit elements to pad the page vertically."""
    for _ in range(lines):
        st.write('')


def _mean_score_extremes(df, column):
    """Find the groups of `column` with the highest and lowest mean score.

    Args:
        df: DataFrame holding `column` and a numeric 'score' column.
        column: Name of the column to group by.

    Returns:
        ``((best_label, best_mean), (worst_label, worst_mean))``, or ``None``
        when no group has a usable (non-NaN) mean score.
    """
    means = df.groupby(column)['score'].mean().dropna()
    if means.empty:
        return None
    # The labels must come from the grouped means themselves: a group's mean
    # generally does not equal any single movie's score, so looking the value
    # up in the raw rows either fails or returns an unrelated label.
    return (
        (means.idxmax(), means.max()),
        (means.idxmin(), means.min()),
    )


def _show_score_scatter(df, x, x_label):
    """Plot 'score' against column `x` with a red OLS trendline.

    Args:
        df: DataFrame holding 'name', 'score' and the column named by `x`.
        x: Column plotted on the x axis.
        x_label: Human-readable axis label, also used in the chart title.
    """
    fig = px.scatter(
        df,
        x=x,
        y='score',
        hover_data=['name', 'score', x],  # shown when hovering a data point
        labels={x: x_label, 'score': 'IMDb Score'},
        title=f'IMDb Score vs {x_label}',
        trendline='ols',  # add regression line
        trendline_color_override='red',
    )
    st.plotly_chart(fig)


def run():
    """Render the exploratory data analysis (EDA) page of the Streamlit app.

    Reads 'imdb.jpeg' and 'movies.csv' from the current working directory.
    The CSV is used through the columns name, director, writer, genre, star,
    country, company, score, gross, runtime and budget.

    The page shows, in order: a header with an image, the raw dataset, a bar
    chart of the top-N mean scores for a user-selected column (plus the best
    and worst entries for that column), and scatter plots of score against
    gross revenue, runtime and a user-filtered budget range.
    """
    # Page header
    st.title('IMDb Movie Score Prediction')
    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')

    # The context manager closes the image file once Streamlit has rendered it.
    with Image.open('imdb.jpeg') as image:
        st.image(image, caption = 'This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')

    st.write('This page is written by Brenda')

    # Horizontal rule followed by one spacing line
    st.markdown('---')
    st.write('')

    # Load and show dataframe
    df = pd.read_csv('movies.csv')
    st.write('### This is our dataset of previous movies:')
    st.dataframe(df)
    _add_spacing()

    # Bar plot of the top-N mean scores, grouped by a user-selected column
    st.write('### Top N Movies With Highest Scores Based on User Input')
    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name','director', 'writer', 'genre', 'star', 'country', 'company'))
    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))

    mean_scores = df.groupby(option)['score'].mean().sort_values(ascending=False)
    top_n_df = mean_scores.head(top_n).reset_index()
    top_n_df.columns = [option, 'mean_score']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
    ax.set_xlabel(option.capitalize())
    ax.set_ylabel('Mean Score')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates registered until it is closed;
    # release this one now that it has been rendered.
    plt.close(fig)

    # Row of the single highest-scored movie; reused by the sections below.
    best_idx = df['score'].idxmax()
    max_score = df.loc[best_idx, 'score']
    movie_with_max_score = df.loc[best_idx, 'name']

    # Additional information: best/worst entry for the selected column
    if option == 'name':
        # For movies the individual scores are reported, not grouped means.
        worst_idx = df['score'].idxmin()
        min_score = df.loc[worst_idx, 'score']
        movie_with_min_score = df.loc[worst_idx, 'name']
        st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}**.")
        st.write(f"The movie with the lowest IMDb score is: **{movie_with_min_score}** with a score of **{min_score}**.")
    else:
        extremes = _mean_score_extremes(df, option)
        if extremes is None:
            st.write(f"No {option} found with the highest mean score.")
            st.write(f"No {option} found with the lowest mean score.")
        else:
            (best_label, best_mean), (worst_label, worst_mean) = extremes
            st.write(f"The {option} with the highest mean IMDb score is: **{best_label}** with a score of **{best_mean}**.")
            st.write(f"The {option} with the lowest mean IMDb score is: **{worst_label}** with a score of **{worst_mean}**.")
    _add_spacing()

    # Scatter plot with regression line: IMDb score vs gross revenue
    st.write('### IMDb Score vs Gross Revenue')
    _show_score_scatter(df, 'gross', 'Gross Revenue')

    # Additional information: gross revenue vs IMDb score
    movie_with_max_score_gross = df.loc[best_idx, 'gross']
    top_gross_idx = df['gross'].idxmax()
    max_gross = df.loc[top_gross_idx, 'gross']
    movie_with_max_gross = df.loc[top_gross_idx, 'name']
    movie_with_max_gross_score = df.loc[top_gross_idx, 'score']
    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and gross revenue of **${movie_with_max_score_gross}**.")
    st.write(f"The movie with the highest gross is: **{movie_with_max_gross}** with a score of **{movie_with_max_gross_score}** and gross revenue of **${max_gross}**.")
    _add_spacing()

    # Scatter plot with regression line: IMDb score vs runtime
    st.write('### IMDb Score vs Movie Runtime')
    _show_score_scatter(df, 'runtime', 'Runtime')

    # Additional information: runtime vs IMDb score
    movie_with_max_score_runtime = df.loc[best_idx, 'runtime']
    top_runtime_idx = df['runtime'].idxmax()
    max_runtime = df.loc[top_runtime_idx, 'runtime']
    movie_with_max_runtime = df.loc[top_runtime_idx, 'name']
    movie_with_max_runtime_score = df.loc[top_runtime_idx, 'score']
    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and runtime of **{movie_with_max_score_runtime} minutes**.")
    st.write(f"The movie with the highest runtime is: **{movie_with_max_runtime}** with a score of **{movie_with_max_runtime_score}** and runtime of **{max_runtime} minutes**.")
    _add_spacing()

    # Scatter plot with regression line: IMDb score vs budget
    st.write('### IMDb Score vs Budget')
    # The slider spans the full budget range found in the data.
    min_budget = int(df['budget'].min())
    max_budget = int(df['budget'].max())
    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
    # Keep only the movies whose budget lies inside the selected range.
    low, high = selected_budget
    df_filtered = df[(df['budget'] >= low) & (df['budget'] <= high)]
    _show_score_scatter(df_filtered, 'budget', 'Budget')

# Script entry point: render the page only when this file is executed
# directly, not when it is imported as a module.
if "__main__" == __name__:
    run()