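# NOTE(review): the next three lines appear to be page/upload residue rather
# than code; they are kept verbatim as comments so the module can be parsed.
# notbeekay's picture
# Upload 9 files
# 047a1eb verified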
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
def _add_spacing(lines=3):
    """Write ``lines`` empty text elements to visually separate page sections."""
    for _ in range(lines):
        st.write('')


def _score_scatter(data, column, label):
    """Plot ``column`` against ``score`` with a red OLS trendline and render it.

    Args:
        data: DataFrame holding at least ``name``, ``score`` and ``column``.
        column: Name of the column placed on the x-axis.
        label: Human-readable axis label for ``column``; also used in the title.
    """
    fig = px.scatter(
        data,
        x=column,
        y='score',
        hover_data=['name', 'score', column],  # shown when hovering a point
        labels={column: label, 'score': 'IMDb Score'},
        title=f'IMDb Score vs {label}',
        trendline='ols',  # add regression line
        trendline_color_override='red',
    )
    st.plotly_chart(fig)


def _write_group_extremes(mean_by_group, option):
    """Report which group has the highest and lowest mean score.

    The group label is taken from the grouped series itself (``idxmax`` /
    ``idxmin``). Matching a group *mean* against individual movie scores would
    name the wrong group, or fail when no single movie has exactly that score.

    Args:
        mean_by_group: Series of mean scores indexed by group label.
        option: Name of the grouping column, used in the message text.
    """
    valid = mean_by_group.dropna()  # groups without any score cannot be ranked
    if valid.empty:
        st.write(f"No {option} found with the highest mean score.")
        st.write(f"No {option} found with the lowest mean score.")
        return

    st.write(f"The {option} with the highest mean IMDb score is: **{valid.idxmax()}** with a score of **{valid.max()}**.")
    st.write(f"The {option} with the lowest mean IMDb score is: **{valid.idxmin()}** with a score of **{valid.min()}**.")


def run():
    """Render the exploratory-data-analysis page of the IMDb score app.

    Reads ``imdb.jpeg`` and ``movies.csv`` via relative paths and writes the
    page to Streamlit: the raw dataset, a bar chart of the top-N mean scores
    for a user-selected column, and scatter plots of score against gross
    revenue, runtime and (slider-filtered) budget. The CSV must provide the
    columns ``name``, ``director``, ``writer``, ``genre``, ``star``,
    ``country``, ``company``, ``score``, ``gross``, ``runtime`` and ``budget``.

    Returns:
        None. All output goes to the Streamlit page.
    """
    st.title('IMDb Movie Score Prediction')
    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')

    # The context manager releases the image's file handle after rendering.
    with Image.open('imdb.jpeg') as image:
        st.image(image, caption = 'This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')

    st.write('This page is written by Brenda')
    st.markdown('---')
    st.write('')  # single spacing line after the rule

    # Load and show dataframe
    df = pd.read_csv('movies.csv')
    st.write('### This is our dataset of previous movies:')
    st.dataframe(df)
    _add_spacing()

    # Bar plot of the top-N mean scores for a user-selected column
    st.write('### Top N Movies With Highest Scores Based on User Input')
    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name','director', 'writer', 'genre', 'star', 'country', 'company'))
    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))

    # Computed once and reused for both the chart and the summary text below.
    mean_by_option = df.groupby(option)['score'].mean()
    top_n_df = mean_by_option.sort_values(ascending=False).head(top_n).reset_index()
    top_n_df.columns = [option, 'mean_score']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
    ax.set_xlabel(option.capitalize())
    ax.set_ylabel('Mean Score')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()  # act on this figure, not pyplot's global "current" one
    st.pyplot(fig)
    plt.close(fig)  # otherwise every rerun leaves another figure registered in pyplot

    # Row of the single best-scored movie; reused by several summaries below.
    top_idx = df['score'].idxmax()
    max_score = df.at[top_idx, 'score']
    movie_with_max_score = df.at[top_idx, 'name']

    # Additional information: selected column vs IMDb score
    if option == 'name':
        # For individual movies the raw scores are reported, not group means.
        low_idx = df['score'].idxmin()
        movie_with_min_score = df.at[low_idx, 'name']
        min_score = df.at[low_idx, 'score']
        st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}**.")
        st.write(f"The movie with the lowest IMDb score is: **{movie_with_min_score}** with a score of **{min_score}**.")
    else:
        _write_group_extremes(mean_by_option, option)
    _add_spacing()

    # IMDb score vs gross revenue
    st.write('### IMDb Score vs Gross Revenue')
    _score_scatter(df, 'gross', 'Gross Revenue')
    movie_with_max_score_gross = df.at[top_idx, 'gross']
    gross_idx = df['gross'].idxmax()
    movie_with_max_gross = df.at[gross_idx, 'name']
    movie_with_max_gross_score = df.at[gross_idx, 'score']
    max_gross = df.at[gross_idx, 'gross']
    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and gross revenue of **${movie_with_max_score_gross}**.")
    st.write(f"The movie with the highest gross is: **{movie_with_max_gross}** with a score of **{movie_with_max_gross_score}** and gross revenue of **${max_gross}**.")
    _add_spacing()

    # IMDb score vs runtime
    st.write('### IMDb Score vs Movie Runtime')
    _score_scatter(df, 'runtime', 'Runtime')
    movie_with_max_score_runtime = df.at[top_idx, 'runtime']
    runtime_idx = df['runtime'].idxmax()
    movie_with_max_runtime = df.at[runtime_idx, 'name']
    movie_with_max_runtime_score = df.at[runtime_idx, 'score']
    max_runtime = df.at[runtime_idx, 'runtime']
    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and runtime of **{movie_with_max_score_runtime} minutes**.")
    st.write(f"The movie with the highest runtime is: **{movie_with_max_runtime}** with a score of **{movie_with_max_runtime_score}** and runtime of **{max_runtime} minutes**.")
    _add_spacing()

    # IMDb score vs budget, restricted to a user-selected budget range
    st.write('### IMDb Score vs Budget')
    # The slider spans the smallest to the largest budget present in the data.
    min_budget = int(df['budget'].min())
    max_budget = int(df['budget'].max())
    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
    low, high = selected_budget
    df_filtered = df[(df['budget'] >= low) & (df['budget'] <= high)]
    _score_scatter(df_filtered, 'budget', 'Budget')
# Render the page only when this file is executed as the main module,
# not when it is imported by another module.
if __name__ == "__main__":
    run()