Spaces:
Sleeping
Sleeping
File size: 10,779 Bytes
047a1eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image
def _extreme_position(series, largest=True):
    """Return the integer position of the extreme non-missing value in *series*.

    Args:
        series: pandas Series of comparable (numeric) values.
        largest: pick the maximum when True, the minimum when False.

    Returns:
        The 0-based position of the first occurrence of the extreme value,
        or None when the series has no non-missing values.
    """
    # Guard first: argmax/argmin are not meaningful on an empty or all-NaN series.
    if series.dropna().empty:
        return None
    return int(series.argmax()) if largest else int(series.argmin())


def _extreme_messages(df, option, group_means):
    """Build the highest/lowest summary sentences for the selected column.

    Args:
        df: the movies DataFrame (needs the 'name' and 'score' columns).
        option: the column chosen by the user.
        group_means: mean 'score' per distinct value of *option*.

    Returns:
        A list with two markdown strings: the highest entry, then the lowest.
    """
    if option == 'name':
        # Individual movies are compared on their own score, not on a mean.
        values = df['score']
        labels = df['name'].tolist()
        subject, qualifier = 'movie', ''
    else:
        # Read the label straight from the grouped series.  Matching the mean
        # against individual movie scores finds the wrong row (or no row).
        values = group_means
        labels = group_means.index.tolist()
        subject, qualifier = option, 'mean '

    messages = []
    for largest, word in ((True, 'highest'), (False, 'lowest')):
        position = _extreme_position(values, largest=largest)
        if position is None:
            messages.append(f"No {subject} found with the {word} {qualifier}score.")
            continue
        messages.append(
            f"The {subject} with the {word} {qualifier}IMDb score is: "
            f"**{labels[position]}** with a score of **{values.iloc[position]}**."
        )
    return messages


def _score_scatter(data, column, label, title):
    """Return a plotly scatter of IMDb score against *column* with an OLS trendline."""
    return px.scatter(
        data,
        x=column,
        y='score',
        hover_data=['name', 'score', column],  # shown when hovering a data point
        labels={column: label, 'score': 'IMDb Score'},
        title=title,
        trendline='ols',  # add regression line
        trendline_color_override='red',
    )


def _add_spacing(count=3):
    """Write *count* empty lines to separate page sections."""
    for _ in range(count):
        st.write('')


def run():
    """Render the Exploratory Data Analysis page of the IMDb score app.

    Reads 'imdb.jpeg' and 'movies.csv' from the working directory, then draws:
    the raw dataset, a bar chart of the top-N mean scores for a user-selected
    column, and scatter plots of score against gross revenue, runtime and a
    user-filtered budget range.  Returns nothing; all output goes to Streamlit.
    """
    # Page header
    st.title('IMDb Movie Score Prediction')
    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')
    image = Image.open('imdb.jpeg')
    st.image(image, caption = 'This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')
    st.write('This page is written by Brenda')
    st.markdown('---')
    st.write('')  # Adds spacing line

    # Load and show dataframe
    df = pd.read_csv('movies.csv')
    st.write('### This is our dataset of previous movies:')
    st.dataframe(df)
    _add_spacing()

    # Bar plot of the top N mean scores for the column picked by the user
    st.write('### Top N Movies With Highest Scores Based on User Input')
    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name','director', 'writer', 'genre', 'star', 'country', 'company'))
    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))

    group_means = df.groupby(option)['score'].mean()
    mean_scores = group_means.sort_values(ascending=False)
    top_n_df = mean_scores.head(top_n).reset_index()
    top_n_df.columns = [option, 'mean_score']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
    ax.set_xlabel(option.capitalize())
    ax.set_ylabel('Mean Score')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    st.pyplot(fig)
    # Streamlit reruns this script on every interaction; close the figure so
    # pyplot does not keep accumulating open figures.
    plt.close(fig)

    # Additional information: highest / lowest entry for the selected column
    for message in _extreme_messages(df, option, group_means):
        st.write(message)
    _add_spacing()

    # Positions of the best-scored movie, reused by the two summaries below
    top_score_pos = _extreme_position(df['score'])

    # Scatterplot with regression line: IMDb Score vs Gross Revenue
    st.write('### IMDb Score vs Gross Revenue')
    st.plotly_chart(_score_scatter(df, 'gross', 'Gross Revenue', 'IMDb Score vs Gross Revenue'))
    if top_score_pos is not None:
        movie = df['name'].iloc[top_score_pos]
        score = df['score'].iloc[top_score_pos]
        gross = df['gross'].iloc[top_score_pos]
        st.write(f"The movie with the highest IMDb score is: **{movie}** with a score of **{score}** and gross revenue of **${gross}**.")
    top_gross_pos = _extreme_position(df['gross'])
    if top_gross_pos is not None:
        movie = df['name'].iloc[top_gross_pos]
        score = df['score'].iloc[top_gross_pos]
        gross = df['gross'].iloc[top_gross_pos]
        st.write(f"The movie with the highest gross is: **{movie}** with a score of **{score}** and gross revenue of **${gross}**.")
    _add_spacing()

    # Scatterplot with regression line: IMDb Score vs Runtime
    st.write('### IMDb Score vs Movie Runtime')
    st.plotly_chart(_score_scatter(df, 'runtime', 'Runtime', 'IMDb Score vs Runtime'))
    if top_score_pos is not None:
        movie = df['name'].iloc[top_score_pos]
        score = df['score'].iloc[top_score_pos]
        runtime = df['runtime'].iloc[top_score_pos]
        st.write(f"The movie with the highest IMDb score is: **{movie}** with a score of **{score}** and runtime of **{runtime} minutes**.")
    top_runtime_pos = _extreme_position(df['runtime'])
    if top_runtime_pos is not None:
        movie = df['name'].iloc[top_runtime_pos]
        score = df['score'].iloc[top_runtime_pos]
        runtime = df['runtime'].iloc[top_runtime_pos]
        st.write(f"The movie with the highest runtime is: **{movie}** with a score of **{score}** and runtime of **{runtime} minutes**.")
    _add_spacing()

    # Scatterplot with regression line: IMDb Score vs Budget, filtered by slider
    st.write('### IMDb Score vs Budget')
    # The slider bounds come from the smallest and largest budget in the data
    min_budget = int(df['budget'].min())
    max_budget = int(df['budget'].max())
    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
    low, high = selected_budget
    df_filtered = df[(df['budget'] >= low) & (df['budget'] <= high)]
    st.plotly_chart(_score_scatter(df_filtered, 'budget', 'Budget', 'IMDb Score vs Budget'))
# Render the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()
|