Spaces:

notbeekay
/

Milestone2_Deployment

Sleeping

App Files Files Community

Milestone2_Deployment / eda.py

notbeekay

Upload 9 files

047a1eb verified over 1 year ago

raw

history blame contribute delete

10.8 kB

	import streamlit as st
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	import plotly.express as px
	from PIL import Image

	def _write_section_gap():
	    """Write three empty lines to visually separate page sections."""
	    for _ in range(3):
	        st.write('')


	def _write_group_extremes(mean_scores, option):
	    """Report the groups with the highest and lowest mean score.

	    Args:
	        mean_scores: Series of mean ``score`` values indexed by the values
	            of the grouping column (``df.groupby(option)['score'].mean()``).
	        option: Name of the grouping column; interpolated into the messages.
	    """
	    # A group whose mean is NaN cannot be ranked, so leave it out.
	    ranked = mean_scores.dropna()
	    if ranked.empty:
	        st.write(f"No {option} found with the highest mean score.")
	        st.write(f"No {option} found with the lowest mean score.")
	        return

	    # Read the label straight off the grouped series. Searching the
	    # per-movie 'score' column for a group *mean* either finds no row at all
	    # or picks a group that merely has one movie with that exact score.
	    best_label = ranked.idxmax()
	    worst_label = ranked.idxmin()
	    st.write(f"The {option} with the highest mean IMDb score is: {best_label} with a score of {ranked.max()}.")
	    st.write(f"The {option} with the lowest mean IMDb score is: {worst_label} with a score of {ranked.min()}.")


	def _plot_score_scatter(data, column, label):
	    """Render a scatterplot of IMDb score against ``column`` with a red OLS trendline.

	    Args:
	        data: DataFrame providing the ``name``, ``score`` and ``column`` columns.
	        column: Name of the column plotted on the x axis.
	        label: Human-readable axis label, also used in the chart title.
	    """
	    fig = px.scatter(
	        data,
	        x=column,
	        y='score',
	        hover_data=['name', 'score', column],  # shown when hovering a point
	        labels={column: label, 'score': 'IMDb Score'},
	        title=f'IMDb Score vs {label}',
	        trendline='ols',  # add regression line
	        trendline_color_override='red',
	    )
	    st.plotly_chart(fig)


	def run():
	    """Render the exploratory data analysis page of the IMDb score app.

	    Reads ``imdb.jpeg`` and ``movies.csv`` from the current working directory
	    and writes, in order: the page header, the raw dataset, a bar plot of the
	    top N mean scores for a user-selected column, and scatterplots of score
	    against gross revenue, runtime and (range-filtered) budget.

	    The code reads the CSV columns ``name``, ``director``, ``writer``,
	    ``genre``, ``star``, ``country``, ``company``, ``score``, ``gross``,
	    ``runtime`` and ``budget``.
	    """
	    # Page header
	    st.title('IMDb Movie Score Prediction')
	    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')
	    image = Image.open('imdb.jpeg')
	    st.image(image, caption = 'This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')
	    st.write('This page is written by Brenda')
	    st.markdown('---')
	    st.write('')  # adds a spacing line

	    # Load and show the dataframe
	    df = pd.read_csv('movies.csv')
	    st.write('### This is our dataset of previous movies:')
	    st.dataframe(df)
	    _write_section_gap()

	    # Bar plot of the top N mean scores for a user-selected column
	    st.write('### Top N Movies With Highest Scores Based on User Input')
	    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name','director', 'writer', 'genre', 'star', 'country', 'company'))
	    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))
	    group_means = df.groupby(option)['score'].mean()
	    top_n_df = group_means.sort_values(ascending=False).head(top_n).reset_index()
	    top_n_df.columns = [option, 'mean_score']

	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
	    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
	    ax.set_xlabel(option.capitalize())
	    ax.set_ylabel('Mean Score')
	    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
	    fig.tight_layout()
	    st.pyplot(fig)
	    # Streamlit re-executes the script on every interaction; close the figure
	    # so pyplot does not keep one more open figure per rerun.
	    plt.close(fig)

	    # Additional information for the selected column
	    if option == 'name':
	        # For individual movies the raw score is reported, not a group mean.
	        max_score = df['score'].max()
	        min_score = df['score'].min()
	        movie_with_max_score = df[df['score'] == max_score]['name'].iloc[0]
	        movie_with_min_score = df[df['score'] == min_score]['name'].iloc[0]
	        st.write(f"The movie with the highest IMDb score is: {movie_with_max_score} with a score of {max_score}.")
	        st.write(f"The movie with the lowest IMDb score is: {movie_with_min_score} with a score of {min_score}.")
	    else:
	        _write_group_extremes(group_means, option)
	    _write_section_gap()

	    # The top-scored movie is reported in both the gross and runtime sections.
	    max_score = df['score'].max()
	    top_scored = df[df['score'] == max_score]
	    movie_with_max_score = top_scored['name'].iloc[0]

	    # IMDb score vs gross revenue
	    st.write('### IMDb Score vs Gross Revenue')
	    _plot_score_scatter(df, 'gross', 'Gross Revenue')
	    movie_with_max_score_gross = top_scored['gross'].iloc[0]
	    max_gross = df['gross'].max()
	    top_grossing = df[df['gross'] == max_gross]
	    movie_with_max_gross = top_grossing['name'].iloc[0]
	    movie_with_max_gross_score = top_grossing['score'].iloc[0]
	    st.write(f"The movie with the highest IMDb score is: {movie_with_max_score} with a score of {max_score} and gross revenue of ${movie_with_max_score_gross}.")
	    st.write(f"The movie with the highest gross is: {movie_with_max_gross} with a score of {movie_with_max_gross_score} and gross revenue of ${max_gross}.")
	    _write_section_gap()

	    # IMDb score vs runtime
	    st.write('### IMDb Score vs Movie Runtime')
	    _plot_score_scatter(df, 'runtime', 'Runtime')
	    movie_with_max_score_runtime = top_scored['runtime'].iloc[0]
	    max_runtime = df['runtime'].max()
	    longest = df[df['runtime'] == max_runtime]
	    movie_with_max_runtime = longest['name'].iloc[0]
	    movie_with_max_runtime_score = longest['score'].iloc[0]
	    st.write(f"The movie with the highest IMDb score is: {movie_with_max_score} with a score of {max_score} and runtime of {movie_with_max_score_runtime} minutes.")
	    st.write(f"The movie with the highest runtime is: {movie_with_max_runtime} with a score of {movie_with_max_runtime_score} and runtime of {max_runtime} minutes.")
	    _write_section_gap()

	    # IMDb score vs budget, restricted to a user-selected budget range
	    st.write('### IMDb Score vs Budget')
	    # The slider spans the full budget range found in the data.
	    min_budget = int(df['budget'].min())
	    max_budget = int(df['budget'].max())
	    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
	    df_filtered = df[(df['budget'] >= selected_budget[0]) & (df['budget'] <= selected_budget[1])]
	    _plot_score_scatter(df_filtered, 'budget', 'Budget')

	# Script entry point: render the page only when this file is executed
	# directly, not when it is imported by another module.
	if __name__ == "__main__":
	    run()