import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image


def _add_spacing(lines=3):
    """Emit `lines` empty `st.write('')` calls to separate page sections."""
    for _ in range(lines):
        st.write('')


def _row_with_extreme(df, column, *, highest=True):
    """Return the first row of `df` holding the max (or min) of `column`.

    Returns None when the column has no non-NaN values, so callers can skip
    their summary text instead of failing on an empty `.iloc[0]`.
    """
    values = df[column].dropna()
    if values.empty:
        return None
    target = values.max() if highest else values.min()
    return df[df[column] == target].iloc[0]


def _group_extremes(group_means):
    """Return (best_label, best_mean, worst_label, worst_mean) for a Series of
    per-group mean scores indexed by group label, or None if every mean is NaN.

    The labels come from the grouped Series' own index. Comparing a group mean
    against individual movie scores (as the page used to do) picks an
    unrelated row, or none at all.
    """
    valid = group_means.dropna()
    if valid.empty:
        return None
    best_label = valid.idxmax()
    worst_label = valid.idxmin()
    return best_label, valid.loc[best_label], worst_label, valid.loc[worst_label]


def _score_scatter(data, column, label):
    """Build the 'IMDb Score vs <label>' scatterplot with a red OLS trendline."""
    # NOTE: per the plotly docs, trendline='ols' relies on statsmodels.
    return px.scatter(
        data,
        x=column,
        y='score',
        hover_data=['name', 'score', column],  # hover over data point
        labels={column: label, 'score': 'IMDb Score'},
        title=f'IMDb Score vs {label}',
        trendline='ols',  # add regression line
        trendline_color_override='red',
    )


def run():
    """Render the EDA page of the IMDb score app.

    Reads 'imdb.jpeg' and 'movies.csv' from the working directory. The code
    uses these CSV columns: name, director, writer, genre, star, country,
    company, score, gross, runtime and budget. The page shows the raw table,
    a Top-N bar chart of mean score for a user-chosen column, and score
    scatterplots against gross revenue, runtime and (slider-filtered) budget.
    """
    # Create title
    st.title('IMDb Movie Score Prediction')

    # Create subheader
    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')

    # Insert image; the context manager releases the file handle after rendering
    with Image.open('imdb.jpeg') as image:
        st.image(image, caption='This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')

    # Create text
    st.write('This page is written by Brenda')

    # Make a straight line
    st.markdown('---')
    _add_spacing(1)

    # Load and show dataframe
    df = pd.read_csv('movies.csv')
    st.write('### This is our dataset of previous movies:')
    st.dataframe(df)
    _add_spacing()

    # Make a barplot based on user input to view data
    st.write('### Top N Movies With Highest Scores Based on User Input')
    option = st.selectbox(
        'Choose a Column to view the Top N highest-rated mean score',
        ('name', 'director', 'writer', 'genre', 'star', 'country', 'company'),
    )

    # Select top N
    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))

    # Calculate mean score based on selected column. The unsorted means are
    # kept so the summary below can take its labels from the same grouping.
    group_means = df.groupby(option)['score'].mean()
    mean_scores = group_means.sort_values(ascending=False)
    top_n_df = mean_scores.head(top_n).reset_index()
    top_n_df.columns = [option, 'mean_score']

    # Plot a barplot of top N mean movie scores based on option
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
    ax.set_xlabel(option.capitalize())
    ax.set_ylabel('Mean Score')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    plt.tight_layout()
    st.pyplot(fig)
    # Streamlit reruns this script on every interaction; close the figure so
    # pyplot does not keep one more open figure per rerun.
    plt.close(fig)

    # The highest-scoring movie is quoted in several sections; look it up once.
    top_scored = _row_with_extreme(df, 'score')

    # Additional information: name, director, writer, genre, star, country, company vs IMDb score
    if option == 'name':
        # For movies the summary quotes raw scores, not per-name means.
        low_scored = _row_with_extreme(df, 'score', highest=False)
        if top_scored is not None and low_scored is not None:
            st.write(f"The movie with the highest IMDb score is: **{top_scored['name']}** with a score of **{top_scored['score']}**.")
            st.write(f"The movie with the lowest IMDb score is: **{low_scored['name']}** with a score of **{low_scored['score']}**.")
    else:
        extremes = _group_extremes(group_means)
        if extremes is None:
            st.write(f"No {option} found with a valid mean score.")
        else:
            best_label, best_mean, worst_label, worst_mean = extremes
            st.write(f"The {option} with the highest mean IMDb score is: **{best_label}** with a score of **{best_mean}**.")
            st.write(f"The {option} with the lowest mean IMDb score is: **{worst_label}** with a score of **{worst_mean}**.")
    _add_spacing()

    # Make a scatterplot with regression line to display IMDb Score vs Gross Revenue
    st.write('### IMDb Score vs Gross Revenue')
    st.plotly_chart(_score_scatter(df, 'gross', 'Gross Revenue'))

    # Additional information: gross revenue vs IMDb score
    top_gross = _row_with_extreme(df, 'gross')
    if top_scored is not None:
        st.write(f"The movie with the highest IMDb score is: **{top_scored['name']}** with a score of **{top_scored['score']}** and gross revenue of **${top_scored['gross']}**.")
    if top_gross is not None:
        st.write(f"The movie with the highest gross is: **{top_gross['name']}** with a score of **{top_gross['score']}** and gross revenue of **${top_gross['gross']}**.")
    _add_spacing()

    # Make a scatterplot with regression line to display IMDb Score vs Runtime
    st.write('### IMDb Score vs Movie Runtime')
    st.plotly_chart(_score_scatter(df, 'runtime', 'Runtime'))

    # Additional information: runtime vs IMDb score
    top_runtime = _row_with_extreme(df, 'runtime')
    if top_scored is not None:
        st.write(f"The movie with the highest IMDb score is: **{top_scored['name']}** with a score of **{top_scored['score']}** and runtime of **{top_scored['runtime']} minutes**.")
    if top_runtime is not None:
        st.write(f"The movie with the highest runtime is: **{top_runtime['name']}** with a score of **{top_runtime['score']}** and runtime of **{top_runtime['runtime']} minutes**.")
    _add_spacing()

    # Scatterplot of Budget vs IMDb score with Regression Line
    st.write('### IMDb Score vs Budget')

    # Minimum and maximum budget calculated to determine the range of the slider for the budget
    min_budget = int(df['budget'].min())
    max_budget = int(df['budget'].max())
    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))

    # Filter dataframe based on budget range selected by the user
    low, high = selected_budget
    df_filtered = df[(df['budget'] >= low) & (df['budget'] <= high)]

    # Plot a scatterplot with regression line of budget vs score
    st.plotly_chart(_score_scatter(df_filtered, 'budget', 'Budget'))


if __name__ == '__main__':
    run()