Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| from PIL import Image | |
def _add_spacing(lines=3):
    """Write `lines` empty Streamlit elements to visually separate sections."""
    for _ in range(lines):
        st.write('')


def _first_match(df, key_column, key_value, value_column):
    """Return `value_column` from the first row where `key_column` == `key_value`."""
    return df.loc[df[key_column] == key_value, value_column].iloc[0]


def _mean_score_extremes(group_means):
    """Find the groups with the highest and lowest mean score.

    Args:
        group_means: Series of mean scores indexed by group label
            (the result of ``df.groupby(col)['score'].mean()``).

    Returns:
        ``((best_label, best_mean), (worst_label, worst_mean))``, or ``None``
        when there is no non-NaN mean to rank. Ties resolve to the first
        label in the Series' order.
    """
    valid = group_means.dropna()
    if valid.empty:
        return None
    best_label = valid.idxmax()
    worst_label = valid.idxmin()
    return (best_label, valid.loc[best_label]), (worst_label, valid.loc[worst_label])


def _write_group_extremes(option, group_means):
    """Write the best/worst group (by mean score) for the selected column."""
    extremes = _mean_score_extremes(group_means)
    if extremes is None:
        st.write(f"No {option} found with the highest mean score.")
        st.write(f"No {option} found with the lowest mean score.")
        return
    (best_label, best_mean), (worst_label, worst_mean) = extremes
    st.write(f"The {option} with the highest mean IMDb score is: **{best_label}** with a score of **{best_mean}**.")
    st.write(f"The {option} with the lowest mean IMDb score is: **{worst_label}** with a score of **{worst_mean}**.")


def _plot_score_scatter(df, x_column, x_label):
    """Render a Plotly scatter of `x_column` vs score with a red OLS trendline."""
    fig = px.scatter(
        df,
        x=x_column,
        y='score',
        hover_data=['name', 'score', x_column],  # shown when hovering a point
        labels={x_column: x_label, 'score': 'IMDb Score'},
        title=f'IMDb Score vs {x_label}',
        trendline='ols',  # add regression line
        trendline_color_override='red',
    )
    st.plotly_chart(fig)


def run():
    """Render the exploratory data analysis (EDA) page of the Streamlit app.

    Reads ``imdb.jpeg`` and ``movies.csv`` from the working directory and
    renders, in order: the page header, the raw dataset, a user-driven
    "top N by mean score" bar chart with a textual summary, and scatter plots
    of score against gross revenue, runtime and (slider-filtered) budget.

    The dataset columns read here are ``name``, ``director``, ``writer``,
    ``genre``, ``star``, ``country``, ``company``, ``score``, ``gross``,
    ``runtime`` and ``budget``.
    """
    # --- Page header -------------------------------------------------------
    st.title('IMDb Movie Score Prediction')
    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')
    image = Image.open('imdb.jpeg')
    st.image(image, caption='This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')
    st.write('This page is written by Brenda')
    st.markdown('---')  # horizontal rule
    st.write('')  # Adds spacing line

    # --- Dataset -----------------------------------------------------------
    df = pd.read_csv('movies.csv')
    st.write('### This is our dataset of previous movies:')
    st.dataframe(df)
    _add_spacing()

    # --- Top N groups by mean score ----------------------------------------
    st.write('### Top N Movies With Highest Scores Based on User Input')
    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name', 'director', 'writer', 'genre', 'star', 'country', 'company'))
    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))

    # Mean score per value of the selected column; the unsorted Series is kept
    # so the summary below can look up the extreme groups directly.
    group_means = df.groupby(option)['score'].mean()
    top_n_df = group_means.sort_values(ascending=False).head(top_n).reset_index()
    top_n_df.columns = [option, 'mean_score']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
    ax.set_xlabel(option.capitalize())
    ax.set_ylabel('Mean Score')
    # Restyle the existing tick labels in place instead of re-setting them.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    st.pyplot(fig)
    # Streamlit reruns this script on every interaction; close the figure so
    # pyplot does not keep accumulating open figures.
    plt.close(fig)

    # Textual summary for the selected column.
    if option == 'name':
        # For individual movies the raw score is reported, not a group mean.
        max_score = df['score'].max()
        min_score = df['score'].min()
        movie_with_max_score = _first_match(df, 'score', max_score, 'name')
        movie_with_min_score = _first_match(df, 'score', min_score, 'name')
        st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}**.")
        st.write(f"The movie with the lowest IMDb score is: **{movie_with_min_score}** with a score of **{min_score}**.")
    else:
        # The group label must come from the grouped means themselves:
        # matching a mean against individual movie scores can select the
        # wrong group or no row at all.
        _write_group_extremes(option, group_means)
    _add_spacing()

    # --- Score vs gross revenue --------------------------------------------
    st.write('### IMDb Score vs Gross Revenue')
    _plot_score_scatter(df, 'gross', 'Gross Revenue')

    # The top-scored movie is reused by the runtime section below.
    max_score = df['score'].max()
    movie_with_max_score = _first_match(df, 'score', max_score, 'name')
    movie_with_max_score_gross = _first_match(df, 'score', max_score, 'gross')
    max_gross = df['gross'].max()
    movie_with_max_gross = _first_match(df, 'gross', max_gross, 'name')
    movie_with_max_gross_score = _first_match(df, 'gross', max_gross, 'score')
    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and gross revenue of **${movie_with_max_score_gross}**.")
    st.write(f"The movie with the highest gross is: **{movie_with_max_gross}** with a score of **{movie_with_max_gross_score}** and gross revenue of **${max_gross}**.")
    _add_spacing()

    # --- Score vs runtime --------------------------------------------------
    st.write('### IMDb Score vs Movie Runtime')
    _plot_score_scatter(df, 'runtime', 'Runtime')

    movie_with_max_score_runtime = _first_match(df, 'score', max_score, 'runtime')
    max_runtime = df['runtime'].max()
    movie_with_max_runtime = _first_match(df, 'runtime', max_runtime, 'name')
    movie_with_max_runtime_score = _first_match(df, 'runtime', max_runtime, 'score')
    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and runtime of **{movie_with_max_score_runtime} minutes**.")
    st.write(f"The movie with the highest runtime is: **{movie_with_max_runtime}** with a score of **{movie_with_max_runtime_score}** and runtime of **{max_runtime} minutes**.")
    _add_spacing()

    # --- Score vs budget (user-filtered) -----------------------------------
    st.write('### IMDb Score vs Budget')
    # The slider spans the full budget range and starts with everything selected.
    min_budget = int(df['budget'].min())
    max_budget = int(df['budget'].max())
    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
    lower, upper = selected_budget
    df_filtered = df[(df['budget'] >= lower) & (df['budget'] <= upper)]
    _plot_score_scatter(df_filtered, 'budget', 'Budget')
# Script entry point: render the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    run()