# File size: 10,779 Bytes
# 047a1eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from PIL import Image

def _add_spacing(lines=3):
    """Write ``lines`` empty strings to put vertical space between sections."""
    for _ in range(lines):
        st.write('')


def _label_at_max(series):
    """Return the index label of the first maximum of ``series``.

    Missing values are ignored; ``None`` is returned when nothing is left.
    """
    valid = series.dropna()
    return None if valid.empty else valid.idxmax()


def _write_movie_extremes(df):
    """Report the single movies with the highest and lowest raw score."""
    scores = df['score'].dropna()
    if scores.empty:
        return
    # NOTE(review): df.loc[label, col] yields a scalar only when the index is
    # unique, which holds for the default index produced by pd.read_csv.
    top, bottom = scores.idxmax(), scores.idxmin()
    top_name, top_score = df.loc[top, 'name'], scores.loc[top]
    bottom_name, bottom_score = df.loc[bottom, 'name'], scores.loc[bottom]
    st.write(f"The movie with the highest IMDb score is: **{top_name}** with a score of **{top_score}**.")
    st.write(f"The movie with the lowest IMDb score is: **{bottom_name}** with a score of **{bottom_score}**.")


def _write_group_extremes(group_means, option):
    """Report which group has the highest and the lowest mean score.

    ``group_means`` is the per-group mean score indexed by the values of the
    ``option`` column. The group is taken from that index directly; matching
    the mean against individual movie scores can find no row at all or name
    the wrong group.
    """
    valid = group_means.dropna()
    if valid.empty:
        st.write(f"No {option} found with the highest mean score.")
        st.write(f"No {option} found with the lowest mean score.")
        return
    best, worst = valid.idxmax(), valid.idxmin()
    best_score, worst_score = valid.max(), valid.min()
    st.write(f"The {option} with the highest mean IMDb score is: **{best}** with a score of **{best_score}**.")
    st.write(f"The {option} with the lowest mean IMDb score is: **{worst}** with a score of **{worst_score}**.")


def _render_top_n_section(df):
    """Draw the user-configurable "top N by mean score" bar chart and summary."""
    st.write('### Top N Movies With Highest Scores Based on User Input')
    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name','director', 'writer', 'genre', 'star', 'country', 'company'))
    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))

    # Mean score per value of the chosen column, best first.
    group_means = df.groupby(option)['score'].mean()
    mean_scores = group_means.sort_values(ascending=False)
    top_n_df = mean_scores.head(top_n).reset_index()
    top_n_df.columns = [option, 'mean_score']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
    ax.set_xlabel(option.capitalize())
    ax.set_ylabel('Mean Score')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    st.pyplot(fig)
    # Close the figure once rendered so pyplot does not keep it registered.
    plt.close(fig)

    if option == 'name':
        _write_movie_extremes(df)
    else:
        _write_group_extremes(group_means, option)


def _plot_score_scatter(data, column, label):
    """Plot ``score`` against ``column`` with a red OLS trendline."""
    fig = px.scatter(
        data,
        x=column,
        y='score',
        hover_data=['name', 'score', column],  # shown when hovering a point
        labels={column: label, 'score': 'IMDb Score'},
        title=f'IMDb Score vs {label}',
        trendline='ols',  # add regression line
        trendline_color_override='red',
    )
    st.plotly_chart(fig)


def _write_scatter_extremes(df, column, noun, value_format):
    """Report the top-scored movie and the movie with the largest ``column``.

    ``noun`` names the quantity in the sentence and ``value_format`` is a
    ``str.format`` template used to display its value.
    """
    leaders = (
        ('IMDb score', _label_at_max(df['score'])),
        (column, _label_at_max(df[column])),
    )
    for ranked_by, label in leaders:
        if label is None:
            # Column has no usable values; skip rather than raise.
            continue
        name = df.loc[label, 'name']
        score = df.loc[label, 'score']
        shown = value_format.format(df.loc[label, column])
        st.write(f"The movie with the highest {ranked_by} is: **{name}** with a score of **{score}** and {noun} of **{shown}**.")


def run():
    """Render the IMDb exploratory-data-analysis page in Streamlit.

    Reads ``imdb.jpeg`` and ``movies.csv`` from the current working directory
    and, top to bottom, shows: the page header and banner image, the raw
    dataset, a bar chart of the top N mean scores for a user-selected column,
    and score-vs-gross, score-vs-runtime and score-vs-budget scatter plots.
    The budget plot is filtered by a range slider.
    """
    st.title('IMDb Movie Score Prediction')
    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')

    # Use a context manager so the image file handle is released after display.
    with Image.open('imdb.jpeg') as image:
        st.image(image, caption = 'This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')

    st.write('This page is written by Brenda')

    # Horizontal rule followed by one spacing line
    st.markdown('---')
    st.write('')

    # Load and show dataframe
    df = pd.read_csv('movies.csv')
    st.write('### This is our dataset of previous movies:')
    st.dataframe(df)
    _add_spacing()

    # Bar plot driven by user input
    _render_top_n_section(df)
    _add_spacing()

    # IMDb Score vs Gross Revenue
    st.write('### IMDb Score vs Gross Revenue')
    _plot_score_scatter(df, 'gross', 'Gross Revenue')
    _write_scatter_extremes(df, 'gross', 'gross revenue', '${}')
    _add_spacing()

    # IMDb Score vs Runtime
    st.write('### IMDb Score vs Movie Runtime')
    _plot_score_scatter(df, 'runtime', 'Runtime')
    _write_scatter_extremes(df, 'runtime', 'runtime', '{} minutes')
    _add_spacing()

    # IMDb Score vs Budget, restricted to a user-selected budget range
    st.write('### IMDb Score vs Budget')
    # The slider spans the smallest to the largest budget in the data.
    min_budget = int(df['budget'].min())
    max_budget = int(df['budget'].max())
    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
    lower, upper = selected_budget
    # between() is inclusive on both ends, so the slider end points are kept.
    df_filtered = df[df['budget'].between(lower, upper)]
    _plot_score_scatter(df_filtered, 'budget', 'Budget')

# Render the page only when this file is run as the main module, not on import.
if __name__ == '__main__':
    run()