Spaces:

notbeekay
/

Milestone2_Deployment

Sleeping

App Files Files Community

notbeekay committed on Aug 28, 2024

Commit

047a1eb

verified ·

1 Parent(s): 408de47

Upload 9 files

Browse files

This Space serves a Support Vector Regression (SVR) model whose hyperparameters were tuned with random search. It was selected as the optimal model because it had the lowest errors and the highest predictive power of the models compared. It helps users predict the IMDb scores of movies they are planning to watch, so that they can decide whether or not to watch them.

Files changed (9) hide show

P1M2_brenda_kwan.ipynb +0 -0
P1M2_brenda_kwan_inf.ipynb +213 -0
app.py +10 -0
eda.py +205 -0
imdb.jpeg +0 -0
model_svr.pkl +3 -0
movies.csv +0 -0
prediction.py +70 -0
requirements.txt +9 -0

P1M2_brenda_kwan.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

P1M2_brenda_kwan_inf.ipynb ADDED Viewed

	@@ -0,0 +1,213 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Support Vector Regression Model Inference\n",
+    "--- "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data manipulation\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Load model\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Open Model\n",
+    "with open('model_svr.pkl', 'rb') as file_1:\n",
+    "    model_svr = pickle.load(file_1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>rating</th>\n",
+       "      <th>genre</th>\n",
+       "      <th>year</th>\n",
+       "      <th>released</th>\n",
+       "      <th>votes</th>\n",
+       "      <th>director</th>\n",
+       "      <th>writer</th>\n",
+       "      <th>star</th>\n",
+       "      <th>country</th>\n",
+       "      <th>budget</th>\n",
+       "      <th>gross</th>\n",
+       "      <th>company</th>\n",
+       "      <th>runtime</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Oppenheimer</td>\n",
+       "      <td>R</td>\n",
+       "      <td>History</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>July 19, 2023 (United States)</td>\n",
+       "      <td>787446</td>\n",
+       "      <td>Christopher Nolan</td>\n",
+       "      <td>Christopher Nolan</td>\n",
+       "      <td>Cillian Murphy</td>\n",
+       "      <td>United States</td>\n",
+       "      <td>100000000</td>\n",
+       "      <td>958000000</td>\n",
+       "      <td>Universal Pictures</td>\n",
+       "      <td>189</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          name rating    genre  year                       released   votes  \\\n",
+       "0  Oppenheimer      R  History  2023  July 19, 2023 (United States)  787446   \n",
+       "\n",
+       "            director             writer            star        country  \\\n",
+       "0  Christopher Nolan  Christopher Nolan  Cillian Murphy  United States   \n",
+       "\n",
+       "      budget      gross             company  runtime  \n",
+       "0  100000000  958000000  Universal Pictures      189  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Create dataframe\n",
+    "data_inf = pd.DataFrame([{\n",
+    "    'name': 'Oppenheimer', \n",
+    "    'rating': 'R', \n",
+    "    'genre': 'History', \n",
+    "    'year': 2023, \n",
+    "    'released': 'July 19, 2023 (United States)',\n",
+    "    'votes': '787446',\n",
+    "    'director': 'Christopher Nolan',\n",
+    "    'writer': 'Christopher Nolan',\n",
+    "    'star':'Cillian Murphy',\n",
+    "    'country': 'United States',\n",
+    "    'budget':100000000,\n",
+    "    'gross':958000000,\n",
+    "    'company': 'Universal Pictures',\n",
+    "    'runtime':189\n",
+    "}])\n",
+    "data_inf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make IMDb score prediction using the loaded pipeline\n",
+    "prediction = model_svr.predict(data_inf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[8.07123459]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The Support Vector Regression model predicts an IMDb score of 8.07/10 for Oppenheimer. This is close to the movie's actual IMDb score of 8.3/10: the absolute error on this single unseen example is 0.23, which is smaller than the calculated MAE of the SVR model (0.541). One example cannot establish generalisation on its own, but the result is consistent with the model having generalised well to unseen data."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "phase1",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

app.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import streamlit as st
+import eda
+import prediction
+page = st.sidebar.selectbox('Choose page: ', ('EDA', 'Prediction'))
+if page == 'EDA':
+    eda.run()
+else:
+    prediction.run()

eda.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import streamlit as st
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import plotly.express as px
+from PIL import Image
+def run():
+    # Create title
+    st.title('IMDb Movie Score Prediction')
+    # Create subheader
+    st.subheader('Exploratory Data Analysis (EDA) to Analyse IMDb Scores of Previous Movies')
+    # Insert image
+    image = Image.open('imdb.jpeg')
+    st.image(image, caption = 'This web application analyses IMDb scores of past movies and predicts IMDb scores for future/upcoming movies')
+    # Create text
+    st.write('This page is written by Brenda')
+    # Make a straight line
+    st.markdown('---')
+    st.write('')  # Adds spacing line
+# Load and show dataframe
+    df = pd.read_csv('movies.csv')
+    st.write('### This is our dataset of previous movies:')
+    st.dataframe(df)
+    st.write('')
+    st.write('')
+    st.write('')
+# Make a barplot based on user input to view data
+    st.write('### Top N Movies With Highest Scores Based on User Input')
+    option = st.selectbox('Choose a Column to view the Top N highest-rated mean score', ('name','director', 'writer', 'genre', 'star', 'country', 'company'))
+    # Select top N
+    top_n = st.selectbox('Select Top N', (10, 20, 30, 40))
+    # Calculate mean score based on selected column
+    mean_scores = df.groupby(option)['score'].mean().sort_values(ascending=False)
+    top_n_df = mean_scores.head(top_n).reset_index()
+    top_n_df.columns = [option, 'mean_score']
+    # Plot a barplot of top N mean movie scores based on option
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.barplot(x=option, y='mean_score', data=top_n_df, palette='Blues_d', ax=ax)
+    ax.set_title(f'Top {top_n} {option.capitalize()} with Highest Mean Movie Scores')
+    ax.set_xlabel(option.capitalize())
+    ax.set_ylabel('Mean Score')
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
+    plt.tight_layout()
+    st.pyplot(fig)
+    # Additional information: name, director, writer, genre, star, country, company vs IMDb score
+    if option == 'name':
+        max_score = df['score'].max()
+        movie_with_max_score = df[df['score'] == max_score]['name'].iloc[0]
+        min_score = df['score'].min()
+        movie_with_min_score= df[df['score'] == min_score]['name'].iloc[0]
+        st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}**.")
+        st.write(f"The movie with the lowest IMDb score is: **{movie_with_min_score}** with a score of **{min_score}**.")
+    elif option == 'director':
+        mean_scores_by_director = df.groupby('director')['score'].mean()
+        max_score = mean_scores_by_director.max()
+        director_with_max_score = df[df['score'] == max_score]['director'].iloc[0]
+        min_score = mean_scores_by_director.min()
+        director_with_min_score = df[df['score'] == min_score]['director'].iloc[0]
+        st.write(f"The director with the highest mean IMDb score is: **{director_with_max_score}** with a score of **{max_score}**.")
+        st.write(f"The director with the lowest mean IMDb score is: **{director_with_min_score}** with a score of **{min_score}**.")
+    elif option == 'writer':
+        mean_scores_by_writer = df.groupby('writer')['score'].mean()
+        max_score = mean_scores_by_writer.max()
+        writer_with_max_score = df[df['score'] == max_score]['writer'].iloc[0]
+        min_score = mean_scores_by_writer.min()
+        writer_with_min_score = df[df['score'] == min_score]['writer'].iloc[0]
+        st.write(f"The movie with the highest mean IMDb score is: **{writer_with_max_score}** with a score of **{max_score}**.")
+        st.write(f"The movie with the lowest mean IMDb score is: **{writer_with_min_score}** with a score of **{min_score}**.")
+    elif option == 'genre':
+        mean_scores_by_genre = df.groupby('genre')['score'].mean()
+        max_score = mean_scores_by_genre.max()
+        genre_with_max_score_df = mean_scores_by_genre[mean_scores_by_genre == max_score]
+        if not genre_with_max_score_df.empty:
+            genre_with_max_score = genre_with_max_score_df.index[0]
+            st.write(f"The genre with the highest mean IMDb score is: **{genre_with_max_score}** with a score of **{max_score}**.")
+        else:
+            st.write("No genre found with the highest mean score.")
+        min_score = mean_scores_by_genre.min()
+        genre_with_min_score_df = mean_scores_by_genre[mean_scores_by_genre == min_score]
+        if not genre_with_min_score_df.empty:
+            genre_with_min_score = genre_with_min_score_df.index[0]
+            st.write(f"The genre with the lowest mean IMDb score is: **{genre_with_min_score}** with a score of **{min_score}**.")
+        else:
+            st.write("No genre found with the lowest mean score.")
+            st.write(f"The genre with the highest mean IMDb score is: **{genre_with_max_score}** with a score of **{max_score}**.")
+            st.write(f"The genre with the lowest mean IMDb score is: **{genre_with_min_score}** with a score of **{min_score}**.")
+    elif option == 'star':
+        mean_scores_by_star = df.groupby('star')['score'].mean()
+        max_score = mean_scores_by_star.max()
+        star_with_max_score = df[df['score'] == max_score]['star'].iloc[0]
+        min_score = mean_scores_by_star.min()
+        star_with_min_score = df[df['score'] == min_score]['star'].iloc[0]
+        st.write(f"The star with the highest mean IMDb score is: **{star_with_max_score}** with a score of **{max_score}**.")
+        st.write(f"The star with the lowest mean IMDb score is: **{star_with_min_score}** with a score of **{min_score}**.")
+    elif option == 'country':
+        mean_scores_by_country = df.groupby('country')['score'].mean()
+        max_score = mean_scores_by_country.max()
+        country_with_max_score = df[df['score'] == max_score]['country'].iloc[0]
+        min_score = mean_scores_by_country.min()
+        country_with_min_score = df[df['score'] == min_score]['country'].iloc[0]
+        st.write(f"The country with the highest mean IMDb score is: **{country_with_max_score}** with a score of **{max_score}**.")
+        st.write(f"The country with the lowest mean IMDb score is: **{country_with_min_score}** with a score of **{min_score}**.")
+    elif option == 'company':
+        mean_scores_by_company = df.groupby('company')['score'].mean()
+        max_score = mean_scores_by_company.max()
+        company_with_max_score = df[df['score'] == max_score]['company'].iloc[0]
+        min_score = mean_scores_by_company.min()
+        company_with_min_score = df[df['score'] == min_score]['company'].iloc[0]
+        st.write(f"The company with the highest mean IMDb score is: **{company_with_max_score}** with a score of **{max_score}**.")
+        st.write(f"The company with the lowest mean IMDb score is: **{company_with_min_score}** with a score of **{min_score}**.")
+    st.write('')
+    st.write('')
+    st.write('')
+# Make a scatterplot with regression line to display IMDb Score vs Gross Revenue
+    st.write('### IMDb Score vs Gross Revenue')
+    # Plot scatterplot with regression line (score vs gross)
+    fig = px.scatter(
+        df,
+        x='gross',
+        y='score',
+        hover_data=['name', 'score', 'gross'],  # hover over data point
+        labels={'gross': 'Gross Revenue', 'score': 'IMDb Score'},
+        title='IMDb Score vs Gross Revenue',
+        trendline='ols',  # add regression line
+        trendline_color_override='red'
+    )
+    st.plotly_chart(fig)
+    # Additional information: gross revenue vs IMDb score
+    max_score = df['score'].max()
+    movie_with_max_score = df[df['score'] == max_score]['name'].iloc[0]
+    movie_with_max_score_gross = df[df['score'] == max_score]['gross'].iloc[0]
+    max_gross = df['gross'].max()
+    movie_with_max_gross = df[df['gross'] == max_gross]['name'].iloc[0]
+    movie_with_max_gross_score = df[df['gross'] == max_gross]['score'].iloc[0]
+    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and gross revenue of **${movie_with_max_score_gross}**.")
+    st.write(f"The movie with the highest gross is: **{movie_with_max_gross}** with a score of **{movie_with_max_gross_score}** and gross revenue of **${max_gross}**.")
+    st.write('')
+    st.write('')
+    st.write('')
+# Make a scatterplot with regression line to display IMDb Score vs Runtime
+    st.write('### IMDb Score vs Movie Runtime')
+    # Plot scatterplot with regression line (score vs runtime)
+    fig = px.scatter(
+        df,
+        x='runtime',
+        y='score',
+        hover_data=['name', 'score', 'runtime'],  # hover over data point
+        labels={'runtime': 'Runtime', 'score': 'IMDb Score'},
+        title='IMDb Score vs Runtime',
+        trendline='ols', # add regression line
+        trendline_color_override='red'
+    )
+    st.plotly_chart(fig)
+    # Additional information: runtime vs IMDb score
+    max_score = df['score'].max()
+    movie_with_max_score = df[df['score'] == max_score]['name'].iloc[0]
+    movie_with_max_score_runtime = df[df['score'] == max_score]['runtime'].iloc[0]
+    max_runtime = df['runtime'].max()
+    movie_with_max_runtime= df[df['runtime'] == max_runtime]['name'].iloc[0]
+    movie_with_max_runtime_score = df[df['runtime'] == max_runtime]['score'].iloc[0]
+    st.write(f"The movie with the highest IMDb score is: **{movie_with_max_score}** with a score of **{max_score}** and runtime of **{movie_with_max_score_runtime} minutes**.")
+    st.write(f"The movie with the highest runtime is: **{movie_with_max_runtime}** with a score of **{movie_with_max_runtime_score}** and runtime of **{max_runtime} minutes**.")
+    st.write('')
+    st.write('')
+    st.write('')
+# Scatterplot of Budget vs IMDb score with Regression Line
+    st.write('### IMDb Score vs Budget')
+    # Minimum and maximum budget calculated to determine the range of the slider for the budget
+    min_budget = int(df['budget'].min())
+    max_budget = int(df['budget'].max())
+    selected_budget = st.slider('Select Budget Range', min_budget, max_budget, (min_budget, max_budget))
+    # Filter dataframe based on budget range selected by the user
+    df_filtered = df[(df['budget'] >= selected_budget[0]) & (df['budget'] <= selected_budget[1])]
+    # Plot a scatterplot with regression line of budget vs score
+    fig = px.scatter(
+        df_filtered,
+        x='budget',
+        y='score',
+        hover_data=['name', 'score', 'budget'],  # hover over data point
+        labels={'budget': 'Budget', 'score': 'IMDb Score'},
+        title='IMDb Score vs Budget',
+        trendline='ols', # add regression line
+        trendline_color_override='red'
+    )
+    st.plotly_chart(fig)
+if __name__ == '__main__':
+    run()

imdb.jpeg ADDED Viewed

model_svr.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ed32c463ffb26e78a853995187069f3825027ce2bb4b15a54b8c48c42f31c66
+size 322357

movies.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

prediction.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import streamlit as st
+import pandas as pd
+import pickle
+# Load the pre-trained model once, when this module is first imported.
+# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
+# model_svr.pkl must only ever come from a trusted source.
+# NOTE(review): presumably the pickle needs the scikit-learn version it was
+# trained with to unpickle reliably (requirements.txt pins 1.5.1) - confirm.
+with open('model_svr.pkl', 'rb') as file_1:
+    model_svr = pickle.load(file_1)
+def run():
+    # Create title
+    st.title('IMDb Movie Score Prediction')
+    # Create subheader
+    st.subheader('Calculate IMDb Score of Movies')
+    # Create a form for input
+    with st.form('form_movie_prediction'):
+        # Text inputs
+        name = st.text_input('Movie Name: ', value = '')
+        director = st.text_input('Director: ', value = '')
+        writer = st.text_input('Writer: ', value = '')
+        star = st.text_input('Star: ', value = '')
+        country = st.text_input('Country: ', value = '')
+        company = st.text_input('Production Company: ', value ='')
+        released = st.text_input('Date Released: ', value = '')
+        # Number inputs
+        year = st.number_input('Release Year: ', value=2022, min_value=1900, max_value=2100)
+        budget = st.number_input('Budget ($): ', value=500000000, min_value=0)
+        gross = st.number_input('Gross Revenue ($): ', value=958000000, min_value=0)
+        runtime = st.number_input('Runtime (minutes): ', value=189, min_value=1)
+        votes = st.number_input('Votes: ', value=500000, min_value=0)
+        # Categorical inputs
+        rating = st.selectbox('Rating: ', ('G', 'PG', 'PG-13', 'R', 'NC-17'), index=3)
+        genre = st.selectbox('Genre: ', ('Action', 'Adventure', 'Comedy', 'Drama', 'History', 'Sci-Fi', 'Thriller'), index=4)
+        # Submit button
+        submitted = st.form_submit_button('Predict IMDb Score')
+    # Prepare the data for prediction
+    data_inf = {
+        'name': name,
+        'rating': rating,
+        'genre': genre,
+        'year': year,
+        'released': released,
+        'votes': votes,
+        'director': director,
+        'writer': writer,
+        'star': star,
+        'country': country,
+        'budget': budget,
+        'gross': gross,
+        'company': company,
+        'runtime': runtime
+    }
+    data_inf = pd.DataFrame([data_inf])
+    st.dataframe(data_inf)
+    if submitted:
+        # Predict IMDb score for Oppenheimer using the SVR model
+        prediction = model_svr.predict(data_inf)
+        st.write('## Predicted IMDb Score: ', str(round(prediction[0], 2)))
+if __name__ == '__main__':
+    run()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+streamlit
+pandas
+seaborn
+matplotlib
+numpy
+scikit-learn == 1.5.1
+Pillow
+plotly
+statsmodels