diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ac0b970ed1083255408aaa9928f6ab3790202b84 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+report/images/df_missing.png filter=lfs diff=lfs merge=lfs -text
+report/images/wordcloud_overview.png filter=lfs diff=lfs merge=lfs -text
+report/images/wordcloud_title.png filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 15ded036facde9bb830588697cc49ac65c123c8d..39aa398a63b86c5b78554d272b6afed45efb1207 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,75 @@
----
-title: Final ML Project
-emoji: 🏆
-colorFrom: purple
-colorTo: purple
-sdk: gradio
-sdk_version: 5.44.1
-app_file: app.py
-pinned: false
-license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# MovieLens Movie Data Analysis
+
+This project provides a reproducible pipeline for preprocessing and exploratory data analysis (EDA) on the MovieLens movie dataset.
+
+## Project Structure
+
+```
+.
+├── app/
+│ └── practical.py # Main entry point for running the pipeline
+├── src/
+│ ├── preprocessing.py # Data loading, cleaning, merging
+│ └── eda.py # EDA and visualization (plots saved to /report/images)
+├── notebooks/
+│ └── practical.ipynb # Step-by-step notebook for exploration and prototyping
+├── report/
+│ └── images/ # Output directory for all generated plots and images
+├── data/
+│ ├── raw/ # Raw input data (CSV files)
+│ ├── interim/ # Cleaned/intermediate CSVs
+│ └── processed/ # (Optional) Final processed data
+├── requirements.txt # Python dependencies
+└── README.md # This file
+```
+
+## How to Run
+
+1. **Install dependencies**
+ Make sure you have Python 3.8+ and run:
+ ```
+ pip install -r requirements.txt
+ ```
+
+2. **Prepare data**
+ Place the raw MovieLens CSV files in `data/raw/` as:
+ - `movies_metadata.csv`
+ - `credits.csv`
+ - `keywords.csv`
+ - `links.csv`
+ - `ratings.csv`
+
+3. **Run the pipeline**
+ ```
+ python app/practical.py
+ ```
+ This will:
+ - Clean and merge the data
+ - Save interim cleaned CSVs to `data/interim/`
+ - Generate all EDA plots and wordclouds, saving them to `report/images/`
+ - Save interactive Plotly plots as PNG (requires [kaleido](https://github.com/plotly/Kaleido)) or HTML fallback
+
+## Features
+
+- **Modular Preprocessing**: All data cleaning, merging, and type handling in `src/preprocessing.py`
+- **Automated EDA**: All plots and wordclouds generated and saved by `src/eda.py`
+- **Reproducibility**: One-command run for the entire workflow
+- **Notebook**: `notebooks/Practical.ipynb` for step-by-step exploration
+
+## Requirements
+
+- pandas
+- numpy
+- matplotlib
+- seaborn
+- missingno
+- wordcloud
+- plotly
+- pycountry
+- kaleido (for static plotly image export)
+
+## Notes
+
+- If static Plotly image export fails, HTML versions of the plots are saved as a fallback.
+- All output images are saved in `report/images/`.
+- Adjust paths in `src/eda.py` and `src/preprocessing.py` if your data is stored in a different location.
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c21615babd420d231d31ce31d06ba6b0de6c659
--- /dev/null
+++ b/app.py
@@ -0,0 +1,63 @@
+import gradio as gr
+import pickle
+import pandas as pd
+import os
+
+# Paths
+MODEL_DIR = "models"
+MOVIE_DATA_PATH = "data/movies.csv" # adjust to your actual metadata file
+
+# Load models (choose what you want to demo)
+with open(os.path.join(MODEL_DIR, "recommender_svd_mf.pkl"), "rb") as f:
+ svd_model = pickle.load(f)
+
+# Load movie metadata
+movies_df = pd.read_csv(MOVIE_DATA_PATH) # should include [movieId, title, poster_url, actors]
+
+def recommend(user_id, top_k=5):
+ """Generate top-k recommendations using SVD model."""
+ # Predict scores for all movies for this user
+ all_movie_ids = movies_df["movieId"].unique()
+ predictions = []
+ for mid in all_movie_ids:
+ try:
+ est = svd_model.predict(str(user_id), str(mid)).est
+ predictions.append((mid, est))
+ except Exception:
+ continue
+
+ # Sort and pick top_k
+ top_movies = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_k]
+
+ # Build output
+ results = []
+ for mid, score in top_movies:
+ row = movies_df[movies_df["movieId"] == mid].iloc[0]
+ explanation = f"Because you liked movies with {row.get('actors', 'similar style')}."
+ results.append((row["title"], row.get("poster_url", None), explanation))
+
+ return results
+
+def format_output(results):
+ titles = [r[0] for r in results]
+ posters = [r[1] for r in results if r[1] is not None]
+ explanations = [r[2] for r in results]
+ return titles, posters, explanations
+
+demo = gr.Interface(
+ fn=lambda user_id, k: format_output(recommend(user_id, k)),
+ inputs=[
+ gr.Number(label="User ID"),
+ gr.Slider(1, 10, value=5, step=1, label="Top-K")
+ ],
+ outputs=[
+ gr.Textbox(label="Recommended Movies"),
+ gr.Gallery(label="Posters", columns=3, height="auto"),
+ gr.Textbox(label="Explanations")
+ ],
+ title="Movie Recommender System",
+ description="Enter your User ID to get top-K movie recommendations with posters and explanations."
+)
+
+if __name__ == "__main__":
+ demo.launch()
diff --git a/app/practical.py b/app/practical.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d69a43f4e0c736f668bfdb9fd11b1eeb4bcb32
--- /dev/null
+++ b/app/practical.py
@@ -0,0 +1,115 @@
+import sys
+import os
+
+# Add the parent directory to sys.path so 'src' can be imported
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from src.preprocessing import Preprocessing
+from src.eda import EDA
+from src.feature_engineering import FeatureEngineering
+from src.modeling import RecommenderModels
+from src.evaluation import leave_one_out_by_timestamp, evaluate_all, summarize_results
+
+def main():
+ print("========== Step 1: Preprocessing ==========")
+ preprocessor = Preprocessing()
+ dfs = preprocessor.run_all()
+
+ # print("========== Step 2: Exploratory Data Analysis (EDA) ==========")
+ # eda = EDA(dfs)
+ # eda.run_all()
+
+ print("========== Step 3: Feature Engineering ==========")
+ fe = FeatureEngineering(dfs)
+ fe_outputs = fe.run_all()
+ merged_df = fe_outputs["merged_df"]
+ merged_df_with_tfidf = fe_outputs["merged_df_with_tfidf"]
+ unique_movies_reduced = fe_outputs["unique_movies_reduced"]
+ ratings_df = dfs["ratings_df"]
+
+ print("========== Step 4: Modeling & Recommendation ==========")
+ models = RecommenderModels(
+ merged_df_with_tfidf=merged_df_with_tfidf,
+ unique_movies_reduced=unique_movies_reduced,
+ ratings_df=ratings_df
+ )
+ models.fit_popularity()
+ models.fit_content_based()
+ models.fit_cf()
+ print("CF RMSEs (kNN, SVD):", models.evaluate_cf())
+ rmse_scores, best_alpha = models.tune_hybrid_alpha()
+ print("Best alpha:", best_alpha)
+ print("Hybrid RMSE:", models.evaluate_hybrid())
+ models.save_models()
+ # Example: get recommendations for user 1
+ print("Top 10 Content-Based Recommendations for user 1:")
+ print(models.get_content_based_recommendations(user_id=1, top_n=10))
+
+ print("========== Step 5: Evaluation ==========")
+ # Time-aware split
+ train_ratings, test_ratings = leave_one_out_by_timestamp(ratings_df)
+ all_items = set(merged_df_with_tfidf['movieId'].astype(str).unique())
+ item_popularity = merged_df_with_tfidf['movieId'].value_counts().to_dict()
+ item_popularity = {str(k): v for k, v in item_popularity.items()}
+ svd_cols = [col for col in unique_movies_reduced.columns if col.startswith("svd_")]
+ item_features = {
+ str(row.movieId): row[svd_cols].values
+ for _, row in unique_movies_reduced.iterrows()
+ }
+
+ # Generate predictions for each model
+ # Implement prediction methods if not present in RecommenderModels
+ def predict_content_based(models, test_df):
+ preds = []
+ for _, row in test_df.iterrows():
+ user_id = row['userId']
+ movie_id = row['movieId']
+ true_rating = row['rating']
+ pred_rating = models.get_content_based_score(user_id, movie_id)
+ preds.append((user_id, movie_id, true_rating, pred_rating, {}))
+ return preds
+
+ def predict_collaborative(models, test_df):
+ preds = []
+ for _, row in test_df.iterrows():
+ user_id = row['userId']
+ movie_id = row['movieId']
+ true_rating = row['rating']
+ # Use SVD as the collaborative model (or knn_user_based if you prefer)
+ try:
+ pred_rating = models.svd_mf.predict(str(user_id), str(movie_id)).est
+ except Exception:
+ pred_rating = 0
+ preds.append((user_id, movie_id, true_rating, pred_rating, {}))
+ return preds
+
+ def predict_hybrid(models, test_df, alpha):
+ preds = []
+ for _, row in test_df.iterrows():
+ user_id = row['userId']
+ movie_id = row['movieId']
+ true_rating = row['rating']
+ pred_rating = models.hybrid_prediction(user_id, movie_id, alpha)
+ preds.append((user_id, movie_id, true_rating, pred_rating, {}))
+ return preds
+
+ predictions_cb = predict_content_based(models, test_ratings)
+ predictions_cf = predict_collaborative(models, test_ratings)
+ predictions_hybrid = predict_hybrid(models, test_ratings, alpha=best_alpha)
+
+ # Evaluate
+ results_cb = evaluate_all(predictions_cb, test_ratings.values, all_items, item_popularity, item_features)
+ results_cf = evaluate_all(predictions_cf, test_ratings.values, all_items, item_popularity, item_features)
+ results_hybrid = evaluate_all(predictions_hybrid, test_ratings.values, all_items, item_popularity, item_features)
+
+ # Print summary table
+ summary = summarize_results({
+ "Content-Based": results_cb,
+ "Collaborative": results_cf,
+ "Hybrid": results_hybrid
+ })
+ print(summary)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/document.pdf b/document.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2952c4d1c98c004436842dfea26881b91e1e3c4d
Binary files /dev/null and b/document.pdf differ
diff --git a/models/recommender_knn_user_based.pkl b/models/recommender_knn_user_based.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..678685fd7c038a3b90f72a1b2eec73c435160343
--- /dev/null
+++ b/models/recommender_knn_user_based.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c2691486ff062618b6cc06aa00397ce7abc72d84a0f2f015e24d1c720ef9a6b
+size 5949691
diff --git a/models/recommender_merged_df_with_tfidf.pkl b/models/recommender_merged_df_with_tfidf.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..30cc6db51cdd257dd261606d66f1b0319dfdc804
--- /dev/null
+++ b/models/recommender_merged_df_with_tfidf.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c0ea5f78039d884abaf095cf23c3320976d8db9865e522136ae27a375c89662
+size 166955859
diff --git a/models/recommender_popular_movies_unique.pkl b/models/recommender_popular_movies_unique.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..2fe8da1f9b855b7e4f4cd2e7d9e00a8668d3a601
--- /dev/null
+++ b/models/recommender_popular_movies_unique.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:095a203a1742406085858fc637e83a6f94e1860ac686c572ec75a2cae80511f6
+size 2922773
diff --git a/models/recommender_svd_mf.pkl b/models/recommender_svd_mf.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..ab07766f30de05eb3e46c97e1925e58544f31187
--- /dev/null
+++ b/models/recommender_svd_mf.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29962c5fd250f73d6d2003b88a13bc2b0ee452c93fa44ee2f69fcb410a2f8770
+size 9661411
diff --git a/models/recommender_unique_movies_reduced.pkl b/models/recommender_unique_movies_reduced.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e8777e974e18d3ea9f66fd7a19b2cb7713884a9e
--- /dev/null
+++ b/models/recommender_unique_movies_reduced.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ddc6fc18ff578c47c86ff53abf985f7af9a74146b5fe265666ad99844b530c85
+size 21384963
diff --git a/models/recommender_user_profiles.pkl b/models/recommender_user_profiles.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1baa2adfbfc557819619b180ac366a38754570fd
--- /dev/null
+++ b/models/recommender_user_profiles.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd803357ddf7dac92ecd6351beb6b980b56d7a514c540a78fa6261965170c68
+size 1490
diff --git a/notebooks/practical.ipynb b/notebooks/practical.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d50d6fe215e6035b88110e9416b0006692178ae0
--- /dev/null
+++ b/notebooks/practical.ipynb
@@ -0,0 +1,11930 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "toc_visible": true,
+ "gpuType": "T4",
+ "collapsed_sections": [
+ "mItH-mJFzmPq",
+ "fOlLSfTdzxar",
+ "f2zJ1cYb0tEU"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Handeling Data"
+ ],
+ "metadata": {
+ "id": "mItH-mJFzmPq"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Load data and quick overview"
+ ],
+ "metadata": {
+ "id": "fOlLSfTdzxar"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# import numpy as np\n",
+ "# print(np.__version__)"
+ ],
+ "metadata": {
+ "id": "bZi0wbANxzTM"
+ },
+ "execution_count": 1,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# !pip install scikit-surprise"
+ ],
+ "metadata": {
+ "id": "YCyPG_JUxLwi"
+ },
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# # Downgrade NumPy and reinstall surprise to link against it\n",
+ "# !pip install \"numpy<2.0\"\n",
+ "# !pip install scikit-surprise"
+ ],
+ "metadata": {
+ "id": "6N-4V-S9xJuz"
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FGbT_cqIAICt",
+ "outputId": "31e73861-599c-46c3-f64c-ab4643759f7a"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Using Colab cache for faster access to the 'the-movies-dataset' dataset.\n",
+ "Path to dataset files: /kaggle/input/the-movies-dataset\n"
+ ]
+ }
+ ],
+ "source": [
+ "import kagglehub\n",
+ "\n",
+ "# Download latest version\n",
+ "path = kagglehub.dataset_download(\"rounakbanik/the-movies-dataset\")\n",
+ "\n",
+ "print(\"Path to dataset files:\", path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "import warnings\n",
+ "from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler\n",
+ "warnings.filterwarnings('ignore')"
+ ],
+ "metadata": {
+ "id": "0LwPddXuATgs"
+ },
+ "execution_count": 5,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "main_path = \"/kaggle/input/the-movies-dataset/\"\n",
+ "movies_metadata_path = main_path + \"movies_metadata.csv\"\n",
+ "credits_path = main_path + \"credits.csv\"\n",
+ "keywords_path = main_path + \"keywords.csv\"\n",
+ "links_path = main_path + \"links_small.csv\"\n",
+ "ratings_path = main_path + \"ratings_small.csv\""
+ ],
+ "metadata": {
+ "id": "DMXrDupJAZII"
+ },
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = pd.read_csv(movies_metadata_path)\n",
+ "df.head().transpose()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 888
+ },
+ "id": "w5ACPvkBAd6t",
+ "outputId": "e58c00fc-c6ed-4bc2-cccc-57b957b852ce"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " 0 \\\n",
+ "adult False \n",
+ "belongs_to_collection {'id': 10194, 'name': 'Toy Story Collection', ... \n",
+ "budget 30000000 \n",
+ "genres [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
+ "homepage http://toystory.disney.com/toy-story \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies [{'name': 'Pixar Animation Studios', 'id': 3}] \n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "status Released \n",
+ "tagline NaN \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "\n",
+ " 1 \\\n",
+ "adult False \n",
+ "belongs_to_collection NaN \n",
+ "budget 65000000 \n",
+ "genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
+ "homepage NaN \n",
+ "id 8844 \n",
+ "imdb_id tt0113497 \n",
+ "original_language en \n",
+ "original_title Jumanji \n",
+ "overview When siblings Judy and Peter discover an encha... \n",
+ "popularity 17.015539 \n",
+ "poster_path /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg \n",
+ "production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ "release_date 1995-12-15 \n",
+ "revenue 262797249.0 \n",
+ "runtime 104.0 \n",
+ "spoken_languages [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
+ "status Released \n",
+ "tagline Roll the dice and unleash the excitement! \n",
+ "title Jumanji \n",
+ "video False \n",
+ "vote_average 6.9 \n",
+ "vote_count 2413.0 \n",
+ "\n",
+ " 2 \\\n",
+ "adult False \n",
+ "belongs_to_collection {'id': 119050, 'name': 'Grumpy Old Men Collect... \n",
+ "budget 0 \n",
+ "genres [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
+ "homepage NaN \n",
+ "id 15602 \n",
+ "imdb_id tt0113228 \n",
+ "original_language en \n",
+ "original_title Grumpier Old Men \n",
+ "overview A family wedding reignites the ancient feud be... \n",
+ "popularity 11.7129 \n",
+ "poster_path /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg \n",
+ "production_companies [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ "release_date 1995-12-22 \n",
+ "revenue 0.0 \n",
+ "runtime 101.0 \n",
+ "spoken_languages [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "status Released \n",
+ "tagline Still Yelling. Still Fighting. Still Ready for... \n",
+ "title Grumpier Old Men \n",
+ "video False \n",
+ "vote_average 6.5 \n",
+ "vote_count 92.0 \n",
+ "\n",
+ " 3 \\\n",
+ "adult False \n",
+ "belongs_to_collection NaN \n",
+ "budget 16000000 \n",
+ "genres [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
+ "homepage NaN \n",
+ "id 31357 \n",
+ "imdb_id tt0114885 \n",
+ "original_language en \n",
+ "original_title Waiting to Exhale \n",
+ "overview Cheated on, mistreated and stepped on, the wom... \n",
+ "popularity 3.859495 \n",
+ "poster_path /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg \n",
+ "production_companies [{'name': 'Twentieth Century Fox Film Corporat... \n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ "release_date 1995-12-22 \n",
+ "revenue 81452156.0 \n",
+ "runtime 127.0 \n",
+ "spoken_languages [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "status Released \n",
+ "tagline Friends are the people who let you be yourself... \n",
+ "title Waiting to Exhale \n",
+ "video False \n",
+ "vote_average 6.1 \n",
+ "vote_count 34.0 \n",
+ "\n",
+ " 4 \n",
+ "adult False \n",
+ "belongs_to_collection {'id': 96871, 'name': 'Father of the Bride Col... \n",
+ "budget 0 \n",
+ "genres [{'id': 35, 'name': 'Comedy'}] \n",
+ "homepage NaN \n",
+ "id 11862 \n",
+ "imdb_id tt0113041 \n",
+ "original_language en \n",
+ "original_title Father of the Bride Part II \n",
+ "overview Just when George Banks has recovered from his ... \n",
+ "popularity 8.387519 \n",
+ "poster_path /e64sOI48hQXyru7naBFyssKFxVd.jpg \n",
+ "production_companies [{'name': 'Sandollar Productions', 'id': 5842}... \n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ "release_date 1995-02-10 \n",
+ "revenue 76578911.0 \n",
+ "runtime 106.0 \n",
+ "spoken_languages [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "status Released \n",
+ "tagline Just When His World Is Back To Normal... He's ... \n",
+ "title Father of the Bride Part II \n",
+ "video False \n",
+ "vote_average 5.7 \n",
+ "vote_count 173.0 "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " adult \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " belongs_to_collection \n",
+ " {'id': 10194, 'name': 'Toy Story Collection', ... \n",
+ " NaN \n",
+ " {'id': 119050, 'name': 'Grumpy Old Men Collect... \n",
+ " NaN \n",
+ " {'id': 96871, 'name': 'Father of the Bride Col... \n",
+ " \n",
+ " \n",
+ " budget \n",
+ " 30000000 \n",
+ " 65000000 \n",
+ " 0 \n",
+ " 16000000 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " genres \n",
+ " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
+ " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
+ " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
+ " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
+ " [{'id': 35, 'name': 'Comedy'}] \n",
+ " \n",
+ " \n",
+ " homepage \n",
+ " http://toystory.disney.com/toy-story \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " id \n",
+ " 862 \n",
+ " 8844 \n",
+ " 15602 \n",
+ " 31357 \n",
+ " 11862 \n",
+ " \n",
+ " \n",
+ " imdb_id \n",
+ " tt0114709 \n",
+ " tt0113497 \n",
+ " tt0113228 \n",
+ " tt0114885 \n",
+ " tt0113041 \n",
+ " \n",
+ " \n",
+ " original_language \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " \n",
+ " \n",
+ " original_title \n",
+ " Toy Story \n",
+ " Jumanji \n",
+ " Grumpier Old Men \n",
+ " Waiting to Exhale \n",
+ " Father of the Bride Part II \n",
+ " \n",
+ " \n",
+ " overview \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " When siblings Judy and Peter discover an encha... \n",
+ " A family wedding reignites the ancient feud be... \n",
+ " Cheated on, mistreated and stepped on, the wom... \n",
+ " Just when George Banks has recovered from his ... \n",
+ " \n",
+ " \n",
+ " popularity \n",
+ " 21.946943 \n",
+ " 17.015539 \n",
+ " 11.7129 \n",
+ " 3.859495 \n",
+ " 8.387519 \n",
+ " \n",
+ " \n",
+ " poster_path \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg \n",
+ " /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg \n",
+ " /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg \n",
+ " /e64sOI48hQXyru7naBFyssKFxVd.jpg \n",
+ " \n",
+ " \n",
+ " production_companies \n",
+ " [{'name': 'Pixar Animation Studios', 'id': 3}] \n",
+ " [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n",
+ " [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n",
+ " [{'name': 'Twentieth Century Fox Film Corporat... \n",
+ " [{'name': 'Sandollar Productions', 'id': 5842}... \n",
+ " \n",
+ " \n",
+ " production_countries \n",
+ " [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ " [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ " [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ " [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ " [{'iso_3166_1': 'US', 'name': 'United States o... \n",
+ " \n",
+ " \n",
+ " release_date \n",
+ " 1995-10-30 \n",
+ " 1995-12-15 \n",
+ " 1995-12-22 \n",
+ " 1995-12-22 \n",
+ " 1995-02-10 \n",
+ " \n",
+ " \n",
+ " revenue \n",
+ " 373554033.0 \n",
+ " 262797249.0 \n",
+ " 0.0 \n",
+ " 81452156.0 \n",
+ " 76578911.0 \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 81.0 \n",
+ " 104.0 \n",
+ " 101.0 \n",
+ " 127.0 \n",
+ " 106.0 \n",
+ " \n",
+ " \n",
+ " spoken_languages \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ " \n",
+ " \n",
+ " status \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " \n",
+ " \n",
+ " tagline \n",
+ " NaN \n",
+ " Roll the dice and unleash the excitement! \n",
+ " Still Yelling. Still Fighting. Still Ready for... \n",
+ " Friends are the people who let you be yourself... \n",
+ " Just When His World Is Back To Normal... He's ... \n",
+ " \n",
+ " \n",
+ " title \n",
+ " Toy Story \n",
+ " Jumanji \n",
+ " Grumpier Old Men \n",
+ " Waiting to Exhale \n",
+ " Father of the Bride Part II \n",
+ " \n",
+ " \n",
+ " video \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 7.7 \n",
+ " 6.9 \n",
+ " 6.5 \n",
+ " 6.1 \n",
+ " 5.7 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 5415.0 \n",
+ " 2413.0 \n",
+ " 92.0 \n",
+ " 34.0 \n",
+ " 173.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Missing values in Train DataFrame:\")\n",
+ "missing_df = df.isnull().sum().sort_values(ascending=False)\n",
+ "missing_df_percent = (missing_df / len(df)) * 100\n",
+ "missing_df_info = pd.DataFrame({'Missing Count': missing_df, 'Missing Percentage (%)': missing_df_percent})\n",
+ "display(missing_df_info[missing_df_info['Missing Count'] > 0])"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 663
+ },
+ "id": "-qjw9Wl86F70",
+ "outputId": "a39dc425-e3d5-4349-b9d3-141b7eb76729"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Missing values in Train DataFrame:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " Missing Count Missing Percentage (%)\n",
+ "belongs_to_collection 40972 90.115691\n",
+ "homepage 37684 82.883913\n",
+ "tagline 25054 55.104914\n",
+ "overview 954 2.098271\n",
+ "poster_path 386 0.848986\n",
+ "runtime 263 0.578454\n",
+ "status 87 0.191352\n",
+ "release_date 87 0.191352\n",
+ "imdb_id 17 0.037391\n",
+ "original_language 11 0.024194\n",
+ "vote_average 6 0.013197\n",
+ "vote_count 6 0.013197\n",
+ "title 6 0.013197\n",
+ "video 6 0.013197\n",
+ "spoken_languages 6 0.013197\n",
+ "revenue 6 0.013197\n",
+ "popularity 5 0.010997\n",
+ "production_countries 3 0.006598\n",
+ "production_companies 3 0.006598"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Missing Count \n",
+ " Missing Percentage (%) \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " belongs_to_collection \n",
+ " 40972 \n",
+ " 90.115691 \n",
+ " \n",
+ " \n",
+ " homepage \n",
+ " 37684 \n",
+ " 82.883913 \n",
+ " \n",
+ " \n",
+ " tagline \n",
+ " 25054 \n",
+ " 55.104914 \n",
+ " \n",
+ " \n",
+ " overview \n",
+ " 954 \n",
+ " 2.098271 \n",
+ " \n",
+ " \n",
+ " poster_path \n",
+ " 386 \n",
+ " 0.848986 \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 263 \n",
+ " 0.578454 \n",
+ " \n",
+ " \n",
+ " status \n",
+ " 87 \n",
+ " 0.191352 \n",
+ " \n",
+ " \n",
+ " release_date \n",
+ " 87 \n",
+ " 0.191352 \n",
+ " \n",
+ " \n",
+ " imdb_id \n",
+ " 17 \n",
+ " 0.037391 \n",
+ " \n",
+ " \n",
+ " original_language \n",
+ " 11 \n",
+ " 0.024194 \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 6 \n",
+ " 0.013197 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 6 \n",
+ " 0.013197 \n",
+ " \n",
+ " \n",
+ " title \n",
+ " 6 \n",
+ " 0.013197 \n",
+ " \n",
+ " \n",
+ " video \n",
+ " 6 \n",
+ " 0.013197 \n",
+ " \n",
+ " \n",
+ " spoken_languages \n",
+ " 6 \n",
+ " 0.013197 \n",
+ " \n",
+ " \n",
+ " revenue \n",
+ " 6 \n",
+ " 0.013197 \n",
+ " \n",
+ " \n",
+ " popularity \n",
+ " 5 \n",
+ " 0.010997 \n",
+ " \n",
+ " \n",
+ " production_countries \n",
+ " 3 \n",
+ " 0.006598 \n",
+ " \n",
+ " \n",
+ " production_companies \n",
+ " 3 \n",
+ " 0.006598 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(missing_df_info[missing_df_info['Missing Count'] > 0])\",\n \"rows\": 19,\n \"fields\": [\n {\n \"column\": \"Missing Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13210,\n \"min\": 3,\n \"max\": 40972,\n \"num_unique_values\": 12,\n \"samples\": [\n 5,\n 6,\n 40972\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Missing Percentage (%)\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 29.05500420393128,\n \"min\": 0.006598337219020807,\n \"max\": 90.11569084590683,\n \"num_unique_values\": 12,\n \"samples\": [\n 0.010997228698368012,\n 0.013196674438041614,\n 90.11569084590683\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ca536ab9"
+ },
+ "source": [
+ "## Missing values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cb672ebd",
+ "outputId": "d0191704-7afa-4589-e0e6-3020049d224f"
+ },
+ "source": [
+ "# Analyze missing value percentages from missing_df_info\n",
+ "high_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] > 50].index.tolist()\n",
+ "moderate_missing_cols = missing_df_info[(missing_df_info['Missing Percentage (%)'] <= 50) & (missing_df_info['Missing Percentage (%)'] > 1)].index.tolist()\n",
+ "low_missing_cols = missing_df_info[missing_df_info['Missing Percentage (%)'] <= 1].index.tolist()\n",
+ "\n",
+ "print(\"Columns with high missing percentage (>50%):\", high_missing_cols)\n",
+ "print(\"Columns with moderate missing percentage (1%-50%):\", moderate_missing_cols)\n",
+ "print(\"Columns with low missing percentage (<=1%):\", low_missing_cols)\n",
+ "\n",
+ "# Determine handling strategy for each column with missing values\n",
+ "handling_strategy = {}\n",
+ "\n",
+ "# High missing percentage: Consider dropping\n",
+ "for col in high_missing_cols:\n",
+ " handling_strategy[col] = \"Drop column due to high missing percentage\"\n",
+ "\n",
+ "# Moderate missing percentage: Consider imputation based on data type\n",
+ "# 'overview' is text, imputation might not be meaningful, could consider dropping or using a placeholder\n",
+ "if 'overview' in moderate_missing_cols:\n",
+ " handling_strategy['overview'] = \"Consider dropping or using a placeholder for text data\"\n",
+ "\n",
+ "# Low missing percentage: Consider imputation or dropping rows\n",
+ "# For numerical columns like 'runtime', 'vote_average', 'vote_count', 'revenue', 'popularity', consider mean/median imputation\n",
+ "numerical_low_missing = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']\n",
+ "for col in low_missing_cols:\n",
+ " if col in numerical_low_missing:\n",
+ " handling_strategy[col] = \"Impute with mean or median\"\n",
+ " elif col in ['status', 'release_date', 'imdb_id', 'original_language', 'title', 'video', 'spoken_languages', 'production_countries', 'production_companies', 'poster_path']:\n",
+ " handling_strategy[col] = \"Consider imputation (e.g., mode, placeholder) or dropping rows\"\n",
+ "\n",
+ "\n",
+ "# Special handling for 'adult' column based on previous observation\n",
+ "handling_strategy['adult'] = \"Investigate and potentially remove incorrect entries (e.g., 'R')\"\n",
+ "\n",
+ "\n",
+ "print(\"\\nIdentified Handling Strategy for each column with missing values:\")\n",
+ "for col, strategy in handling_strategy.items():\n",
+ " print(f\"- {col}: {strategy}\")"
+ ],
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Columns with high missing percentage (>50%): ['belongs_to_collection', 'homepage', 'tagline']\n",
+ "Columns with moderate missing percentage (1%-50%): ['overview']\n",
+ "Columns with low missing percentage (<=1%): ['poster_path', 'runtime', 'status', 'release_date', 'imdb_id', 'original_language', 'vote_average', 'vote_count', 'title', 'video', 'spoken_languages', 'revenue', 'popularity', 'production_countries', 'production_companies', 'genres', 'id', 'adult', 'budget', 'original_title']\n",
+ "\n",
+ "Identified Handling Strategy for each column with missing values:\n",
+ "- belongs_to_collection: Drop column due to high missing percentage\n",
+ "- homepage: Drop column due to high missing percentage\n",
+ "- tagline: Drop column due to high missing percentage\n",
+ "- overview: Consider dropping or using a placeholder for text data\n",
+ "- poster_path: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- runtime: Impute with mean or median\n",
+ "- status: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- release_date: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- imdb_id: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- original_language: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- vote_average: Impute with mean or median\n",
+ "- vote_count: Impute with mean or median\n",
+ "- title: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- video: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- spoken_languages: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- revenue: Impute with mean or median\n",
+ "- popularity: Impute with mean or median\n",
+ "- production_countries: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- production_companies: Consider imputation (e.g., mode, placeholder) or dropping rows\n",
+ "- adult: Investigate and potentially remove incorrect entries (e.g., 'R')\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 798
+ },
+ "id": "7e6ffa12",
+ "outputId": "c0d8f225-fd5b-4a5b-d659-660cc75647db"
+ },
+ "source": [
+ "# 1. Drop columns with high missing percentages\n",
+ "cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']\n",
+ "df = df.drop(columns=cols_to_drop)\n",
+ "\n",
+ "# 2. Fill missing values in 'overview' with a placeholder\n",
+ "df['overview'] = df['overview'].fillna('No overview available')\n",
+ "\n",
+ "# Inspect the 'popularity' column to identify the non-numeric values\n",
+ "print(df['popularity'].dtype)\n",
+ "print(df['popularity'].unique())\n",
+ "\n",
+ "# Attempt to convert 'popularity' to numeric, coercing errors\n",
+ "df['popularity'] = pd.to_numeric(df['popularity'], errors='coerce')\n",
+ "\n",
+ "# Now retry the imputation for numerical columns\n",
+ "numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']\n",
+ "for col in numerical_cols_to_impute:\n",
+ " if col in df.columns:\n",
+ " median_val = df[col].median()\n",
+ " df[col] = df[col].fillna(median_val)\n",
+ "\n",
+ "# Fill the remaining low-missing-percentage columns with an 'Unknown' placeholder\n",
+ "remaining_missing_cols = df.columns[df.isnull().any()].tolist()\n",
+ "cols_to_fill_unknown = [col for col in remaining_missing_cols if col not in numerical_cols_to_impute and col != 'adult']\n",
+ "\n",
+ "for col in cols_to_fill_unknown:\n",
+ " df[col] = df[col].fillna('Unknown')\n",
+ "\n",
+ "# Investigate and remove incorrect entries in 'adult' column\n",
+ "df = df[df['adult'].isin(['True', 'False'])]\n",
+ "\n",
+ "# Verify that missing values have been handled\n",
+ "print(\"Missing values after handling:\")\n",
+ "display(df.isnull().sum().sort_values(ascending=False))"
+ ],
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "object\n",
+ "[21.946943 17.015539 11.7129 ... '0.903007' '0.003503' '0.163015']\n",
+ "Missing values after handling:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "adult 0\n",
+ "budget 0\n",
+ "genres 0\n",
+ "id 0\n",
+ "imdb_id 0\n",
+ "original_language 0\n",
+ "original_title 0\n",
+ "overview 0\n",
+ "popularity 0\n",
+ "poster_path 0\n",
+ "production_companies 0\n",
+ "production_countries 0\n",
+ "release_date 0\n",
+ "revenue 0\n",
+ "runtime 0\n",
+ "spoken_languages 0\n",
+ "status 0\n",
+ "title 0\n",
+ "video 0\n",
+ "vote_average 0\n",
+ "vote_count 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " adult \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " budget \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " genres \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " id \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " imdb_id \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " original_language \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " original_title \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " overview \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " popularity \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " poster_path \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " production_companies \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " production_countries \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " release_date \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " revenue \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " spoken_languages \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " status \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " title \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " video \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "nojk-XGx_FVC",
+ "outputId": "e523e653-dc9c-42ea-c22e-6b41e2928110"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(45463, 21)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "credits_df = pd.read_csv(credits_path)\n",
+ "keywords_df = pd.read_csv(keywords_path)\n",
+ "links_df = pd.read_csv(links_path)\n",
+ "ratings_df = pd.read_csv(ratings_path)"
+ ],
+ "metadata": {
+ "id": "vlOqRvI2_KgD"
+ },
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "credits_df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "uzsL5e9V_phr",
+ "outputId": "13aaf194-564e-449d-aedc-9e3bc638bd21"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['cast', 'crew', 'id'], dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "credits_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "aNvLTmpj_qwj",
+ "outputId": "e52b7497-9c0c-4ad6-be79-8edffc76b46a"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " cast \\\n",
+ "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
+ "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
+ "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
+ "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
+ "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
+ "\n",
+ " crew id \n",
+ "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
+ "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
+ "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
+ "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
+ "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " cast \n",
+ " crew \n",
+ " id \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
+ " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... \n",
+ " 862 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
+ " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... \n",
+ " 8844 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
+ " [{'credit_id': '52fe466a9251416c75077a89', 'de... \n",
+ " 15602 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
+ " [{'credit_id': '52fe44779251416c91011acb', 'de... \n",
+ " 31357 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " [{'cast_id': 1, 'character': 'George Banks', '... \n",
+ " [{'credit_id': '52fe44959251416c75039ed7', 'de... \n",
+ " 11862 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "credits_df",
+ "summary": "{\n \"name\": \"credits_df\",\n \"rows\": 45476,\n \"fields\": [\n {\n \"column\": \"cast\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 43019,\n \"samples\": [\n \"[{'cast_id': 2, 'character': 'Lazar Peacock/Sabata', 'credit_id': '52fe4bca9251416c7510e195', 'gender': 2, 'id': 20581, 'name': 'Jack Betts', 'order': 0, 'profile_path': '/f03shMGYcbPG2EyjkIVAR0YA1RA.jpg'}, {'cast_id': 3, 'character': 'Blonde', 'credit_id': '52fe4bca9251416c7510e199', 'gender': 0, 'id': 100683, 'name': 'Franco Borelli', 'order': 1, 'profile_path': None}, {'cast_id': 4, 'character': 'Roger Murdock', 'credit_id': '52fe4bca9251416c7510e19d', 'gender': 2, 'id': 30898, 'name': 'Gordon Mitchell', 'order': 2, 'profile_path': '/szzvsqfFlkHBUJEiZtRquIhxHqn.jpg'}, {'cast_id': 5, 'character': 'Maya', 'credit_id': '52fe4bca9251416c7510e1a1', 'gender': 1, 'id': 30902, 'name': 'Simonetta Vitelli', 'order': 3, 'profile_path': '/jMnRUMgLV3l6lyB26bd8t9b11m5.jpg'}]\",\n \"[{'cast_id': 2, 'character': 'Charles', 'credit_id': '590cf25dc3a36864c60039ff', 'gender': 2, 'id': 11276, 'name': 'Tim Pigott-Smith', 'order': 1, 'profile_path': '/yC5fQ2HYxzD5JqnXZKMJ6giExrU.jpg'}, {'cast_id': 3, 'character': 'Kate Middleton', 'credit_id': '590cf268c3a36864fc003a3b', 'gender': 1, 'id': 115679, 'name': 'Charlotte Riley', 'order': 2, 'profile_path': '/pkiZKysfb0oXvaBBm6zWQkWSvVu.jpg'}, {'cast_id': 4, 'character': 'William', 'credit_id': '590cf2769251414e85003b16', 'gender': 2, 'id': 31739, 'name': 'Oliver Chris', 'order': 3, 'profile_path': '/xTnUMtP5MREaHD86XfJ5mYibawq.jpg'}, {'cast_id': 5, 'character': 'Prime Minister Tristram Evans', 'credit_id': '590cf284c3a36864c6003a15', 'gender': 2, 'id': 47933, 'name': 'Adam James', 'order': 4, 'profile_path': '/4dSIRIEEnK2tC1OrgjEykUvOeFw.jpg'}, {'cast_id': 7, 'character': 'Harry', 'credit_id': '590cf2bd9251414e8d0038d1', 'gender': 0, 'id': 1409393, 'name': 'Richard Goulding', 'order': 6, 'profile_path': 
'/3vM6hrfcU4NLBvWoRxyo8nkPsDU.jpg'}, {'cast_id': 8, 'character': 'Coottsey', 'credit_id': '590cf2e5c3a36864ec0036e3', 'gender': 2, 'id': 1428460, 'name': 'Max Bennett', 'order': 7, 'profile_path': '/fthD8U3aGnQioWAiwvFPcdDQJRV.jpg'}, {'cast_id': 9, 'character': 'Jess', 'credit_id': '590ef2e29251414ead01c7d5', 'gender': 0, 'id': 1595457, 'name': 'Tamara Lawrance', 'order': 8, 'profile_path': None}, {'cast_id': 10, 'character': 'Camilla', 'credit_id': '590ef2f79251414eca01c988', 'gender': 1, 'id': 192933, 'name': 'Margot Leicester', 'order': 9, 'profile_path': '/M2PEeYUdkrd4VjI1D0lsHbiG8t.jpg'}, {'cast_id': 11, 'character': 'James Reiss', 'credit_id': '590ef30fc3a36864d401e229', 'gender': 2, 'id': 15740, 'name': 'Tim McMullan', 'order': 10, 'profile_path': '/8se9JhmD9LE6tiibkGiV51M8rdD.jpg'}, {'cast_id': 12, 'character': 'Mrs Stevens', 'credit_id': '590ef326c3a36864fc01d3f7', 'gender': 0, 'id': 62968, 'name': 'Priyanga Burford', 'order': 11, 'profile_path': '/yTxLb30QwUAs5aoErhnYsnGawG5.jpg'}, {'cast_id': 13, 'character': 'Diana', 'credit_id': '590ef334c3a36864fc01d3ff', 'gender': 1, 'id': 1528819, 'name': 'Katie Brayben', 'order': 12, 'profile_path': '/m7oOBu4cfamQ9wixTyWXNaH9sgn.jpg'}, {'cast_id': 14, 'character': 'Archbishop of Canterbury', 'credit_id': '590ef372c3a368650a01c818', 'gender': 2, 'id': 940, 'name': 'John Shrapnel', 'order': 13, 'profile_path': '/nDIK01IoVNx7cfYOrKqGugItqO9.jpg'}, {'cast_id': 15, 'character': 'Spencer', 'credit_id': '590ef37e9251414ead01c82c', 'gender': 0, 'id': 1455682, 'name': 'Parth Thakerar', 'order': 14, 'profile_path': None}]\",\n \"[{'cast_id': 1, 'character': 'Himself', 'credit_id': '52fe4a9bc3a368484e15d20d', 'gender': 0, 'id': 1078721, 'name': 'Armand Leroi', 'order': 0, 'profile_path': None}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"crew\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 44669,\n \"samples\": [\n \"[{'credit_id': '52fe461ac3a36847f80eccfd', 
'department': 'Directing', 'gender': 2, 'id': 107463, 'job': 'Director', 'name': 'Del Tenney', 'profile_path': None}, {'credit_id': '52fe461ac3a36847f80ecd13', 'department': 'Writing', 'gender': 0, 'id': 107464, 'job': 'Screenplay', 'name': 'Richard Hilliard', 'profile_path': None}, {'credit_id': '52fe461ac3a36847f80ecd19', 'department': 'Production', 'gender': 2, 'id': 107463, 'job': 'Producer', 'name': 'Del Tenney', 'profile_path': None}]\",\n \"[{'credit_id': '52fe45439251416c9102c5bd', 'department': 'Directing', 'gender': 2, 'id': 93975, 'job': 'Director', 'name': 'Lewis Allen', 'profile_path': None}, {'credit_id': '52fe45439251416c9102c5c3', 'department': 'Writing', 'gender': 0, 'id': 111580, 'job': 'Novel', 'name': 'Tiffany Thayer', 'profile_path': None}, {'credit_id': '52fe45439251416c9102c5c9', 'department': 'Writing', 'gender': 2, 'id': 10148, 'job': 'Writer', 'name': 'Warren Duff', 'profile_path': None}, {'credit_id': '52fe45439251416c9102c5cf', 'department': 'Production', 'gender': 2, 'id': 50311, 'job': 'Producer', 'name': 'Robert Fellows', 'profile_path': None}, {'credit_id': '52fe45439251416c9102c5d5', 'department': 'Sound', 'gender': 2, 'id': 26026, 'job': 'Original Music Composer', 'name': 'Victor Young', 'profile_path': None}, {'credit_id': '52fe45439251416c9102c5db', 'department': 'Camera', 'gender': 2, 'id': 8620, 'job': 'Director of Photography', 'name': 'John F. Seitz', 'profile_path': '/6hvivkKP5H5NpPcAViAfUMFgqsu.jpg'}, {'credit_id': '52fe45439251416c9102c5e1', 'department': 'Editing', 'gender': 2, 'id': 30013, 'job': 'Editor', 'name': 'LeRoy Stone', 'profile_path': None}]\",\n \"[{'credit_id': '52fe45319251416c7504eab9', 'department': 'Writing', 'gender': 2, 'id': 14999, 'job': 'Screenplay', 'name': 'George A. Romero', 'profile_path': '/zNP7wdy48eNNJAAmM0pYbSelUAd.jpg'}, {'credit_id': '52fe45319251416c7504ea8b', 'department': 'Directing', 'gender': 2, 'id': 14999, 'job': 'Director', 'name': 'George A. 
Romero', 'profile_path': '/zNP7wdy48eNNJAAmM0pYbSelUAd.jpg'}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 112443,\n \"min\": 2,\n \"max\": 469172,\n \"num_unique_values\": 45432,\n \"samples\": [\n 43942,\n 30139,\n 85389\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "credits_df.isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 178
+ },
+ "id": "UcXtNeMC_t40",
+ "outputId": "8594c43f-73a4-48fd-aafe-71bc2c9d99ed"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "cast 0\n",
+ "crew 0\n",
+ "id 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " cast \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " crew \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " id \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "keywords_df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2zXQS7w8_6JE",
+ "outputId": "aa1177d4-7b46-4709-e546-e4b61560e246"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['id', 'keywords'], dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "keywords_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "Rg6YR2wS__R-",
+ "outputId": "3c47e0f7-6329-4e75-88c7-85fd3c319a9e"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id keywords\n",
+ "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
+ "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
+ "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
+ "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
+ "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " keywords \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 862 \n",
+ " [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 8844 \n",
+ " [{'id': 10090, 'name': 'board game'}, {'id': 1... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 15602 \n",
+ " [{'id': 1495, 'name': 'fishing'}, {'id': 12392... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 31357 \n",
+ " [{'id': 818, 'name': 'based on novel'}, {'id':... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 11862 \n",
+ " [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "keywords_df",
+ "summary": "{\n \"name\": \"keywords_df\",\n \"rows\": 46419,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 113045,\n \"min\": 2,\n \"max\": 469172,\n \"num_unique_values\": 45432,\n \"samples\": [\n 43942,\n 30139,\n 85389\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"keywords\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 25989,\n \"samples\": [\n \"[{'id': 2173, 'name': 'illegal immigration'}, {'id': 2356, 'name': 'immigrant'}, {'id': 187056, 'name': 'woman director'}, {'id': 193536, 'name': 'cyprus'}]\",\n \"[{'id': 5657, 'name': 'australia'}]\",\n \"[{'id': 220, 'name': 'berlin'}, {'id': 351, 'name': 'poison'}, {'id': 407, 'name': 'dictator'}, {'id': 1443, 'name': 'clerk'}, {'id': 1698, 'name': 'ideology'}, {'id': 1956, 'name': 'world war ii'}, {'id': 2052, 'name': 'traitor'}, {'id': 2300, 'name': 'despair'}, {'id': 2535, 'name': 'destroy'}, {'id': 2850, 'name': 'testament'}, {'id': 3054, 'name': 'capitulation'}, {'id': 3055, 'name': 'soviet troops'}, {'id': 3060, 'name': 'race politics'}, {'id': 3061, 'name': 'national socialism'}, {'id': 3063, 'name': 'adolf hitler'}, {'id': 3064, 'name': 'ultimate victory'}, {'id': 3065, 'name': 'minister'}, {'id': 3068, 'name': 'national socialist party'}, {'id': 3737, 'name': 'dying and death'}, {'id': 6165, 'name': 'historical figure'}, {'id': 33671, 'name': 'german shepherd'}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "keywords_df.isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 147
+ },
+ "id": "IIRGyn0LAFu3",
+ "outputId": "91569b0f-6a5c-48b0-9ffb-4f6f6c14ea0f"
+ },
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "keywords 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " keywords \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "links_df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1S5prkCHAAr1",
+ "outputId": "4296f32b-03cb-4a3a-c517-0cf5813febe1"
+ },
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "links_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "Z5vGH8oqACYq",
+ "outputId": "30be59a7-1281-41ea-dade-994314d293fe"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " movieId imdbId tmdbId\n",
+ "0 1 114709 862.0\n",
+ "1 2 113497 8844.0\n",
+ "2 3 113228 15602.0\n",
+ "3 4 114885 31357.0\n",
+ "4 5 113041 11862.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " imdbId \n",
+ " tmdbId \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 114709 \n",
+ " 862.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 2 \n",
+ " 113497 \n",
+ " 8844.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 3 \n",
+ " 113228 \n",
+ " 15602.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 4 \n",
+ " 114885 \n",
+ " 31357.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 5 \n",
+ " 113041 \n",
+ " 11862.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "links_df",
+ "summary": "{\n \"name\": \"links_df\",\n \"rows\": 9125,\n \"fields\": [\n {\n \"column\": \"movieId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40782,\n \"min\": 1,\n \"max\": 164979,\n \"num_unique_values\": 9125,\n \"samples\": [\n 3890,\n 6033,\n 3922\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"imdbId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 743177,\n \"min\": 417,\n \"max\": 5794766,\n \"num_unique_values\": 9125,\n \"samples\": [\n 259207,\n 102500,\n 57887\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tmdbId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 62814.519801328424,\n \"min\": 2.0,\n \"max\": 416437.0,\n \"num_unique_values\": 9112,\n \"samples\": [\n 992.0,\n 13019.0,\n 10547.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "links_df.isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 178
+ },
+ "id": "n2ITpqPoAJ1L",
+ "outputId": "bc6680a3-c524-470a-8fd0-494fa20efe7e"
+ },
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "movieId 0\n",
+ "imdbId 0\n",
+ "tmdbId 13\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " imdbId \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " tmdbId \n",
+ " 13 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "links_df = links_df.dropna(subset=['tmdbId'])\n",
+ "print(\"Missing values in links_df after handling:\")\n",
+ "display(links_df.isnull().sum())"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 197
+ },
+ "id": "2qqxtTepAL2w",
+ "outputId": "e5fffcb2-dfac-4fb9-f2eb-eac24362f948"
+ },
+ "execution_count": 22,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Missing values in links_df after handling:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "movieId 0\n",
+ "imdbId 0\n",
+ "tmdbId 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " imdbId \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " tmdbId \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ratings_df.columns"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UqiVcys4AY3A",
+ "outputId": "246afaf7-c249-4262-d5a6-799478dc3213"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ratings_df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "3aaLSv9cAaez",
+ "outputId": "44b94543-0406-49a0-e749-8f326959e8e3"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " userId movieId rating timestamp\n",
+ "0 1 31 2.5 1260759144\n",
+ "1 1 1029 3.0 1260759179\n",
+ "2 1 1061 3.0 1260759182\n",
+ "3 1 1129 2.0 1260759185\n",
+ "4 1 1172 4.0 1260759205"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " userId \n",
+ " movieId \n",
+ " rating \n",
+ " timestamp \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 31 \n",
+ " 2.5 \n",
+ " 1260759144 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 \n",
+ " 1029 \n",
+ " 3.0 \n",
+ " 1260759179 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1 \n",
+ " 1061 \n",
+ " 3.0 \n",
+ " 1260759182 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1 \n",
+ " 1129 \n",
+ " 2.0 \n",
+ " 1260759185 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1 \n",
+ " 1172 \n",
+ " 4.0 \n",
+ " 1260759205 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "ratings_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Columns of df:\", df.columns)\n",
+ "print(\"Columns of credits_df:\", credits_df.columns)\n",
+ "print(\"Columns of keywords_df:\", keywords_df.columns)\n",
+ "print(\"Columns of links_df:\", links_df.columns)\n",
+ "print(\"Columns of ratings_df:\", ratings_df.columns)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "aKEeG8RKAgDM",
+ "outputId": "fd2f5cfe-7b28-47dd-f040-0dd1152cc2e4"
+ },
+ "execution_count": 25,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Columns of df: Index(['adult', 'budget', 'genres', 'id', 'imdb_id', 'original_language',\n",
+ " 'original_title', 'overview', 'popularity', 'poster_path',\n",
+ " 'production_companies', 'production_countries', 'release_date',\n",
+ " 'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',\n",
+ " 'vote_average', 'vote_count'],\n",
+ " dtype='object')\n",
+ "Columns of credits_df: Index(['cast', 'crew', 'id'], dtype='object')\n",
+ "Columns of keywords_df: Index(['id', 'keywords'], dtype='object')\n",
+ "Columns of links_df: Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')\n",
+ "Columns of ratings_df: Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Missing values in 'id' column of df:\", df['id'].isnull().sum())\n",
+ "print(\"Missing values in 'id' column of credits_df:\", credits_df['id'].isnull().sum())\n",
+ "print(\"Missing values in 'id' column of keywords_df:\", keywords_df['id'].isnull().sum())\n",
+ "print(\"Missing values in 'tmdbId' column of links_df:\", links_df['tmdbId'].isnull().sum())\n",
+ "print(\"Missing values in 'movieId' column of links_df:\", links_df['movieId'].isnull().sum())\n",
+ "print(\"Missing values in 'movieId' column of ratings_df:\", ratings_df['movieId'].isnull().sum())"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "SsIEC5s9AiLw",
+ "outputId": "5087c752-7516-400f-bd59-7ec7e5b870ec"
+ },
+ "execution_count": 26,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Missing values in 'id' column of df: 0\n",
+ "Missing values in 'id' column of credits_df: 0\n",
+ "Missing values in 'id' column of keywords_df: 0\n",
+ "Missing values in 'tmdbId' column of links_df: 0\n",
+ "Missing values in 'movieId' column of links_df: 0\n",
+ "Missing values in 'movieId' column of ratings_df: 0\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Data type of 'id' in df:\", df['id'].dtype)\n",
+ "print(\"Data type of 'id' in credits_df:\", credits_df['id'].dtype)\n",
+ "print(\"Data type of 'id' in keywords_df:\", keywords_df['id'].dtype)\n",
+ "print(\"Data type of 'tmdbId' in links_df:\", links_df['tmdbId'].dtype)\n",
+ "print(\"Data type of 'movieId' in links_df:\", links_df['movieId'].dtype)\n",
+ "print(\"Data type of 'movieId' in ratings_df:\", ratings_df['movieId'].dtype)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kxfDjt7jAkcy",
+ "outputId": "bd583628-6de2-4bbd-d4e7-8e4d9aa5969d"
+ },
+ "execution_count": 27,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Data type of 'id' in df: object\n",
+ "Data type of 'id' in credits_df: int64\n",
+ "Data type of 'id' in keywords_df: int64\n",
+ "Data type of 'tmdbId' in links_df: float64\n",
+ "Data type of 'movieId' in links_df: int64\n",
+ "Data type of 'movieId' in ratings_df: int64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['id'] = pd.to_numeric(df['id'], errors='coerce')\n",
+ "df.dropna(subset=['id'], inplace=True)\n",
+ "df['id'] = df['id'].astype(int)\n",
+ "\n",
+ "links_df.dropna(subset=['tmdbId'], inplace=True)\n",
+ "links_df['tmdbId'] = links_df['tmdbId'].astype(int)\n",
+ "\n",
+ "print(\"Updated data type of 'id' in df:\", df['id'].dtype)\n",
+ "print(\"Updated data type of 'tmdbId' in links_df:\", links_df['tmdbId'].dtype)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bFS3-mfoAoBe",
+ "outputId": "5d7dedbf-74f0-4456-91da-45c324695ce2"
+ },
+ "execution_count": 28,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Updated data type of 'id' in df: int64\n",
+ "Updated data type of 'tmdbId' in links_df: int64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Handling JSON-Formatted Columns"
+ ],
+ "metadata": {
+ "id": "en2HwfRJ0Ob1"
+ }
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "f7780393",
+ "outputId": "43b0c336-5653-40e8-f639-6483ccd3d813"
+ },
+ "source": [
+ "import json\n",
+ "\n",
+ "def is_json_like(value):\n",
+ " \"\"\"Checks if a string value is likely a JSON array or object.\"\"\"\n",
+ " if isinstance(value, str):\n",
+ " value = value.strip()\n",
+ " if (value.startswith('[') and value.endswith(']')) or \\\n",
+ " (value.startswith('{') and value.endswith('}')):\n",
+ " try:\n",
+ " json.loads(value)\n",
+ " return True\n",
+ " except json.JSONDecodeError:\n",
+ " pass\n",
+ " return False\n",
+ "\n",
+ "# Identify columns that might contain JSON\n",
+ "json_columns = []\n",
+ "for col in df.columns:\n",
+ " # Check a sample of non-null values to see if they are json-like\n",
+ " sample_values = df[col].dropna().sample(min(100, len(df[col].dropna())))\n",
+ " if any(sample_values.apply(is_json_like)):\n",
+ " json_columns.append(col)\n",
+ "\n",
+ "print(\"Columns that appear to contain JSON format:\", json_columns)"
+ ],
+ "execution_count": 29,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Columns that appear to contain JSON format: ['genres', 'production_companies', 'production_countries', 'spoken_languages']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "copy_df = df.copy()"
+ ],
+ "metadata": {
+ "id": "L_2EfSbDCYgt"
+ },
+ "execution_count": 30,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import ast\n",
+ "import numpy as np\n",
+ "\n",
+ "def extract_names_and_handle_empty(json_list_string):\n",
+ " \"\"\"Extracts names from a string representation of a list of dictionaries and handles empty lists as NaN.\"\"\"\n",
+ " if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):\n",
+ " try:\n",
+ " # Use ast.literal_eval to safely evaluate the string as a Python literal\n",
+ " data_list = ast.literal_eval(json_list_string)\n",
+ " if isinstance(data_list, list):\n",
+ " if not data_list: # Check if the list is empty\n",
+ " return np.nan # Replace empty list with NaN\n",
+ " names = [item.get('name', '') for item in data_list if isinstance(item, dict) and 'name' in item]\n",
+ " return ', '.join(names)\n",
+ " except (ValueError, SyntaxError):\n",
+ " pass\n",
+ " return '' # Return empty string for non-list strings or evaluation errors\n",
+ "\n",
+ "# Columns identified as containing JSON format\n",
+ "json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']\n",
+ "\n",
+ "# Apply the extraction function and handle empty lists to copy_df\n",
+ "for col in json_columns:\n",
+ " if col in copy_df.columns:\n",
+ " copy_df[col] = copy_df[col].apply(extract_names_and_handle_empty)\n",
+ " # Now fill the NaN values (which were empty lists) with 'Unknown'\n",
+ " copy_df[col] = copy_df[col].fillna('Unknown')\n",
+ "\n",
+ "# Display the head of the modified copy_df DataFrame to confirm\n",
+ "display(copy_df.head().transpose())"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 742
+ },
+ "id": "OJcNQGcDFoJj",
+ "outputId": "7233b53c-5dad-4557-f48a-ecdbafa6425f"
+ },
+ "execution_count": 31,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " 0 \\\n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "genres Animation, Comedy, Family \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "\n",
+ " 1 \\\n",
+ "adult False \n",
+ "budget 65000000 \n",
+ "genres Adventure, Fantasy, Family \n",
+ "id 8844 \n",
+ "imdb_id tt0113497 \n",
+ "original_language en \n",
+ "original_title Jumanji \n",
+ "overview When siblings Judy and Peter discover an encha... \n",
+ "popularity 17.015539 \n",
+ "poster_path /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg \n",
+ "production_companies TriStar Pictures, Teitler Film, Interscope Com... \n",
+ "production_countries United States of America \n",
+ "release_date 1995-12-15 \n",
+ "revenue 262797249.0 \n",
+ "runtime 104.0 \n",
+ "spoken_languages English, Français \n",
+ "status Released \n",
+ "title Jumanji \n",
+ "video False \n",
+ "vote_average 6.9 \n",
+ "vote_count 2413.0 \n",
+ "\n",
+ " 2 \\\n",
+ "adult False \n",
+ "budget 0 \n",
+ "genres Romance, Comedy \n",
+ "id 15602 \n",
+ "imdb_id tt0113228 \n",
+ "original_language en \n",
+ "original_title Grumpier Old Men \n",
+ "overview A family wedding reignites the ancient feud be... \n",
+ "popularity 11.7129 \n",
+ "poster_path /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg \n",
+ "production_companies Warner Bros., Lancaster Gate \n",
+ "production_countries United States of America \n",
+ "release_date 1995-12-22 \n",
+ "revenue 0.0 \n",
+ "runtime 101.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Grumpier Old Men \n",
+ "video False \n",
+ "vote_average 6.5 \n",
+ "vote_count 92.0 \n",
+ "\n",
+ " 3 \\\n",
+ "adult False \n",
+ "budget 16000000 \n",
+ "genres Comedy, Drama, Romance \n",
+ "id 31357 \n",
+ "imdb_id tt0114885 \n",
+ "original_language en \n",
+ "original_title Waiting to Exhale \n",
+ "overview Cheated on, mistreated and stepped on, the wom... \n",
+ "popularity 3.859495 \n",
+ "poster_path /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg \n",
+ "production_companies Twentieth Century Fox Film Corporation \n",
+ "production_countries United States of America \n",
+ "release_date 1995-12-22 \n",
+ "revenue 81452156.0 \n",
+ "runtime 127.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Waiting to Exhale \n",
+ "video False \n",
+ "vote_average 6.1 \n",
+ "vote_count 34.0 \n",
+ "\n",
+ " 4 \n",
+ "adult False \n",
+ "budget 0 \n",
+ "genres Comedy \n",
+ "id 11862 \n",
+ "imdb_id tt0113041 \n",
+ "original_language en \n",
+ "original_title Father of the Bride Part II \n",
+ "overview Just when George Banks has recovered from his ... \n",
+ "popularity 8.387519 \n",
+ "poster_path /e64sOI48hQXyru7naBFyssKFxVd.jpg \n",
+ "production_companies Sandollar Productions, Touchstone Pictures \n",
+ "production_countries United States of America \n",
+ "release_date 1995-02-10 \n",
+ "revenue 76578911.0 \n",
+ "runtime 106.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Father of the Bride Part II \n",
+ "video False \n",
+ "vote_average 5.7 \n",
+ "vote_count 173.0 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " adult \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " budget \n",
+ " 30000000 \n",
+ " 65000000 \n",
+ " 0 \n",
+ " 16000000 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " genres \n",
+ " Animation, Comedy, Family \n",
+ " Adventure, Fantasy, Family \n",
+ " Romance, Comedy \n",
+ " Comedy, Drama, Romance \n",
+ " Comedy \n",
+ " \n",
+ " \n",
+ " id \n",
+ " 862 \n",
+ " 8844 \n",
+ " 15602 \n",
+ " 31357 \n",
+ " 11862 \n",
+ " \n",
+ " \n",
+ " imdb_id \n",
+ " tt0114709 \n",
+ " tt0113497 \n",
+ " tt0113228 \n",
+ " tt0114885 \n",
+ " tt0113041 \n",
+ " \n",
+ " \n",
+ " original_language \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " \n",
+ " \n",
+ " original_title \n",
+ " Toy Story \n",
+ " Jumanji \n",
+ " Grumpier Old Men \n",
+ " Waiting to Exhale \n",
+ " Father of the Bride Part II \n",
+ " \n",
+ " \n",
+ " overview \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " When siblings Judy and Peter discover an encha... \n",
+ " A family wedding reignites the ancient feud be... \n",
+ " Cheated on, mistreated and stepped on, the wom... \n",
+ " Just when George Banks has recovered from his ... \n",
+ " \n",
+ " \n",
+ " popularity \n",
+ " 21.946943 \n",
+ " 17.015539 \n",
+ " 11.7129 \n",
+ " 3.859495 \n",
+ " 8.387519 \n",
+ " \n",
+ " \n",
+ " poster_path \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg \n",
+ " /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg \n",
+ " /16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg \n",
+ " /e64sOI48hQXyru7naBFyssKFxVd.jpg \n",
+ " \n",
+ " \n",
+ " production_companies \n",
+ " Pixar Animation Studios \n",
+ " TriStar Pictures, Teitler Film, Interscope Com... \n",
+ " Warner Bros., Lancaster Gate \n",
+ " Twentieth Century Fox Film Corporation \n",
+ " Sandollar Productions, Touchstone Pictures \n",
+ " \n",
+ " \n",
+ " production_countries \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " \n",
+ " \n",
+ " release_date \n",
+ " 1995-10-30 \n",
+ " 1995-12-15 \n",
+ " 1995-12-22 \n",
+ " 1995-12-22 \n",
+ " 1995-02-10 \n",
+ " \n",
+ " \n",
+ " revenue \n",
+ " 373554033.0 \n",
+ " 262797249.0 \n",
+ " 0.0 \n",
+ " 81452156.0 \n",
+ " 76578911.0 \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 81.0 \n",
+ " 104.0 \n",
+ " 101.0 \n",
+ " 127.0 \n",
+ " 106.0 \n",
+ " \n",
+ " \n",
+ " spoken_languages \n",
+ " English \n",
+ " English, Français \n",
+ " English \n",
+ " English \n",
+ " English \n",
+ " \n",
+ " \n",
+ " status \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " \n",
+ " \n",
+ " title \n",
+ " Toy Story \n",
+ " Jumanji \n",
+ " Grumpier Old Men \n",
+ " Waiting to Exhale \n",
+ " Father of the Bride Part II \n",
+ " \n",
+ " \n",
+ " video \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 7.7 \n",
+ " 6.9 \n",
+ " 6.5 \n",
+ " 6.1 \n",
+ " 5.7 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 5415.0 \n",
+ " 2413.0 \n",
+ " 92.0 \n",
+ " 34.0 \n",
+ " 173.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(copy_df\",\n \"rows\": 21,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"False\",\n false,\n \"English\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"False\",\n false,\n \"English, Fran\\u00e7ais\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 2,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 19,\n \"samples\": [\n \"False\",\n \"en\",\n \"United States of America\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 3,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"False\",\n false,\n \"English\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 4,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 20,\n \"samples\": [\n \"False\",\n false,\n \"English\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dd558598",
+ "outputId": "1d9c7fa9-5a4d-44cd-8103-bcf533a62f06"
+ },
+ "source": [
+ "parsed_json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']\n",
+ "\n",
+ "print(\"Count and percentage of 'Unknown' values in parsed JSON columns:\")\n",
+ "for col in parsed_json_columns:\n",
+ " if col in copy_df.columns:\n",
+ " unknown_count = (copy_df[col] == 'Unknown').sum()\n",
+ " total_count = len(copy_df[col])\n",
+ " unknown_percentage = (unknown_count / total_count) * 100 if total_count > 0 else 0\n",
+ " print(f\"- {col}: {unknown_count} ('Unknown' values) out of {total_count} (Total values) - {unknown_percentage:.2f}%\")"
+ ],
+ "execution_count": 32,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Count and percentage of 'Unknown' values in parsed JSON columns:\n",
+ "- genres: 2442 ('Unknown' values) out of 45463 (Total values) - 5.37%\n",
+ "- production_companies: 11875 ('Unknown' values) out of 45463 (Total values) - 26.12%\n",
+ "- production_countries: 6282 ('Unknown' values) out of 45463 (Total values) - 13.82%\n",
+ "- spoken_languages: 3829 ('Unknown' values) out of 45463 (Total values) - 8.42%\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = copy_df.copy()"
+ ],
+ "metadata": {
+ "id": "cCvg-le_JBZg"
+ },
+ "execution_count": 33,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e0760699",
+ "outputId": "59765d4f-310c-459e-da46-2d5a6d26f4d2"
+ },
+ "source": [
+ "import json\n",
+ "\n",
+ "def is_json_like(value):\n",
+ " \"\"\"Checks if a string value is likely a JSON array or object.\"\"\"\n",
+ " if isinstance(value, str):\n",
+ " value = value.strip()\n",
+ " if (value.startswith('[') and value.endswith(']')) or \\\n",
+ " (value.startswith('{') and value.endswith('}')):\n",
+ " try:\n",
+ " json.loads(value)\n",
+ " return True\n",
+ " except json.JSONDecodeError:\n",
+ " pass\n",
+ " return False\n",
+ "\n",
+ "# Identify columns that might contain JSON in credits_df\n",
+ "json_columns_credits = []\n",
+ "for col in credits_df.columns:\n",
+ " # Check a sample of non-null values to see if they are json-like\n",
+ " sample_values = credits_df[col].dropna().sample(min(100, len(credits_df[col].dropna())))\n",
+ " if any(sample_values.apply(is_json_like)):\n",
+ " json_columns_credits.append(col)\n",
+ "\n",
+ "print(\"Columns in credits_df that appear to contain JSON format:\", json_columns_credits)\n",
+ "\n",
+ "# Identify columns that might contain JSON in keywords_df\n",
+ "json_columns_keywords = []\n",
+ "for col in keywords_df.columns:\n",
+ " # Check a sample of non-null values to see if they are json-like\n",
+ " sample_values = keywords_df[col].dropna().sample(min(100, len(keywords_df[col].dropna())))\n",
+ " if any(sample_values.apply(is_json_like)):\n",
+ " json_columns_keywords.append(col)\n",
+ "\n",
+ "print(\"Columns in keywords_df that appear to contain JSON format:\", json_columns_keywords)"
+ ],
+ "execution_count": 34,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Columns in credits_df that appear to contain JSON format: ['cast', 'crew']\n",
+ "Columns in keywords_df that appear to contain JSON format: ['keywords']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "fe5955cc",
+ "outputId": "129190a1-f196-4013-af61-d9e7c3308340"
+ },
+ "source": [
+ "import ast\n",
+ "import numpy as np\n",
+ "\n",
+ "def extract_names_from_list(json_list_string, key='name'):\n",
+ " \"\"\"Extracts values for a given key from a string representation of a list of dictionaries and handles empty lists as NaN.\"\"\"\n",
+ " if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):\n",
+ " try:\n",
+ " data_list = ast.literal_eval(json_list_string)\n",
+ " if isinstance(data_list, list):\n",
+ " if not data_list: # Check if the list is empty\n",
+ " return np.nan # Replace empty list with NaN\n",
+ " names = [item.get(key, '') for item in data_list if isinstance(item, dict) and key in item]\n",
+ " return ', '.join(names)\n",
+ " except (ValueError, SyntaxError):\n",
+ " pass\n",
+ " return '' # Return empty string for non-list strings or evaluation errors\n",
+ "\n",
+ "# Handle JSON columns in credits_df\n",
+ "# 'cast' and 'crew' columns contain lists of dictionaries, we'll extract names\n",
+ "credits_df['cast'] = credits_df['cast'].apply(extract_names_from_list, key='name')\n",
+ "credits_df['crew'] = credits_df['crew'].apply(extract_names_from_list, key='name')\n",
+ "\n",
+ "# Fill NaN values (originally empty lists) with 'Unknown' in credits_df\n",
+ "credits_df['cast'] = credits_df['cast'].fillna('Unknown')\n",
+ "credits_df['crew'] = credits_df['crew'].fillna('Unknown')\n",
+ "\n",
+ "# Handle JSON columns in keywords_df\n",
+ "# 'keywords' column contains a list of dictionaries, we'll extract names\n",
+ "keywords_df['keywords'] = keywords_df['keywords'].apply(extract_names_from_list, key='name')\n",
+ "\n",
+ "# Fill NaN values (originally empty lists) with 'Unknown' in keywords_df\n",
+ "keywords_df['keywords'] = keywords_df['keywords'].fillna('Unknown')\n",
+ "\n",
+ "\n",
+ "print(\"credits_df after parsing and handling empty lists:\")\n",
+ "display(credits_df.head())\n",
+ "\n",
+ "print(\"\\nkeywords_df after parsing and handling empty lists:\")\n",
+ "display(keywords_df.head())"
+ ],
+ "execution_count": 35,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "credits_df after parsing and handling empty lists:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " cast \\\n",
+ "0 Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "1 Robin Williams, Jonathan Hyde, Kirsten Dunst, ... \n",
+ "2 Walter Matthau, Jack Lemmon, Ann-Margret, Soph... \n",
+ "3 Whitney Houston, Angela Bassett, Loretta Devin... \n",
+ "4 Steve Martin, Diane Keaton, Martin Short, Kimb... \n",
+ "\n",
+ " crew id \n",
+ "0 John Lasseter, Joss Whedon, Andrew Stanton, Jo... 862 \n",
+ "1 Larry J. Franco, Jonathan Hensleigh, James Hor... 8844 \n",
+ "2 Howard Deutch, Mark Steven Johnson, Mark Steve... 15602 \n",
+ "3 Forest Whitaker, Ronald Bass, Ronald Bass, Ezr... 31357 \n",
+ "4 Alan Silvestri, Elliot Davis, Nancy Meyers, Na... 11862 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " cast \n",
+ " crew \n",
+ " id \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " 862 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Robin Williams, Jonathan Hyde, Kirsten Dunst, ... \n",
+ " Larry J. Franco, Jonathan Hensleigh, James Hor... \n",
+ " 8844 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Walter Matthau, Jack Lemmon, Ann-Margret, Soph... \n",
+ " Howard Deutch, Mark Steven Johnson, Mark Steve... \n",
+ " 15602 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Whitney Houston, Angela Bassett, Loretta Devin... \n",
+ " Forest Whitaker, Ronald Bass, Ronald Bass, Ezr... \n",
+ " 31357 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Steve Martin, Diane Keaton, Martin Short, Kimb... \n",
+ " Alan Silvestri, Elliot Davis, Nancy Meyers, Na... \n",
+ " 11862 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(keywords_df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"cast\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce, Bonnie Hunt, Bebe Neuwirth, David Alan Grier, Patricia Clarkson, Adam Hann-Byrd, Laura Bell Bundy, James Handy, Gillian Barber, Brandon Obray, Cyrus Thiedeke, Gary Joseph Thorup, Leonard Zola, Lloyd Berry, Malcolm Stewart, Annabel Kershaw, Darryl Henriques, Robyn Driscoll, Peter Bryant, Sarah Gilson, Florica Vlad, June Lion, Brenda Lockmuller\",\n \"Steve Martin, Diane Keaton, Martin Short, Kimberly Williams-Paisley, George Newbern, Kieran Culkin, BD Wong, Peter Michael Goetz, Kate McGregor-Stewart, Jane Adams, Eugene Levy, Lori Alan\",\n \"Walter Matthau, Jack Lemmon, Ann-Margret, Sophia Loren, Daryl Hannah, Burgess Meredith, Kevin Pollak\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"crew\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Larry J. Franco, Jonathan Hensleigh, James Horner, Joe Johnston, Robert Dalva, Nancy Foy, Kyle Balda, James D. Bissell, Scott Kroopf, Ted Field, Robert W. Cort, Thomas E. Ackerman, Chris van Allsburg, William Teitler, Greg Taylor, Jim Strain\",\n \"Alan Silvestri, Elliot Davis, Nancy Meyers, Nancy Meyers, Albert Hackett, Charles Shyer, Adam Bernardi\",\n \"Howard Deutch, Mark Steven Johnson, Mark Steven Johnson, Jack Keller\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11260,\n \"min\": 862,\n \"max\": 31357,\n \"num_unique_values\": 5,\n \"samples\": [\n 8844,\n 11862,\n 15602\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "keywords_df after parsing and handling empty lists:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " id keywords\n",
+ "0 862 jealousy, toy, boy, friendship, friends, rival...\n",
+ "1 8844 board game, disappearance, based on children's...\n",
+ "2 15602 fishing, best friend, duringcreditsstinger, ol...\n",
+ "3 31357 based on novel, interracial relationship, sing...\n",
+ "4 11862 baby, midlife crisis, confidence, aging, daugh..."
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id \n",
+ " keywords \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 862 \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 8844 \n",
+ " board game, disappearance, based on children's... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 15602 \n",
+ " fishing, best friend, duringcreditsstinger, ol... \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 31357 \n",
+ " based on novel, interracial relationship, sing... \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 11862 \n",
+ " baby, midlife crisis, confidence, aging, daugh... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(keywords_df\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11260,\n \"min\": 862,\n \"max\": 31357,\n \"num_unique_values\": 5,\n \"samples\": [\n 8844,\n 11862,\n 15602\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"keywords\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"board game, disappearance, based on children's book, new home, recluse, giant insect\",\n \"baby, midlife crisis, confidence, aging, daughter, mother daughter relationship, pregnancy, contraception, gynecologist\",\n \"fishing, best friend, duringcreditsstinger, old men\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "d610ba47",
+ "outputId": "41164981-ed94-4a83-853c-cad37e72c45e"
+ },
+ "source": [
+ "# Columns in credits_df and keywords_df where empty lists were replaced by 'Unknown'\n",
+ "parsed_credits_columns = ['cast', 'crew']\n",
+ "parsed_keywords_columns = ['keywords']\n",
+ "\n",
+ "print(\"Count and percentage of 'Unknown' values in parsed credits_df columns:\")\n",
+ "for col in parsed_credits_columns:\n",
+ " if col in credits_df.columns:\n",
+ " unknown_count = (credits_df[col] == 'Unknown').sum()\n",
+ " total_count = len(credits_df[col])\n",
+ " unknown_percentage = (unknown_count / total_count) * 100 if total_count > 0 else 0\n",
+ " print(f\"- {col}: {unknown_count} ('Unknown' values) out of {total_count} (Total values) - {unknown_percentage:.2f}%\")\n",
+ "\n",
+ "print(\"\\nCount and percentage of 'Unknown' values in parsed keywords_df columns:\")\n",
+ "for col in parsed_keywords_columns:\n",
+ " if col in keywords_df.columns:\n",
+ " unknown_count = (keywords_df[col] == 'Unknown').sum()\n",
+ " total_count = len(keywords_df[col])\n",
+ " unknown_percentage = (unknown_count / total_count) * 100 if total_count > 0 else 0\n",
+ " print(f\"- {col}: {unknown_count} ('Unknown' values) out of {total_count} (Total values) - {unknown_percentage:.2f}%\")"
+ ],
+ "execution_count": 36,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Count and percentage of 'Unknown' values in parsed credits_df columns:\n",
+ "- cast: 2418 ('Unknown' values) out of 45476 (Total values) - 5.32%\n",
+ "- crew: 771 ('Unknown' values) out of 45476 (Total values) - 1.70%\n",
+ "\n",
+ "Count and percentage of 'Unknown' values in parsed keywords_df columns:\n",
+ "- keywords: 14795 ('Unknown' values) out of 46419 (Total values) - 31.87%\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Handling duplicates"
+ ],
+ "metadata": {
+ "id": "DScG6REm0VAu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "c9fe3bd2",
+ "outputId": "008e58ab-3de4-4028-e17a-f3d0f425e8eb"
+ },
+ "source": [
+ "# Check for duplicates in the key columns\n",
+ "print(\"Duplicates in 'id' column of df:\", df['id'].duplicated().sum())\n",
+ "print(\"Duplicates in 'id' column of credits_df:\", credits_df['id'].duplicated().sum())\n",
+ "print(\"Duplicates in 'id' column of keywords_df:\", keywords_df['id'].duplicated().sum())\n",
+ "print(\"Duplicates in 'tmdbId' column of links_df:\", links_df['tmdbId'].duplicated().sum())\n",
+ "print(\"Duplicates in 'movieId' column of links_df:\", links_df['movieId'].duplicated().sum())\n",
+ "print(\"Duplicates in 'imdbId' column of links_df:\", links_df['imdbId'].duplicated().sum())\n",
+ "print(\"Duplicates in 'movieId' column of ratings_df:\", ratings_df.duplicated(subset=['movieId', 'userId']).sum())"
+ ],
+ "execution_count": 37,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Duplicates in 'id' column of df: 30\n",
+ "Duplicates in 'id' column of credits_df: 44\n",
+ "Duplicates in 'id' column of keywords_df: 987\n",
+ "Duplicates in 'tmdbId' column of links_df: 0\n",
+ "Duplicates in 'movieId' column of links_df: 0\n",
+ "Duplicates in 'imdbId' column of links_df: 0\n",
+ "Duplicates in 'movieId' column of ratings_df: 0\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "72e59c70",
+ "outputId": "6657994b-470b-4933-f228-a8e07decd247"
+ },
+ "source": [
+ "# Remove duplicates from key columns\n",
+ "df.drop_duplicates(subset=['id'], inplace=True)\n",
+ "credits_df.drop_duplicates(subset=['id'], inplace=True)\n",
+ "keywords_df.drop_duplicates(subset=['id'], inplace=True)\n",
+ "links_df.drop_duplicates(subset=['tmdbId'], inplace=True)\n",
+ "ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)\n",
+ "\n",
+ "print(\"Duplicates in 'id' column of df after removal:\", df['id'].duplicated().sum())\n",
+ "print(\"Duplicates in 'id' column of credits_df after removal:\", credits_df['id'].duplicated().sum())\n",
+ "print(\"Duplicates in 'id' column of keywords_df after removal:\", keywords_df['id'].duplicated().sum())\n",
+ "print(\"Duplicates in 'tmdbId' column of links_df after removal:\", links_df['tmdbId'].duplicated().sum()) # Re-checking for completeness\n",
+ "print(\"Duplicates in 'movieId' column of links_df after removal:\", links_df['movieId'].duplicated().sum()) # Re-checking for completeness\n",
+ "print(\"Duplicates in 'movieId' column of ratings_df after removal:\", ratings_df['movieId'].duplicated().sum())"
+ ],
+ "execution_count": 38,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Duplicates in 'id' column of df after removal: 0\n",
+ "Duplicates in 'id' column of credits_df after removal: 0\n",
+ "Duplicates in 'id' column of keywords_df after removal: 0\n",
+ "Duplicates in 'tmdbId' column of links_df after removal: 0\n",
+ "Duplicates in 'movieId' column of links_df after removal: 0\n",
+ "Duplicates in 'movieId' column of ratings_df after removal: 90938\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2381e9e3",
+ "outputId": "f3ec9bf9-9d88-4ede-c0ce-316d249adf57"
+ },
+ "source": [
+ "print(\"Shape of df:\", df.shape)\n",
+ "print(\"Shape of credits_df:\", credits_df.shape)\n",
+ "print(\"Shape of keywords_df:\", keywords_df.shape)\n",
+ "print(\"Shape of links_df:\", links_df.shape)\n",
+ "print(\"Shape of ratings_df:\", ratings_df.shape)"
+ ],
+ "execution_count": 39,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of df: (45433, 21)\n",
+ "Shape of credits_df: (45432, 3)\n",
+ "Shape of keywords_df: (45432, 2)\n",
+ "Shape of links_df: (9112, 3)\n",
+ "Shape of ratings_df: (100004, 4)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Merging dataframes"
+ ],
+ "metadata": {
+ "id": "ZtvgjUP50cM6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 0
+ },
+ "id": "ecb2a7f5",
+ "outputId": "76c03c53-8cef-43cf-c14e-d434bed42abe"
+ },
+ "source": [
+ "# Merge df, credits_df, and keywords_df on 'id'\n",
+ "merged_df = pd.merge(df, credits_df, on='id', how='inner')\n",
+ "merged_df = pd.merge(merged_df, keywords_df, on='id', how='inner')\n",
+ "\n",
+ "# Merge with links_df using tmdbId from links_df and id from merged_df\n",
+ "# links_df has 'movieId' (MovieLens ID) and 'tmdbId' (TMDB ID)\n",
+ "merged_df = pd.merge(merged_df, links_df, left_on='id', right_on='tmdbId', how='inner')\n",
+ "\n",
+ "# Now merge the result with ratings_df using movieId (MovieLens ID)\n",
+ "merged_df = pd.merge(merged_df, ratings_df, on='movieId', how='inner')\n",
+ "\n",
+ "print(\"Shape of the merged dataframe:\", merged_df.shape)\n",
+ "display(merged_df.head().transpose())"
+ ],
+ "execution_count": 40,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of the merged dataframe: (99810, 30)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " 0 \\\n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "genres Animation, Comedy, Family \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "movieId 1 \n",
+ "imdbId 114709 \n",
+ "tmdbId 862 \n",
+ "userId 7 \n",
+ "rating 3.0 \n",
+ "timestamp 851866703 \n",
+ "\n",
+ " 1 \\\n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "genres Animation, Comedy, Family \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "movieId 1 \n",
+ "imdbId 114709 \n",
+ "tmdbId 862 \n",
+ "userId 9 \n",
+ "rating 4.0 \n",
+ "timestamp 938629179 \n",
+ "\n",
+ " 2 \\\n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "genres Animation, Comedy, Family \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "movieId 1 \n",
+ "imdbId 114709 \n",
+ "tmdbId 862 \n",
+ "userId 13 \n",
+ "rating 5.0 \n",
+ "timestamp 1331380058 \n",
+ "\n",
+ " 3 \\\n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "genres Animation, Comedy, Family \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "movieId 1 \n",
+ "imdbId 114709 \n",
+ "tmdbId 862 \n",
+ "userId 15 \n",
+ "rating 2.0 \n",
+ "timestamp 997938310 \n",
+ "\n",
+ " 4 \n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "genres Animation, Comedy, Family \n",
+ "id 862 \n",
+ "imdb_id tt0114709 \n",
+ "original_language en \n",
+ "original_title Toy Story \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "popularity 21.946943 \n",
+ "poster_path /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "release_date 1995-10-30 \n",
+ "revenue 373554033.0 \n",
+ "runtime 81.0 \n",
+ "spoken_languages English \n",
+ "status Released \n",
+ "title Toy Story \n",
+ "video False \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "movieId 1 \n",
+ "imdbId 114709 \n",
+ "tmdbId 862 \n",
+ "userId 19 \n",
+ "rating 3.0 \n",
+ "timestamp 855190091 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " adult \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " budget \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " \n",
+ " \n",
+ " genres \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " \n",
+ " \n",
+ " id \n",
+ " 862 \n",
+ " 862 \n",
+ " 862 \n",
+ " 862 \n",
+ " 862 \n",
+ " \n",
+ " \n",
+ " imdb_id \n",
+ " tt0114709 \n",
+ " tt0114709 \n",
+ " tt0114709 \n",
+ " tt0114709 \n",
+ " tt0114709 \n",
+ " \n",
+ " \n",
+ " original_language \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " \n",
+ " \n",
+ " original_title \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " \n",
+ " \n",
+ " overview \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " \n",
+ " \n",
+ " popularity \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " \n",
+ " \n",
+ " poster_path \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " /rhIRbceoE9lR4veEXuwCC2wARtG.jpg \n",
+ " \n",
+ " \n",
+ " production_companies \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " \n",
+ " \n",
+ " production_countries \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " \n",
+ " \n",
+ " release_date \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " \n",
+ " \n",
+ " revenue \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " \n",
+ " \n",
+ " spoken_languages \n",
+ " English \n",
+ " English \n",
+ " English \n",
+ " English \n",
+ " English \n",
+ " \n",
+ " \n",
+ " status \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " \n",
+ " \n",
+ " title \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " \n",
+ " \n",
+ " video \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " \n",
+ " \n",
+ " cast \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " \n",
+ " \n",
+ " crew \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " \n",
+ " \n",
+ " keywords \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " imdbId \n",
+ " 114709 \n",
+ " 114709 \n",
+ " 114709 \n",
+ " 114709 \n",
+ " 114709 \n",
+ " \n",
+ " \n",
+ " tmdbId \n",
+ " 862 \n",
+ " 862 \n",
+ " 862 \n",
+ " 862 \n",
+ " 862 \n",
+ " \n",
+ " \n",
+ " userId \n",
+ " 7 \n",
+ " 9 \n",
+ " 13 \n",
+ " 15 \n",
+ " 19 \n",
+ " \n",
+ " \n",
+ " rating \n",
+ " 3.0 \n",
+ " 4.0 \n",
+ " 5.0 \n",
+ " 2.0 \n",
+ " 3.0 \n",
+ " \n",
+ " \n",
+ " timestamp \n",
+ " 851866703 \n",
+ " 938629179 \n",
+ " 1331380058 \n",
+ " 997938310 \n",
+ " 855190091 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(merged_df\",\n \"rows\": 30,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 28,\n \"samples\": [\n \"/rhIRbceoE9lR4veEXuwCC2wARtG.jpg\",\n 7,\n 21.946943\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 28,\n \"samples\": [\n \"/rhIRbceoE9lR4veEXuwCC2wARtG.jpg\",\n 9,\n 21.946943\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 2,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 28,\n \"samples\": [\n \"/rhIRbceoE9lR4veEXuwCC2wARtG.jpg\",\n 13,\n 21.946943\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 3,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 28,\n \"samples\": [\n \"/rhIRbceoE9lR4veEXuwCC2wARtG.jpg\",\n 15,\n 21.946943\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 4,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 28,\n \"samples\": [\n \"/rhIRbceoE9lR4veEXuwCC2wARtG.jpg\",\n 19,\n 21.946943\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Feature engineering"
+ ],
+ "metadata": {
+ "id": "f2zJ1cYb0tEU"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Q-KMnHHsAEN5",
+ "outputId": "bd6e0c60-021e-4b2d-d04f-b0386f4700e5"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(99810, 30)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 184
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Choosing appropriate features"
+ ],
+ "metadata": {
+ "id": "Ei5X6Yc_xDHO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 795
+ },
+ "id": "31f7cd73",
+ "outputId": "1714d1d9-034e-4d4a-f918-29f1e893a6a0"
+ },
+ "source": [
+ "# Drop redundant ID columns\n",
+ "merged_df = merged_df.drop(columns=['id', 'tmdbId', 'imdbId', 'imdb_id', 'original_title', 'video'])\n",
+ "\n",
+ "# Define the desired column order\n",
+ "desired_column_order = [\n",
+ " 'movieId',\n",
+ " 'title',\n",
+ " 'release_date',\n",
+ " 'runtime',\n",
+ " 'status',\n",
+ " 'adult',\n",
+ " 'budget',\n",
+ " 'revenue',\n",
+ " 'popularity',\n",
+ " 'vote_average',\n",
+ " 'vote_count',\n",
+ " 'overview',\n",
+ " 'genres',\n",
+ " 'keywords',\n",
+ " 'cast',\n",
+ " 'crew',\n",
+ " 'production_companies',\n",
+ " 'production_countries',\n",
+ " 'original_language',\n",
+ " 'userId',\n",
+ " 'rating',\n",
+ "]\n",
+ "\n",
+ "# Reindex the DataFrame with the desired column order\n",
+ "merged_df = merged_df.reindex(columns=desired_column_order)\n",
+ "\n",
+ "print(\"Shape of merged_df after dropping columns and reordering:\", merged_df.shape)\n",
+ "display(merged_df.head().transpose())"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of merged_df after dropping columns and reordering: (99810, 21)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " 0 \\\n",
+ "movieId 1 \n",
+ "title Toy Story \n",
+ "release_date 1995-10-30 \n",
+ "runtime 81.0 \n",
+ "status Released \n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "revenue 373554033.0 \n",
+ "popularity 21.946943 \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "genres Animation, Comedy, Family \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "original_language en \n",
+ "userId 7 \n",
+ "rating 3.0 \n",
+ "\n",
+ " 1 \\\n",
+ "movieId 1 \n",
+ "title Toy Story \n",
+ "release_date 1995-10-30 \n",
+ "runtime 81.0 \n",
+ "status Released \n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "revenue 373554033.0 \n",
+ "popularity 21.946943 \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "genres Animation, Comedy, Family \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "original_language en \n",
+ "userId 9 \n",
+ "rating 4.0 \n",
+ "\n",
+ " 2 \\\n",
+ "movieId 1 \n",
+ "title Toy Story \n",
+ "release_date 1995-10-30 \n",
+ "runtime 81.0 \n",
+ "status Released \n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "revenue 373554033.0 \n",
+ "popularity 21.946943 \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "genres Animation, Comedy, Family \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "original_language en \n",
+ "userId 13 \n",
+ "rating 5.0 \n",
+ "\n",
+ " 3 \\\n",
+ "movieId 1 \n",
+ "title Toy Story \n",
+ "release_date 1995-10-30 \n",
+ "runtime 81.0 \n",
+ "status Released \n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "revenue 373554033.0 \n",
+ "popularity 21.946943 \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "genres Animation, Comedy, Family \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "original_language en \n",
+ "userId 15 \n",
+ "rating 2.0 \n",
+ "\n",
+ " 4 \n",
+ "movieId 1 \n",
+ "title Toy Story \n",
+ "release_date 1995-10-30 \n",
+ "runtime 81.0 \n",
+ "status Released \n",
+ "adult False \n",
+ "budget 30000000 \n",
+ "revenue 373554033.0 \n",
+ "popularity 21.946943 \n",
+ "vote_average 7.7 \n",
+ "vote_count 5415.0 \n",
+ "overview Led by Woody, Andy's toys live happily in his ... \n",
+ "genres Animation, Comedy, Family \n",
+ "keywords jealousy, toy, boy, friendship, friends, rival... \n",
+ "cast Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ "crew John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ "production_companies Pixar Animation Studios \n",
+ "production_countries United States of America \n",
+ "original_language en \n",
+ "userId 19 \n",
+ "rating 3.0 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " title \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " \n",
+ " \n",
+ " release_date \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " 1995-10-30 \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " \n",
+ " \n",
+ " status \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " \n",
+ " \n",
+ " adult \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " False \n",
+ " \n",
+ " \n",
+ " budget \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " \n",
+ " \n",
+ " revenue \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " 373554033.0 \n",
+ " \n",
+ " \n",
+ " popularity \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " 21.946943 \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " 7.7 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " 5415.0 \n",
+ " \n",
+ " \n",
+ " overview \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " Led by Woody, Andy's toys live happily in his ... \n",
+ " \n",
+ " \n",
+ " genres \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " Animation, Comedy, Family \n",
+ " \n",
+ " \n",
+ " keywords \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " jealousy, toy, boy, friendship, friends, rival... \n",
+ " \n",
+ " \n",
+ " cast \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n",
+ " \n",
+ " \n",
+ " crew \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n",
+ " \n",
+ " \n",
+ " production_companies \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " Pixar Animation Studios \n",
+ " \n",
+ " \n",
+ " production_countries \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " United States of America \n",
+ " \n",
+ " \n",
+ " original_language \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " en \n",
+ " \n",
+ " \n",
+ " userId \n",
+ " 7 \n",
+ " 9 \n",
+ " 13 \n",
+ " 15 \n",
+ " 19 \n",
+ " \n",
+ " \n",
+ " rating \n",
+ " 3.0 \n",
+ " 4.0 \n",
+ " 5.0 \n",
+ " 2.0 \n",
+ " 3.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(merged_df\",\n \"rows\": 21,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21,\n \"samples\": [\n 1,\n \"United States of America\",\n \"John Lasseter, Joss Whedon, Andrew Stanton, Joel Cohen, Alec Sokolow, Bonnie Arnold, Ed Catmull, Ralph Guggenheim, Steve Jobs, Lee Unkrich, Ralph Eggleston, Robert Gordon, Mary Helen Leasman, Kim Blanchette, Marilyn McCoppen, Randy Newman, Dale E. Grahn, Robin Cooper, John Lasseter, Pete Docter, Joe Ranft, Patsy Bouge, Norm DeCarlo, Ash Brannon, Randy Newman, Roman Figun, Don Davis, James Flamberg, Mary Beth Smith, Rick Mackay, Susan Bradley, William Reeves, Randy Newman, Andrew Stanton, Pete Docter, Gary Rydstrom, Karen Robert Jackson, Chris Montan, Rich Quade, Michael Berenstein, Colin Brady, Davey Crockett Feiten, Angie Glocka, Rex Grignon, Tom K. Gurney, Jimmy Hayward, Hal T. Hickel, Karen Kiser, Anthony B. LaMolinara, Guionne Leroy, Bud Luckey, Les Major, Glenn McQueen, Mark Oftedal, Jeff Pidgeon, Jeff Pratt, Steve Rabatich, Roger Rose, Steve Segal, Doug Sheppeck, Alan Sperling, Doug Sweetland, David Tart, Ken Willard, Thomas Porter, Mark Thomas Henne, Oren Jacob, Darwyn Peachey, Mitch Prater, Brian M. Rosen, Sharon Calahan, Galyn Susman, William Cone, Shelley Daniels Lekven, Bob Pauley, Bud Luckey, Andrew Stanton, William Cone, Steve Johnson, Dan Haskett, Tom Holloway, Jean Gillmore, Desir\\u00e9e Mourad, Kelly O'Connell, Sonoko Konishi, Ann M. Rockwell, Julie M. McDonald, Robin Lee, Tom Freeman, Ada Cochavi, Dana Mulligan, Deirdre Morrison, Lori Lombardo, Ellen Devine, Lauren Beth Strogoff, Gary Rydstrom, Gary Summers, Tim Holland, Pat Jackson, Tom Myers, J.R. 
Grubbs, Susan Sanford, Susan Popovic, Dan Engstrom, Ruth Lambert, Mickie McGowan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21,\n \"samples\": [\n 1,\n \"United States of America\",\n \"John Lasseter, Joss Whedon, Andrew Stanton, Joel Cohen, Alec Sokolow, Bonnie Arnold, Ed Catmull, Ralph Guggenheim, Steve Jobs, Lee Unkrich, Ralph Eggleston, Robert Gordon, Mary Helen Leasman, Kim Blanchette, Marilyn McCoppen, Randy Newman, Dale E. Grahn, Robin Cooper, John Lasseter, Pete Docter, Joe Ranft, Patsy Bouge, Norm DeCarlo, Ash Brannon, Randy Newman, Roman Figun, Don Davis, James Flamberg, Mary Beth Smith, Rick Mackay, Susan Bradley, William Reeves, Randy Newman, Andrew Stanton, Pete Docter, Gary Rydstrom, Karen Robert Jackson, Chris Montan, Rich Quade, Michael Berenstein, Colin Brady, Davey Crockett Feiten, Angie Glocka, Rex Grignon, Tom K. Gurney, Jimmy Hayward, Hal T. Hickel, Karen Kiser, Anthony B. LaMolinara, Guionne Leroy, Bud Luckey, Les Major, Glenn McQueen, Mark Oftedal, Jeff Pidgeon, Jeff Pratt, Steve Rabatich, Roger Rose, Steve Segal, Doug Sheppeck, Alan Sperling, Doug Sweetland, David Tart, Ken Willard, Thomas Porter, Mark Thomas Henne, Oren Jacob, Darwyn Peachey, Mitch Prater, Brian M. Rosen, Sharon Calahan, Galyn Susman, William Cone, Shelley Daniels Lekven, Bob Pauley, Bud Luckey, Andrew Stanton, William Cone, Steve Johnson, Dan Haskett, Tom Holloway, Jean Gillmore, Desir\\u00e9e Mourad, Kelly O'Connell, Sonoko Konishi, Ann M. Rockwell, Julie M. McDonald, Robin Lee, Tom Freeman, Ada Cochavi, Dana Mulligan, Deirdre Morrison, Lori Lombardo, Ellen Devine, Lauren Beth Strogoff, Gary Rydstrom, Gary Summers, Tim Holland, Pat Jackson, Tom Myers, J.R. 
Grubbs, Susan Sanford, Susan Popovic, Dan Engstrom, Ruth Lambert, Mickie McGowan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 2,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21,\n \"samples\": [\n 1,\n \"United States of America\",\n \"John Lasseter, Joss Whedon, Andrew Stanton, Joel Cohen, Alec Sokolow, Bonnie Arnold, Ed Catmull, Ralph Guggenheim, Steve Jobs, Lee Unkrich, Ralph Eggleston, Robert Gordon, Mary Helen Leasman, Kim Blanchette, Marilyn McCoppen, Randy Newman, Dale E. Grahn, Robin Cooper, John Lasseter, Pete Docter, Joe Ranft, Patsy Bouge, Norm DeCarlo, Ash Brannon, Randy Newman, Roman Figun, Don Davis, James Flamberg, Mary Beth Smith, Rick Mackay, Susan Bradley, William Reeves, Randy Newman, Andrew Stanton, Pete Docter, Gary Rydstrom, Karen Robert Jackson, Chris Montan, Rich Quade, Michael Berenstein, Colin Brady, Davey Crockett Feiten, Angie Glocka, Rex Grignon, Tom K. Gurney, Jimmy Hayward, Hal T. Hickel, Karen Kiser, Anthony B. LaMolinara, Guionne Leroy, Bud Luckey, Les Major, Glenn McQueen, Mark Oftedal, Jeff Pidgeon, Jeff Pratt, Steve Rabatich, Roger Rose, Steve Segal, Doug Sheppeck, Alan Sperling, Doug Sweetland, David Tart, Ken Willard, Thomas Porter, Mark Thomas Henne, Oren Jacob, Darwyn Peachey, Mitch Prater, Brian M. Rosen, Sharon Calahan, Galyn Susman, William Cone, Shelley Daniels Lekven, Bob Pauley, Bud Luckey, Andrew Stanton, William Cone, Steve Johnson, Dan Haskett, Tom Holloway, Jean Gillmore, Desir\\u00e9e Mourad, Kelly O'Connell, Sonoko Konishi, Ann M. Rockwell, Julie M. McDonald, Robin Lee, Tom Freeman, Ada Cochavi, Dana Mulligan, Deirdre Morrison, Lori Lombardo, Ellen Devine, Lauren Beth Strogoff, Gary Rydstrom, Gary Summers, Tim Holland, Pat Jackson, Tom Myers, J.R. 
Grubbs, Susan Sanford, Susan Popovic, Dan Engstrom, Ruth Lambert, Mickie McGowan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 3,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21,\n \"samples\": [\n 1,\n \"United States of America\",\n \"John Lasseter, Joss Whedon, Andrew Stanton, Joel Cohen, Alec Sokolow, Bonnie Arnold, Ed Catmull, Ralph Guggenheim, Steve Jobs, Lee Unkrich, Ralph Eggleston, Robert Gordon, Mary Helen Leasman, Kim Blanchette, Marilyn McCoppen, Randy Newman, Dale E. Grahn, Robin Cooper, John Lasseter, Pete Docter, Joe Ranft, Patsy Bouge, Norm DeCarlo, Ash Brannon, Randy Newman, Roman Figun, Don Davis, James Flamberg, Mary Beth Smith, Rick Mackay, Susan Bradley, William Reeves, Randy Newman, Andrew Stanton, Pete Docter, Gary Rydstrom, Karen Robert Jackson, Chris Montan, Rich Quade, Michael Berenstein, Colin Brady, Davey Crockett Feiten, Angie Glocka, Rex Grignon, Tom K. Gurney, Jimmy Hayward, Hal T. Hickel, Karen Kiser, Anthony B. LaMolinara, Guionne Leroy, Bud Luckey, Les Major, Glenn McQueen, Mark Oftedal, Jeff Pidgeon, Jeff Pratt, Steve Rabatich, Roger Rose, Steve Segal, Doug Sheppeck, Alan Sperling, Doug Sweetland, David Tart, Ken Willard, Thomas Porter, Mark Thomas Henne, Oren Jacob, Darwyn Peachey, Mitch Prater, Brian M. Rosen, Sharon Calahan, Galyn Susman, William Cone, Shelley Daniels Lekven, Bob Pauley, Bud Luckey, Andrew Stanton, William Cone, Steve Johnson, Dan Haskett, Tom Holloway, Jean Gillmore, Desir\\u00e9e Mourad, Kelly O'Connell, Sonoko Konishi, Ann M. Rockwell, Julie M. McDonald, Robin Lee, Tom Freeman, Ada Cochavi, Dana Mulligan, Deirdre Morrison, Lori Lombardo, Ellen Devine, Lauren Beth Strogoff, Gary Rydstrom, Gary Summers, Tim Holland, Pat Jackson, Tom Myers, J.R. 
Grubbs, Susan Sanford, Susan Popovic, Dan Engstrom, Ruth Lambert, Mickie McGowan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 4,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21,\n \"samples\": [\n 1,\n \"United States of America\",\n \"John Lasseter, Joss Whedon, Andrew Stanton, Joel Cohen, Alec Sokolow, Bonnie Arnold, Ed Catmull, Ralph Guggenheim, Steve Jobs, Lee Unkrich, Ralph Eggleston, Robert Gordon, Mary Helen Leasman, Kim Blanchette, Marilyn McCoppen, Randy Newman, Dale E. Grahn, Robin Cooper, John Lasseter, Pete Docter, Joe Ranft, Patsy Bouge, Norm DeCarlo, Ash Brannon, Randy Newman, Roman Figun, Don Davis, James Flamberg, Mary Beth Smith, Rick Mackay, Susan Bradley, William Reeves, Randy Newman, Andrew Stanton, Pete Docter, Gary Rydstrom, Karen Robert Jackson, Chris Montan, Rich Quade, Michael Berenstein, Colin Brady, Davey Crockett Feiten, Angie Glocka, Rex Grignon, Tom K. Gurney, Jimmy Hayward, Hal T. Hickel, Karen Kiser, Anthony B. LaMolinara, Guionne Leroy, Bud Luckey, Les Major, Glenn McQueen, Mark Oftedal, Jeff Pidgeon, Jeff Pratt, Steve Rabatich, Roger Rose, Steve Segal, Doug Sheppeck, Alan Sperling, Doug Sweetland, David Tart, Ken Willard, Thomas Porter, Mark Thomas Henne, Oren Jacob, Darwyn Peachey, Mitch Prater, Brian M. Rosen, Sharon Calahan, Galyn Susman, William Cone, Shelley Daniels Lekven, Bob Pauley, Bud Luckey, Andrew Stanton, William Cone, Steve Johnson, Dan Haskett, Tom Holloway, Jean Gillmore, Desir\\u00e9e Mourad, Kelly O'Connell, Sonoko Konishi, Ann M. Rockwell, Julie M. McDonald, Robin Lee, Tom Freeman, Ada Cochavi, Dana Mulligan, Deirdre Morrison, Lori Lombardo, Ellen Devine, Lauren Beth Strogoff, Gary Rydstrom, Gary Summers, Tim Holland, Pat Jackson, Tom Myers, J.R. Grubbs, Susan Sanford, Susan Popovic, Dan Engstrom, Ruth Lambert, Mickie McGowan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df['budget'] = pd.to_numeric(merged_df['budget'], errors='coerce').fillna(0)\n",
+ "merged_df['revenue'] = pd.to_numeric(merged_df['revenue'], errors='coerce').fillna(0)\n",
+ "merged_df = merged_df[merged_df['runtime'] > 0]\n",
+ "merged_df = merged_df[merged_df['budget'] >= 0]\n",
+ "merged_df = merged_df[merged_df['revenue'] >= 0]\n",
+ "# Remove extreme outliers (top 0.5% for budget/revenue)\n",
+ "for col in ['budget', 'revenue']:\n",
+ " upper = merged_df[col].quantile(0.995)\n",
+ " merged_df = merged_df[merged_df[col] <= upper]"
+ ],
+ "metadata": {
+ "id": "BT_sc2F4ji6A"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df['budget'] = pd.to_numeric(merged_df['budget'], errors='coerce').fillna(0)\n",
+ "merged_df['revenue'] = pd.to_numeric(merged_df['revenue'], errors='coerce').fillna(0)\n",
+ "merged_df['budget_to_revenue_ratio'] = merged_df.apply(lambda row: row['budget'] / row['revenue'] if row['revenue'] > 0 else 0, axis=1)"
+ ],
+ "metadata": {
+ "id": "Itgd7bDHk1rK"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "top_n = 5"
+ ],
+ "metadata": {
+ "id": "uC2P65wwltnY"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "genre_dummies = merged_df['genres'].str.get_dummies(sep=', ')\n",
+ "top_genres = genre_dummies.sum().sort_values(ascending=False).head(top_n).index\n",
+ "for genre in top_genres:\n",
+ " merged_df[f\"genre_{genre}\"] = genre_dummies[genre]"
+ ],
+ "metadata": {
+ "id": "1TISuKg-k6M5"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "top_genres"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Kfo8toO_mqcA",
+ "outputId": "b03a5734-49de-48ad-81db-e6f8f540d3af"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Index(['Drama', 'Comedy', 'Thriller', 'Action', 'Adventure'], dtype='object')"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 190
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "for col in ['budget', 'revenue', 'popularity', 'vote_count']:\n",
+ " merged_df[f'log_{col}'] = np.log1p(merged_df[col])"
+ ],
+ "metadata": {
+ "id": "JaAhpHqylD_3"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Qtzd9FczxrHq",
+ "outputId": "4b3f24bf-df40-48b1-e150-8e38704f3c87"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(98845, 31)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 192
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df['budget_x_popularity'] = merged_df['budget'] * merged_df['popularity']\n",
+ "merged_df['budget_x_vote_count'] = merged_df['budget'] * merged_df['vote_count']"
+ ],
+ "metadata": {
+ "id": "LDimYRMElF1h"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df['num_genres'] = merged_df['genres'].fillna('').apply(lambda x: len([g for g in x.split(',') if g.strip()]))\n",
+ "merged_df['num_keywords'] = merged_df['keywords'].fillna('').apply(lambda x: len([k for k in x.split(',') if k.strip()]))\n",
+ "merged_df['num_cast'] = merged_df['cast'].fillna('').apply(lambda x: len([c for c in x.split(',') if c.strip()]))\n",
+ "merged_df['num_crew'] = merged_df['crew'].fillna('').apply(lambda x: len([c for c in x.split(',') if c.strip()]))"
+ ],
+ "metadata": {
+ "id": "yzC_M62MlI6d"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df['overview_length'] = merged_df['overview'].fillna('').apply(len)\n",
+ "merged_df['title_length'] = merged_df['title'].fillna('').apply(len)"
+ ],
+ "metadata": {
+ "id": "wmmVFufHlMog"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "genre_ratings = {}\n",
+ "for genre in merged_df['genres'].str.split(',').explode().str.strip().unique():\n",
+ " if genre and genre != 'Unknown':\n",
+ " mask = merged_df['genres'].str.contains(rf'\\b{genre}\\b', regex=True)\n",
+ " genre_ratings[genre] = merged_df.loc[mask, 'vote_average'].mean()\n",
+ "for genre in list(genre_ratings.keys())[:10]:\n",
+ " merged_df[f'genre_{genre}_mean_vote'] = merged_df['genres'].apply(\n",
+ " lambda x: genre_ratings[genre] if genre in x else np.nan\n",
+ " )"
+ ],
+ "metadata": {
+ "id": "xen0pzyplOxW"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], errors='coerce')\n",
+ "merged_df['release_year'] = merged_df['release_date'].dt.year"
+ ],
+ "metadata": {
+ "id": "AJGlMShTlakC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df.drop(columns=['release_date'], inplace=True)"
+ ],
+ "metadata": {
+ "id": "wRHSC5zqzAqZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "if 'adult' in merged_df.columns:\n",
+ " merged_df['is_adult'] = merged_df['adult'].map({'True': 1, 'False': 0})"
+ ],
+ "metadata": {
+ "id": "eTd5Siw1oLmt"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df.drop(columns=['adult'], inplace=True)"
+ ],
+ "metadata": {
+ "id": "rJKCIaB7zSYe"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "keywords=['love', 'war', 'star', 'man', 'woman']\n",
+ "merged_df['title'] = merged_df['title'].fillna('').astype(str)\n",
+ "for kw in keywords:\n",
+ " merged_df[f'title_has_{kw}'] = merged_df['title'].str.lower().str.contains(kw).astype(int)"
+ ],
+ "metadata": {
+ "id": "LWzo_inMoOLm"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "merged_df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "8c71V3QI2Zpe",
+ "outputId": "3017d478-9b7d-47a0-805b-520a34090407"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(98845, 54)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 203
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Top n's"
+ ],
+ "metadata": {
+ "id": "1YPejBj71Y8W"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "keywords_split = merged_df['keywords'].fillna('').apply(lambda x: [k.strip() for k in x.split(',') if k.strip()])\n",
+ "mlb = MultiLabelBinarizer()\n",
+ "top_keywords = pd.Series([k for sublist in keywords_split for k in sublist]).value_counts().head(top_n).index\n",
+ "keywords_filtered = keywords_split.apply(lambda x: [k for k in x if k in top_keywords])\n",
+ "keyword_dummies = pd.DataFrame(mlb.fit_transform(keywords_filtered), columns=[f'kw_{k}' for k in mlb.classes_], index=merged_df.index)\n",
+ "merged_df = pd.concat([merged_df, keyword_dummies], axis=1)"
+ ],
+ "metadata": {
+ "id": "boCkxBCOoQ-y"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "top_n_cast=5\n",
+ "top_n_crew=5\n",
+ "cast_split = merged_df['cast'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])\n",
+ "crew_split = merged_df['crew'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])\n",
+ "mlb_cast = MultiLabelBinarizer()\n",
+ "mlb_crew = MultiLabelBinarizer()\n",
+ "top_cast = pd.Series([c for sublist in cast_split for c in sublist]).value_counts().head(top_n_cast).index\n",
+ "top_crew = pd.Series([c for sublist in crew_split for c in sublist]).value_counts().head(top_n_crew).index\n",
+ "cast_filtered = cast_split.apply(lambda x: [c for c in x if c in top_cast])\n",
+ "crew_filtered = crew_split.apply(lambda x: [c for c in x if c in top_crew])\n",
+ "cast_dummies = pd.DataFrame(mlb_cast.fit_transform(cast_filtered), columns=[f'cast_{c}' for c in mlb_cast.classes_], index=merged_df.index)\n",
+ "crew_dummies = pd.DataFrame(mlb_crew.fit_transform(crew_filtered), columns=[f'crew_{c}' for c in mlb_crew.classes_], index=merged_df.index)\n",
+ "merged_df = pd.concat([merged_df, cast_dummies, crew_dummies], axis=1)"
+ ],
+ "metadata": {
+ "id": "VHSULLd3oV6c"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "top_n_company=5\n",
+ "top_n_country=5\n",
+ "company_split = merged_df['production_companies'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])\n",
+ "country_split = merged_df['production_countries'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])\n",
+ "mlb_company = MultiLabelBinarizer()\n",
+ "mlb_country = MultiLabelBinarizer()\n",
+ "top_company = pd.Series([c for sublist in company_split for c in sublist]).value_counts().head(top_n_company).index\n",
+ "top_country = pd.Series([c for sublist in country_split for c in sublist]).value_counts().head(top_n_country).index\n",
+ "company_filtered = company_split.apply(lambda x: [c for c in x if c in top_company])\n",
+ "country_filtered = country_split.apply(lambda x: [c for c in x if c in top_country])\n",
+ "company_dummies = pd.DataFrame(mlb_company.fit_transform(company_filtered), columns=[f'company_{c}' for c in mlb_company.classes_], index=merged_df.index)\n",
+ "country_dummies = pd.DataFrame(mlb_country.fit_transform(country_filtered), columns=[f'country_{c}' for c in mlb_country.classes_], index=merged_df.index)\n",
+ "merged_df = pd.concat([merged_df, company_dummies, country_dummies], axis=1)"
+ ],
+ "metadata": {
+ "id": "r5p3TG_toank"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def add_target_encoding(col, target='vote_average', top_n=3):\n",
+ " values = pd.Series([v for sublist in merged_df[col].fillna('').apply(lambda x: [i.strip() for i in x.split(',') if i.strip()]) for v in sublist])\n",
+ " top_values = values.value_counts().head(top_n).index\n",
+ " for v in top_values:\n",
+ " mask = merged_df[col].str.contains(rf'\\b{v}\\b', regex=True)\n",
+ " mean_val = merged_df.loc[mask, target].mean()\n",
+ " merged_df[f'{col}_{v}_mean_{target}'] = mask.astype(int) * mean_val"
+ ],
+ "metadata": {
+ "id": "jPhDU7ZRof_e"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "add_target_encoding(col='genres')\n",
+ "add_target_encoding(col='production_companies')\n",
+ "add_target_encoding(col='status')"
+ ],
+ "metadata": {
+ "id": "VtoR9lREwx6m"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 274
+ },
+ "id": "6fc3ff2b",
+ "outputId": "c417b4ad-8a25-49fa-c5cc-cd4671ebb7e9"
+ },
+ "source": [
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "\n",
+ "# Apply TF-IDF to the 'overview' column\n",
+ "# We'll limit the number of features to manage dimensionality\n",
+ "tfidf_overview_vectorizer = TfidfVectorizer(max_features=2100, stop_words='english') # Adjusted max_features\n",
+ "tfidf_overview_matrix = tfidf_overview_vectorizer.fit_transform(merged_df['overview'].fillna(''))\n",
+ "tfidf_overview_df = pd.DataFrame(tfidf_overview_matrix.toarray(), columns=[f'overview_tfidf_{col}' for col in tfidf_overview_vectorizer.get_feature_names_out()], index=merged_df.index)\n",
+ "\n",
+ "# Apply TF-IDF to the 'tagline' column\n",
+ "# We'll limit the number of features and can adjust this based on performance\n",
+ "# tfidf_tagline_vectorizer = TfidfVectorizer(max_features=2100, stop_words='english')\n",
+ "# tfidf_tagline_matrix = tfidf_tagline_vectorizer.fit_transform(merged_df['tagline'].fillna(''))\n",
+ "# tfidf_tagline_df = pd.DataFrame(tfidf_tagline_matrix.toarray(), columns=[f'tagline_tfidf_{col}' for col in tfidf_tagline_vectorizer.get_feature_names_out()], index=merged_df.index)\n",
+ "\n",
+ "print(\"Shape of TF-IDF overview DataFrame:\", tfidf_overview_df.shape)\n",
+ "# print(\"Shape of TF-IDF tagline DataFrame:\", tfidf_tagline_df.shape)\n",
+ "\n",
+ "display(tfidf_overview_df.head())\n",
+ "# display(tfidf_tagline_df.head())"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of TF-IDF overview DataFrame: (98845, 2100)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " overview_tfidf_000 overview_tfidf_10 overview_tfidf_12 \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " overview_tfidf_13 overview_tfidf_15 overview_tfidf_1930s \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " overview_tfidf_1940s overview_tfidf_1950s overview_tfidf_1955 \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " overview_tfidf_1985 ... overview_tfidf_wrongfully overview_tfidf_year \\\n",
+ "0 0.0 ... 0.0 0.0 \n",
+ "1 0.0 ... 0.0 0.0 \n",
+ "2 0.0 ... 0.0 0.0 \n",
+ "3 0.0 ... 0.0 0.0 \n",
+ "4 0.0 ... 0.0 0.0 \n",
+ "\n",
+ " overview_tfidf_years overview_tfidf_yoda overview_tfidf_york \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " overview_tfidf_young overview_tfidf_younger overview_tfidf_youngest \\\n",
+ "0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 \n",
+ "\n",
+ " overview_tfidf_youth overview_tfidf_zion \n",
+ "0 0.0 0.0 \n",
+ "1 0.0 0.0 \n",
+ "2 0.0 0.0 \n",
+ "3 0.0 0.0 \n",
+ "4 0.0 0.0 \n",
+ "\n",
+ "[5 rows x 2100 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " overview_tfidf_000 \n",
+ " overview_tfidf_10 \n",
+ " overview_tfidf_12 \n",
+ " overview_tfidf_13 \n",
+ " overview_tfidf_15 \n",
+ " overview_tfidf_1930s \n",
+ " overview_tfidf_1940s \n",
+ " overview_tfidf_1950s \n",
+ " overview_tfidf_1955 \n",
+ " overview_tfidf_1985 \n",
+ " ... \n",
+ " overview_tfidf_wrongfully \n",
+ " overview_tfidf_year \n",
+ " overview_tfidf_years \n",
+ " overview_tfidf_yoda \n",
+ " overview_tfidf_york \n",
+ " overview_tfidf_young \n",
+ " overview_tfidf_younger \n",
+ " overview_tfidf_youngest \n",
+ " overview_tfidf_youth \n",
+ " overview_tfidf_zion \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " ... \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " ... \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " ... \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " ... \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " ... \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
5 rows × 2100 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 442
+ },
+ "id": "56b55a07",
+ "outputId": "1687a006-f922-4d1d-d8f6-b26ba06602b6"
+ },
+ "source": [
+ "# Combine the original dataframe with the TF-IDF features\n",
+ "merged_df_with_tfidf = pd.concat([merged_df, tfidf_overview_df], axis=1)\n",
+ "\n",
+ "print(\"Shape of the DataFrame after adding TF-IDF features:\", merged_df_with_tfidf.shape)\n",
+ "display(merged_df_with_tfidf.head().transpose())"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of the DataFrame after adding TF-IDF features: (98845, 2188)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " 0 1 2 3 4\n",
+ "movieId 1 1 1 1 1\n",
+ "title Toy Story Toy Story Toy Story Toy Story Toy Story\n",
+ "runtime 81.0 81.0 81.0 81.0 81.0\n",
+ "status Released Released Released Released Released\n",
+ "budget 30000000 30000000 30000000 30000000 30000000\n",
+ "... ... ... ... ... ...\n",
+ "overview_tfidf_young 0.0 0.0 0.0 0.0 0.0\n",
+ "overview_tfidf_younger 0.0 0.0 0.0 0.0 0.0\n",
+ "overview_tfidf_youngest 0.0 0.0 0.0 0.0 0.0\n",
+ "overview_tfidf_youth 0.0 0.0 0.0 0.0 0.0\n",
+ "overview_tfidf_zion 0.0 0.0 0.0 0.0 0.0\n",
+ "\n",
+ "[2188 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " title \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " Toy Story \n",
+ " \n",
+ " \n",
+ " runtime \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " 81.0 \n",
+ " \n",
+ " \n",
+ " status \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " \n",
+ " \n",
+ " budget \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " 30000000 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " overview_tfidf_young \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " overview_tfidf_younger \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " overview_tfidf_youngest \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " overview_tfidf_youth \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ " overview_tfidf_zion \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
2188 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(merged_df_with_tfidf\",\n \"rows\": 2188,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 61,\n \"samples\": [\n 1,\n 373554033.0,\n 0.1483635850356549\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 61,\n \"samples\": [\n 1,\n 373554033.0,\n 0.1483635850356549\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 2,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 61,\n \"samples\": [\n 1,\n 373554033.0,\n 0.1483635850356549\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 3,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 62,\n \"samples\": [\n 0.12186235055345666,\n 0.150109190018193,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 4,\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 61,\n \"samples\": [\n 1,\n 373554033.0,\n 0.1483635850356549\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## SVD"
+ ],
+ "metadata": {
+ "id": "MaUHem3V1SBW"
+ }
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 551
+ },
+ "id": "fe5bde81",
+ "outputId": "42e9341e-29f6-4d35-dcbb-4ea6f5d63f2e"
+ },
+ "source": [
+ "# Identify numerical and dummy columns for SVD\n",
+ "# Exclude original text columns and the target variable 'rating'\n",
+ "columns_for_svd = merged_df_with_tfidf.select_dtypes(include=np.number).columns.tolist()\n",
+ "columns_for_svd = [col for col in columns_for_svd if col not in ['rating', 'movieId', 'userId', 'timestamp', 'release_year']] # Exclude non-feature columns and year\n",
+ "\n",
+ "# Check for missing values in the columns selected for SVD\n",
+ "missing_values_in_svd_cols = merged_df_with_tfidf[columns_for_svd].isnull().sum()\n",
+ "print(\"Missing values in columns selected for SVD:\")\n",
+ "display(missing_values_in_svd_cols[missing_values_in_svd_cols > 0])\n",
+ "\n",
+ "# Impute missing values in the identified columns with the median\n",
+ "for col in columns_for_svd:\n",
+ " if merged_df_with_tfidf[col].isnull().any():\n",
+ " median_val = merged_df_with_tfidf[col].median()\n",
+ " merged_df_with_tfidf[col] = merged_df_with_tfidf[col].fillna(median_val)\n",
+ "\n",
+ "# Explicitly fill missing values in the problematic column with 0\n",
+ "if 'production_companies_Warner Bros._mean_vote_average' in merged_df_with_tfidf.columns:\n",
+ " merged_df_with_tfidf['production_companies_Warner Bros._mean_vote_average'] = merged_df_with_tfidf['production_companies_Warner Bros._mean_vote_average'].fillna(0)\n",
+ "\n",
+ "\n",
+ "print(\"\\nMissing values in columns selected for SVD after imputation:\")\n",
+ "missing_values_after_imputation = merged_df_with_tfidf[columns_for_svd].isnull().sum()\n",
+ "display(missing_values_after_imputation[missing_values_after_imputation > 0])"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Missing values in columns selected for SVD:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "genre_Animation_mean_vote 92764\n",
+ "genre_Comedy_mean_vote 63000\n",
+ "genre_Family_mean_vote 87124\n",
+ "genre_Adventure_mean_vote 77656\n",
+ "genre_Fantasy_mean_vote 86888\n",
+ "genre_Romance_mean_vote 80177\n",
+ "genre_Drama_mean_vote 52038\n",
+ "genre_Action_mean_vote 73807\n",
+ "genre_Crime_mean_vote 81428\n",
+ "genre_Thriller_mean_vote 73511\n",
+ "production_companies_Warner Bros._mean_vote_average 98845\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " genre_Animation_mean_vote \n",
+ " 92764 \n",
+ " \n",
+ " \n",
+ " genre_Comedy_mean_vote \n",
+ " 63000 \n",
+ " \n",
+ " \n",
+ " genre_Family_mean_vote \n",
+ " 87124 \n",
+ " \n",
+ " \n",
+ " genre_Adventure_mean_vote \n",
+ " 77656 \n",
+ " \n",
+ " \n",
+ " genre_Fantasy_mean_vote \n",
+ " 86888 \n",
+ " \n",
+ " \n",
+ " genre_Romance_mean_vote \n",
+ " 80177 \n",
+ " \n",
+ " \n",
+ " genre_Drama_mean_vote \n",
+ " 52038 \n",
+ " \n",
+ " \n",
+ " genre_Action_mean_vote \n",
+ " 73807 \n",
+ " \n",
+ " \n",
+ " genre_Crime_mean_vote \n",
+ " 81428 \n",
+ " \n",
+ " \n",
+ " genre_Thriller_mean_vote \n",
+ " 73511 \n",
+ " \n",
+ " \n",
+ " production_companies_Warner Bros._mean_vote_average \n",
+ " 98845 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "Missing values in columns selected for SVD after imputation:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Series([], dtype: int64)"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 632
+ },
+ "id": "f7827ea2",
+ "outputId": "da1a1569-cc02-4588-d1fb-60f6d9d03648"
+ },
+ "source": [
+ "from sklearn.decomposition import TruncatedSVD\n",
+ "\n",
+ "# Create a DataFrame with unique movies for SVD\n",
+ "unique_movies_df = merged_df_with_tfidf.groupby('movieId').first().reset_index()\n",
+ "\n",
+ "# Identify numerical and dummy columns for SVD in the unique movies DataFrame\n",
+ "columns_for_svd_unique = unique_movies_df.select_dtypes(include=np.number).columns.tolist()\n",
+ "# Exclude non-feature columns, year, and the vote columns from SVD\n",
+ "columns_for_svd_unique = [col for col in columns_for_svd_unique if col not in ['rating', 'movieId', 'userId', 'timestamp', 'release_year', 'vote_average', 'vote_count']]\n",
+ "\n",
+ "# Check for missing values in the columns selected for SVD in the unique movies DataFrame\n",
+ "missing_values_in_svd_cols_unique = unique_movies_df[columns_for_svd_unique].isnull().sum()\n",
+ "print(\"Missing values in columns selected for SVD in unique movies DataFrame:\")\n",
+ "display(missing_values_in_svd_cols_unique[missing_values_in_svd_cols_unique > 0])\n",
+ "\n",
+ "# Impute missing values in the identified columns with the median in the unique movies DataFrame\n",
+ "for col in columns_for_svd_unique:\n",
+ " if unique_movies_df[col].isnull().any():\n",
+ " median_val = unique_movies_df[col].median()\n",
+ " unique_movies_df[col] = unique_movies_df[col].fillna(median_val)\n",
+ "\n",
+ "# Explicitly fill missing values in the problematic column with 0 in the unique movies DataFrame\n",
+ "if 'production_companies_Warner Bros._mean_vote_average' in unique_movies_df.columns:\n",
+ " unique_movies_df['production_companies_Warner Bros._mean_vote_average'] = unique_movies_df['production_companies_Warner Bros._mean_vote_average'].fillna(0)\n",
+ "\n",
+ "print(\"\\nMissing values in columns selected for SVD in unique movies DataFrame after imputation:\")\n",
+ "missing_values_after_imputation_unique = unique_movies_df[columns_for_svd_unique].isnull().sum()\n",
+ "display(missing_values_after_imputation_unique[missing_values_after_imputation_unique > 0])\n",
+ "\n",
+ "\n",
+ "# Apply Truncated SVD to the unique movies DataFrame\n",
+ "n_components = 120 # You can adjust this number\n",
+ "svd = TruncatedSVD(n_components=n_components, random_state=42)\n",
+ "svd_matrix_unique = svd.fit_transform(unique_movies_df[columns_for_svd_unique])\n",
+ "\n",
+ "# Create a DataFrame from the SVD results for unique movies\n",
+ "svd_df_unique = pd.DataFrame(svd_matrix_unique, columns=[f'svd_{i+1}' for i in range(n_components)], index=unique_movies_df.index)\n",
+ "\n",
+ "# Combine the SVD features with the unique movies DataFrame (excluding the columns used for SVD)\n",
+ "columns_to_drop_after_svd_unique = [col for col in columns_for_svd_unique if col not in ['vote_average', 'vote_count']]\n",
+ "unique_movies_reduced = unique_movies_df.drop(columns=columns_to_drop_after_svd_unique).copy()\n",
+ "unique_movies_reduced = pd.concat([unique_movies_reduced, svd_df_unique], axis=1)\n",
+ "\n",
+ "\n",
+ "print(\"Shape of the unique movies DataFrame after applying Truncated SVD:\", unique_movies_reduced.shape)\n",
+ "display(unique_movies_reduced.head().transpose())"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Missing values in columns selected for SVD in unique movies DataFrame:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Series([], dtype: int64)"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "Missing values in columns selected for SVD in unique movies DataFrame after imputation:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Series([], dtype: int64)"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of the unique movies DataFrame after applying Truncated SVD: (8972, 136)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " 0 1 2 3 \\\n",
+ "movieId 1 2 3 4 \n",
+ "title Toy Story Jumanji Grumpier Old Men Waiting to Exhale \n",
+ "status Released Released Released Released \n",
+ "vote_average 7.7 6.9 6.5 6.1 \n",
+ "vote_count 5415.0 2413.0 92.0 34.0 \n",
+ "... ... ... ... ... \n",
+ "svd_116 -0.025538 -0.026685 0.023318 -0.038171 \n",
+ "svd_117 -0.01152 -0.007647 -0.002787 -0.004564 \n",
+ "svd_118 -0.018803 -0.057127 -0.016533 -0.051838 \n",
+ "svd_119 0.004499 0.009964 -0.009325 0.001978 \n",
+ "svd_120 -0.02886 -0.034346 -0.059707 -0.005955 \n",
+ "\n",
+ " 4 \n",
+ "movieId 5 \n",
+ "title Father of the Bride Part II \n",
+ "status Released \n",
+ "vote_average 5.7 \n",
+ "vote_count 173.0 \n",
+ "... ... \n",
+ "svd_116 -0.014638 \n",
+ "svd_117 -0.0166 \n",
+ "svd_118 -0.030122 \n",
+ "svd_119 0.026348 \n",
+ "svd_120 0.047427 \n",
+ "\n",
+ "[136 rows x 5 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " 1 \n",
+ " 2 \n",
+ " 3 \n",
+ " 4 \n",
+ " 5 \n",
+ " \n",
+ " \n",
+ " title \n",
+ " Toy Story \n",
+ " Jumanji \n",
+ " Grumpier Old Men \n",
+ " Waiting to Exhale \n",
+ " Father of the Bride Part II \n",
+ " \n",
+ " \n",
+ " status \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " Released \n",
+ " \n",
+ " \n",
+ " vote_average \n",
+ " 7.7 \n",
+ " 6.9 \n",
+ " 6.5 \n",
+ " 6.1 \n",
+ " 5.7 \n",
+ " \n",
+ " \n",
+ " vote_count \n",
+ " 5415.0 \n",
+ " 2413.0 \n",
+ " 92.0 \n",
+ " 34.0 \n",
+ " 173.0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " svd_116 \n",
+ " -0.025538 \n",
+ " -0.026685 \n",
+ " 0.023318 \n",
+ " -0.038171 \n",
+ " -0.014638 \n",
+ " \n",
+ " \n",
+ " svd_117 \n",
+ " -0.01152 \n",
+ " -0.007647 \n",
+ " -0.002787 \n",
+ " -0.004564 \n",
+ " -0.0166 \n",
+ " \n",
+ " \n",
+ " svd_118 \n",
+ " -0.018803 \n",
+ " -0.057127 \n",
+ " -0.016533 \n",
+ " -0.051838 \n",
+ " -0.030122 \n",
+ " \n",
+ " \n",
+ " svd_119 \n",
+ " 0.004499 \n",
+ " 0.009964 \n",
+ " -0.009325 \n",
+ " 0.001978 \n",
+ " 0.026348 \n",
+ " \n",
+ " \n",
+ " svd_120 \n",
+ " -0.02886 \n",
+ " -0.034346 \n",
+ " -0.059707 \n",
+ " -0.005955 \n",
+ " 0.047427 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
136 rows × 5 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(unique_movies_reduced\",\n \"rows\": 136,\n \"fields\": [\n {\n \"column\": 0,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 136,\n \"samples\": [\n 0.040907983782471,\n -0.019373394287205255,\n 0.03131139116848241\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 1,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 135,\n \"samples\": [\n -0.001120277504440442,\n 0.015016946715529673,\n 0.0008380668530291163\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 2,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 136,\n \"samples\": [\n 0.016547831494790517,\n 0.8309301292886759,\n 0.009506245624910233\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 3,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 136,\n \"samples\": [\n -0.011711636108982999,\n -0.3074299745453618,\n -0.011264538157491007\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": 4,\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 136,\n \"samples\": [\n -0.019882749169734447,\n -0.04996201303605531,\n -0.011359787723998309\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Modeling"
+ ],
+ "metadata": {
+ "id": "7Et4oYa61ROU"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2d846f72"
+ },
+ "source": [
+ "## Model 1: Popularity Baseline\n",
+ "\n",
+ "Implement a simple baseline that recommends the most popular movies globally using the IMDb weighted-rating (WR) formula."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 474
+ },
+ "id": "bfebb3f8",
+ "outputId": "b9876c9e-4711-43e7-ce67-8ba1a71922e9"
+ },
+ "source": [
+ "# Calculate C (mean rating across all movies)\n",
+ "C = unique_movies_reduced['vote_average'].mean()\n",
+ "print(f\"Mean rating across all movies (C): {C:.2f}\")\n",
+ "\n",
+ "# Calculate m (minimum number of votes required to be listed in the top 250)\n",
+ "# We'll use the 90th percentile as a threshold for simplicity\n",
+ "m = unique_movies_reduced['vote_count'].quantile(0.90)\n",
+ "print(f\"Minimum number of votes (m) for top list consideration (90th percentile): {m:.2f}\")\n",
+ "\n",
+ "# Filter out movies that have fewer than m votes\n",
+ "qualified_movies = unique_movies_reduced[unique_movies_reduced['vote_count'] >= m].copy()\n",
+ "print(f\"\\nNumber of movies qualified for weighted rating calculation: {qualified_movies.shape[0]}\")\n",
+ "\n",
+ "# Define the IMDb weighted rating formula\n",
+ "def weighted_rating(x, m=m, C=C):\n",
+ " v = x['vote_count']\n",
+ " R = x['vote_average']\n",
+ " # Calculation based on the IMDb formula\n",
+ " return (v / (v + m) * R) + (m / (v + m) * C)\n",
+ "\n",
+ "# Apply the weighted_rating function to the qualified movies DataFrame\n",
+ "qualified_movies['weighted_rating'] = qualified_movies.apply(weighted_rating, axis=1)\n",
+ "\n",
+ "# Sort movies by weighted rating in descending order\n",
+ "popular_movies = qualified_movies.sort_values('weighted_rating', ascending=False)\n",
+ "\n",
+ "# Group by movie ID and take the first occurrence to show unique movies\n",
+ "popular_movies_unique = popular_movies.groupby('movieId').first().reset_index()\n",
+ "\n",
+ "print(\"\\nTop 10 Popular Movies based on Weighted Rating:\")\n",
+ "display(popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(10))"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mean rating across all movies (C): 6.37\n",
+ "Minimum number of votes (m) for top list consideration (90th percentile): 1092.90\n",
+ "\n",
+ "Number of movies qualified for weighted rating calculation: 898\n",
+ "\n",
+ "Top 10 Popular Movies based on Weighted Rating:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " title vote_count vote_average weighted_rating\n",
+ "0 Toy Story 5415.0 7.7 7.475872\n",
+ "1 Jumanji 2413.0 6.9 6.733342\n",
+ "2 Heat 1886.0 7.7 7.210355\n",
+ "3 GoldenEye 1194.0 6.6 6.487877\n",
+ "4 Casino 1343.0 7.8 7.156339\n",
+ "5 Ace Ventura: When Nature Calls 1128.0 6.1 6.230593\n",
+ "6 Twelve Monkeys 2470.0 7.4 7.082636\n",
+ "7 Se7en 5915.0 8.1 7.829482\n",
+ "8 Pocahontas 1509.0 6.7 6.559447\n",
+ "9 The Usual Suspects 3334.0 8.1 7.671762"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " title \n",
+ " vote_count \n",
+ " vote_average \n",
+ " weighted_rating \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Toy Story \n",
+ " 5415.0 \n",
+ " 7.7 \n",
+ " 7.475872 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Jumanji \n",
+ " 2413.0 \n",
+ " 6.9 \n",
+ " 6.733342 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Heat \n",
+ " 1886.0 \n",
+ " 7.7 \n",
+ " 7.210355 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " GoldenEye \n",
+ " 1194.0 \n",
+ " 6.6 \n",
+ " 6.487877 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Casino \n",
+ " 1343.0 \n",
+ " 7.8 \n",
+ " 7.156339 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Ace Ventura: When Nature Calls \n",
+ " 1128.0 \n",
+ " 6.1 \n",
+ " 6.230593 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " Twelve Monkeys \n",
+ " 2470.0 \n",
+ " 7.4 \n",
+ " 7.082636 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Se7en \n",
+ " 5915.0 \n",
+ " 8.1 \n",
+ " 7.829482 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Pocahontas \n",
+ " 1509.0 \n",
+ " 6.7 \n",
+ " 6.559447 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " The Usual Suspects \n",
+ " 3334.0 \n",
+ " 8.1 \n",
+ " 7.671762 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"display(popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Pocahontas\",\n \"Jumanji\",\n \"Ace Ventura: When Nature Calls\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1728.550327104575,\n \"min\": 1128.0,\n \"max\": 5915.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 1509.0,\n 2413.0,\n 1128.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6919376979018976,\n \"min\": 6.1,\n \"max\": 8.1,\n \"num_unique_values\": 8,\n \"samples\": [\n 6.9,\n 7.4,\n 7.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"weighted_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.531410019489584,\n \"min\": 6.23059349726819,\n \"max\": 7.829481741760431,\n \"num_unique_values\": 10,\n \"samples\": [\n 6.55944698031551,\n 6.73334239370288,\n 6.23059349726819\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5fe22106"
+ },
+ "source": [
+ "## Model 2: Content-Based (CB) Recommender\n",
+ "\n",
+ "Using the item feature vectors, build user profiles. A user profile is the rating-weighted average of the vectors of the movies they have rated."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ff0ee4d4",
+ "outputId": "be1318a9-6762-4089-e2c7-1e390763ab78"
+ },
+ "source": [
+ "# Create a mapping from movieId to the index in the unique_movies_reduced for easier lookup\n",
+ "movie_id_to_index = pd.Series(unique_movies_reduced.index, index=unique_movies_reduced['movieId']).to_dict()\n",
+ "\n",
+ "# Select only the SVD features for building user profiles\n",
+ "# Exclude non-feature columns, original text columns, and the target variable 'rating'\n",
+ "svd_features = unique_movies_reduced.filter(like='svd_')\n",
+ "\n",
+ "\n",
+ "# Function to create a user profile\n",
+ "def create_user_profile(user_ratings, svd_features, movie_id_to_index):\n",
+ " \"\"\"\n",
+ " Creates a user profile vector based on the rating-weighted average of movie SVD features.\n",
+ "\n",
+ " Args:\n",
+ " user_ratings (DataFrame): DataFrame containing ratings for a single user (movieId, rating).\n",
+ " svd_features (DataFrame): DataFrame containing SVD features for all movies.\n",
+ " movie_id_to_index (dict): Mapping from movieId to the index in svd_features.\n",
+ "\n",
+ " Returns:\n",
+ " numpy.ndarray: The user profile vector.\n",
+ " \"\"\"\n",
+ " user_profile = np.zeros(svd_features.shape[1])\n",
+ " total_weight = 0\n",
+ "\n",
+ " for index, row in user_ratings.iterrows():\n",
+ " movie_id = int(row['movieId'])\n",
+ " rating = row['rating']\n",
+ "\n",
+ " # Get the index of the movie in the svd_features DataFrame\n",
+ " movie_index = movie_id_to_index.get(movie_id)\n",
+ "\n",
+ " if movie_index is not None:\n",
+ " # Get the SVD features for the movie\n",
+ " movie_features = svd_features.loc[movie_index].values\n",
+ "\n",
+ " # Add the rating-weighted features to the user profile\n",
+ " user_profile += movie_features * rating\n",
+ " total_weight += rating\n",
+ "\n",
+ " # Normalize the user profile by the total weight\n",
+ " if total_weight > 0:\n",
+ " user_profile /= total_weight\n",
+ "\n",
+ " return user_profile\n",
+ "\n",
+ "# Create user profiles for all users\n",
+ "user_profiles = {}\n",
+ "for user_id in unique_movies_reduced['userId'].unique():\n",
+ " user_ratings = unique_movies_reduced[unique_movies_reduced['userId'] == user_id][['movieId', 'rating']]\n",
+ " user_profiles[user_id] = create_user_profile(user_ratings, svd_features, movie_id_to_index)\n",
+ "\n",
+ "print(\"Created user profiles for\", len(user_profiles), \"users.\")\n",
+ "# You can inspect a sample user profile\n",
+ "# display(user_profiles[list(user_profiles.keys())[0]])"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Created user profiles for 431 users.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 801
+ },
+ "id": "c5d4950c",
+ "outputId": "7a013b9e-9292-4a52-9e65-99e9678303f4"
+ },
+ "source": [
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "# Function to get content-based recommendations for a user\n",
+ "def get_content_based_recommendations(user_id, merged_df_with_tfidf, unique_movies_reduced, user_profiles, top_n=10):\n",
+ " \"\"\"\n",
+ " Generates content-based recommendations for a given user.\n",
+ "\n",
+ " Args:\n",
+ " user_id (int): The ID of the user for whom to generate recommendations.\n",
+ " merged_df_with_tfidf (DataFrame): Original DataFrame containing movie data and user ratings.\n",
+ " unique_movies_reduced (DataFrame): DataFrame with unique movies and their SVD features.\n",
+ " user_profiles (dict): Dictionary of user profile vectors.\n",
+ " top_n (int): The number of recommendations to generate.\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame: A DataFrame containing the top N recommended movies.\n",
+ " \"\"\"\n",
+ " # Handle cold-start users: If the user has no profile, fall back to popularity baseline\n",
+ " if user_id not in user_profiles or np.all(user_profiles[user_id] == 0):\n",
+ " print(f\"User {user_id} is a cold-start user. Falling back to popularity baseline.\")\n",
+ " # Assuming 'popular_movies_unique' from the popularity baseline is available\n",
+ " if 'popular_movies_unique' in globals():\n",
+ " return popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(top_n)\n",
+ " else:\n",
+ " print(\"Popularity baseline not available. Cannot provide recommendations for cold-start user.\")\n",
+ " return pd.DataFrame()\n",
+ "\n",
+ " user_profile = user_profiles[user_id]\n",
+ "\n",
+ " # Select only the SVD features for similarity calculation from the unique movies DataFrame\n",
+ " svd_features_unique = unique_movies_reduced.filter(like='svd_')\n",
+ "\n",
+ " # Calculate cosine similarity between the user profile and all unique movie SVD features\n",
+ " # Reshape user_profile to be a 2D array as required by cosine_similarity\n",
+ " cosine_sim_unique = cosine_similarity(user_profile.reshape(1, -1), svd_features_unique)\n",
+ "\n",
+ " # Get the similarity scores for all unique movies\n",
+ " sim_scores_unique = list(enumerate(cosine_sim_unique[0]))\n",
+ "\n",
+ " # Sort unique movies based on the similarity scores\n",
+ " sim_scores_unique = sorted(sim_scores_unique, key=lambda x: x[1], reverse=True)\n",
+ "\n",
+ " # Get the list of movies the user has already rated from the original merged DataFrame\n",
+ " rated_movie_ids = merged_df_with_tfidf[merged_df_with_tfidf['userId'] == user_id]['movieId'].tolist()\n",
+ "\n",
+ " # Get the top N most similar unique movies (excluding the movies the user has already rated)\n",
+ " recommended_movies_indices_unique = [i[0] for i in sim_scores_unique if unique_movies_reduced.iloc[i[0]]['movieId'] not in rated_movie_ids]\n",
+ "\n",
+ " # Get the top N recommended movies from the unique_movies_reduced DataFrame\n",
+ " recommended_movies_unique = unique_movies_reduced.iloc[recommended_movies_indices_unique][['title', 'vote_average', 'vote_count']].head(top_n)\n",
+ "\n",
+ " return recommended_movies_unique.reset_index(drop=True)\n",
+ "\n",
+ "# Example usage: Get recommendations for a sample user (e.g., user ID 1)\n",
+ "sample_user_id = 1\n",
+ "content_based_recommendations = get_content_based_recommendations(sample_user_id, merged_df_with_tfidf, unique_movies_reduced, user_profiles, top_n=10)\n",
+ "\n",
+ "print(f\"\\nContent-Based Recommendations for User {sample_user_id}:\")\n",
+ "display(content_based_recommendations)\n",
+ "\n",
+ "# Example usage for a cold-start user (assuming a user ID that doesn't exist in user_profiles)\n",
+ "cold_start_user_id = 9999 # Replace with a user ID not in your data\n",
+ "cold_start_recommendations = get_content_based_recommendations(cold_start_user_id, merged_df_with_tfidf, unique_movies_reduced, user_profiles, top_n=10)\n",
+ "\n",
+ "print(f\"\\nContent-Based Recommendations for Cold-Start User {cold_start_user_id}:\")\n",
+ "display(cold_start_recommendations)"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "Content-Based Recommendations for User 1:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " title vote_average vote_count\n",
+ "0 The Naked Gun: From the Files of Police Squad! 7.1 1020.0\n",
+ "1 The Boy 5.8 1133.0\n",
+ "2 City of Angels 6.5 537.0\n",
+ "3 Conan the Barbarian 6.6 663.0\n",
+ "4 Smokin' Aces 6.4 541.0\n",
+ "5 The American 5.8 488.0\n",
+ "6 The Crow 7.3 980.0\n",
+ "7 Once Upon a Time in Mexico 6.2 605.0\n",
+ "8 The Way Way Back 7.1 695.0\n",
+ "9 Maid in Manhattan 5.6 493.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " title \n",
+ " vote_average \n",
+ " vote_count \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " The Naked Gun: From the Files of Police Squad! \n",
+ " 7.1 \n",
+ " 1020.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " The Boy \n",
+ " 5.8 \n",
+ " 1133.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " City of Angels \n",
+ " 6.5 \n",
+ " 537.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Conan the Barbarian \n",
+ " 6.6 \n",
+ " 663.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Smokin' Aces \n",
+ " 6.4 \n",
+ " 541.0 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " The American \n",
+ " 5.8 \n",
+ " 488.0 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " The Crow \n",
+ " 7.3 \n",
+ " 980.0 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Once Upon a Time in Mexico \n",
+ " 6.2 \n",
+ " 605.0 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " The Way Way Back \n",
+ " 7.1 \n",
+ " 695.0 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Maid in Manhattan \n",
+ " 5.6 \n",
+ " 493.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "content_based_recommendations",
+ "summary": "{\n \"name\": \"content_based_recommendations\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"The Way Way Back\",\n \"The Boy\",\n \"The American\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.598516684999025,\n \"min\": 5.6,\n \"max\": 7.3,\n \"num_unique_values\": 8,\n \"samples\": [\n 5.8,\n 7.3,\n 7.1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 239.4318506613335,\n \"min\": 488.0,\n \"max\": 1133.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 695.0,\n 1133.0,\n 488.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "User 9999 is a cold-start user. Falling back to popularity baseline.\n",
+ "\n",
+ "Content-Based Recommendations for Cold-Start User 9999:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " title vote_count vote_average weighted_rating\n",
+ "0 Toy Story 5415.0 7.7 7.475872\n",
+ "1 Jumanji 2413.0 6.9 6.733342\n",
+ "2 Heat 1886.0 7.7 7.210355\n",
+ "3 GoldenEye 1194.0 6.6 6.487877\n",
+ "4 Casino 1343.0 7.8 7.156339\n",
+ "5 Ace Ventura: When Nature Calls 1128.0 6.1 6.230593\n",
+ "6 Twelve Monkeys 2470.0 7.4 7.082636\n",
+ "7 Se7en 5915.0 8.1 7.829482\n",
+ "8 Pocahontas 1509.0 6.7 6.559447\n",
+ "9 The Usual Suspects 3334.0 8.1 7.671762"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " title \n",
+ " vote_count \n",
+ " vote_average \n",
+ " weighted_rating \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Toy Story \n",
+ " 5415.0 \n",
+ " 7.7 \n",
+ " 7.475872 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Jumanji \n",
+ " 2413.0 \n",
+ " 6.9 \n",
+ " 6.733342 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Heat \n",
+ " 1886.0 \n",
+ " 7.7 \n",
+ " 7.210355 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " GoldenEye \n",
+ " 1194.0 \n",
+ " 6.6 \n",
+ " 6.487877 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Casino \n",
+ " 1343.0 \n",
+ " 7.8 \n",
+ " 7.156339 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Ace Ventura: When Nature Calls \n",
+ " 1128.0 \n",
+ " 6.1 \n",
+ " 6.230593 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " Twelve Monkeys \n",
+ " 2470.0 \n",
+ " 7.4 \n",
+ " 7.082636 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Se7en \n",
+ " 5915.0 \n",
+ " 8.1 \n",
+ " 7.829482 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Pocahontas \n",
+ " 1509.0 \n",
+ " 6.7 \n",
+ " 6.559447 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " The Usual Suspects \n",
+ " 3334.0 \n",
+ " 8.1 \n",
+ " 7.671762 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "cold_start_recommendations",
+ "summary": "{\n \"name\": \"cold_start_recommendations\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Pocahontas\",\n \"Jumanji\",\n \"Ace Ventura: When Nature Calls\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1728.550327104575,\n \"min\": 1128.0,\n \"max\": 5915.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 1509.0,\n 2413.0,\n 1128.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6919376979018976,\n \"min\": 6.1,\n \"max\": 8.1,\n \"num_unique_values\": 8,\n \"samples\": [\n 6.9,\n 7.4,\n 7.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"weighted_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.531410019489584,\n \"min\": 6.23059349726819,\n \"max\": 7.829481741760431,\n \"num_unique_values\": 10,\n \"samples\": [\n 6.55944698031551,\n 6.73334239370288,\n 6.23059349726819\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "173dc1ae"
+ },
+ "source": [
+ "## Model 3: Collaborative Filtering (CF) Recommenders\n",
+ "\n",
+ "Implement two CF approaches:\n",
+ "\n",
+ "A neighbourhood-based model, such as user-user or item-item k-Nearest Neighbors (k-NN) with cosine or Pearson similarity."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "d9de38bd",
+ "outputId": "87337cfc-44ab-4a49-c38e-bd5554fe274c"
+ },
+ "source": [
+ "from surprise import Dataset, Reader, KNNBasic\n",
+ "from surprise.model_selection import train_test_split\n",
+ "from surprise import accuracy\n",
+ "\n",
+ "# Load the ratings data into Surprise's format\n",
+ "reader = Reader(rating_scale=(1, 5))\n",
+ "data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)\n",
+ "\n",
+ "# Split the data into training and test sets\n",
+ "trainset, testset = train_test_split(data, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# Implement a user-user k-NN model\n",
+ "# You can experiment with different similarity metrics and k values\n",
+ "knn_user_based = KNNBasic(sim_options={'user_based': True, 'similarity': 'cosine'}, k=40)\n",
+ "\n",
+ "# Train the model\n",
+ "knn_user_based.fit(trainset)\n",
+ "\n",
+ "# Make predictions on the test set\n",
+ "predictions = knn_user_based.test(testset)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "rmse = accuracy.rmse(predictions)\n",
+ "print(f\"User-based k-NN RMSE: {rmse}\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Computing the msd similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "RMSE: 0.9663\n",
+ "User-based k-NN RMSE: 0.9662515187787728\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "46fbe815",
+ "outputId": "b705a277-4d20-4fbb-e75b-41eab57826cf"
+ },
+ "source": [
+ "from surprise import SVD\n",
+ "\n",
+ "# Implement a Matrix Factorization model using SVD\n",
+ "# You can experiment with different hyperparameters (n_factors, n_epochs, lr_all, reg_all)\n",
+ "svd_mf = SVD(random_state=42)\n",
+ "\n",
+ "# Train the model on the training set\n",
+ "svd_mf.fit(trainset)\n",
+ "\n",
+ "# Make predictions on the test set\n",
+ "predictions_mf = svd_mf.test(testset)\n",
+ "\n",
+ "# Evaluate the model\n",
+ "rmse_mf = accuracy.rmse(predictions_mf)\n",
+ "print(f\"Matrix Factorization (SVD) RMSE: {rmse_mf}\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "RMSE: 0.9023\n",
+ "Matrix Factorization (SVD) RMSE: 0.9023287246946667\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ee59054c"
+ },
+ "source": [
+ "## Model 4: Hybrid Recommender\n",
+ "\n",
+ "Combine the predictions from your CB and CF models.\n",
+ "\n",
+ "The simplest way is a weighted blend: $s_{hyb} = \\alpha \\cdot s_{CF} + (1-\\alpha) \\cdot s_{CB}$.\n",
+ "\n",
+ "You must tune the blending weight $\\alpha$ on a validation set to find the optimal mix."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "59b15942",
+ "outputId": "e6495e0b-8fa3-43cb-9bdb-da4127499e9e"
+ },
+ "source": [
+ "# To create a hybrid model, we first need a way to get predictions from our existing models.\n",
+ "# For the Content-Based model, we have the get_content_based_recommendations function,\n",
+ "# but for blending, we need predicted scores for specific user-movie pairs, not just top N recommendations.\n",
+ "# We'll need to adapt the content-based approach slightly to get similarity scores for any user-movie pair.\n",
+ "\n",
+ "# For the Collaborative Filtering models (k-NN and SVD), the surprise library provides a predict method.\n",
+ "\n",
+ "# Let's first create a function to get the content-based similarity score for a specific user and movie.\n",
+ "\n",
+ "def get_content_based_score(user_id, movie_id, merged_df_with_tfidf, unique_movies_reduced, user_profiles):\n",
+ " \"\"\"\n",
+ " Calculates the content-based similarity score between a user's profile and a specific movie.\n",
+ "\n",
+ " Args:\n",
+ " user_id (int): The ID of the user.\n",
+ " movie_id (int): The ID of the movie.\n",
+ " merged_df_with_tfidf (DataFrame): Original DataFrame with movie data and user ratings.\n",
+ " unique_movies_reduced (DataFrame): DataFrame with unique movies and their SVD features.\n",
+ " user_profiles (dict): Dictionary of user profile vectors.\n",
+ "\n",
+ " Returns:\n",
+ " float: The content-based similarity score, or 0 if the user or movie is not found or is a cold-start user.\n",
+ " \"\"\"\n",
+ " # Handle cold-start users\n",
+ " if user_id not in user_profiles or np.all(user_profiles[user_id] == 0):\n",
+ " return 0.0 # Or some other default for cold-start in blending\n",
+ "\n",
+ " user_profile = user_profiles[user_id]\n",
+ "\n",
+ " # Select only the SVD features for the specific movie\n",
+ " movie_index_unique = unique_movies_reduced[unique_movies_reduced['movieId'] == movie_id].index\n",
+ " if movie_index_unique.empty:\n",
+ " return 0.0 # Movie not in our unique features\n",
+ "\n",
+ " movie_features_unique = unique_movies_reduced.loc[movie_index_unique].filter(like='svd_').values\n",
+ "\n",
+ " # Calculate cosine similarity\n",
+ " # Reshape user_profile and movie_features_unique to be 2D arrays\n",
+ " cosine_sim_score = cosine_similarity(user_profile.reshape(1, -1), movie_features_unique.reshape(1, -1))[0][0]\n",
+ "\n",
+ " return cosine_sim_score\n",
+ "\n",
+ "# Now, let's create a function to blend the predictions\n",
+ "\n",
+ "def hybrid_prediction(user_id, movie_id, alpha, cb_model, cf_model1, cf_model2, merged_df_with_tfidf, unique_movies_reduced, user_profiles):\n",
+ " \"\"\"\n",
+ " Calculates a hybrid prediction by blending content-based and collaborative filtering scores.\n",
+ "\n",
+ " Args:\n",
+ " user_id (int): The ID of the user.\n",
+ " movie_id (int): The ID of the movie.\n",
+ " alpha (float): The blending weight (0 <= alpha <= 1).\n",
+ " cb_model (function): Function to get content-based score (get_content_based_score).\n",
+ " cf_model1 (surprise.prediction_algorithms.algo_base.AlgoBase): Trained Surprise CF model 1 (e.g., k-NN).\n",
+ " cf_model2 (surprise.prediction_algorithms.algo_base.AlgoBase): Trained Surprise CF model 2 (e.g., SVD).\n",
+ " merged_df_with_tfidf (DataFrame): Original DataFrame with movie data and user ratings.\n",
+ " unique_movies_reduced (DataFrame): DataFrame with unique movies and their SVD features.\n",
+ " user_profiles (dict): Dictionary of user profile vectors.\n",
+ "\n",
+ "\n",
+ " Returns:\n",
+ " float: The hybrid predicted rating.\n",
+ " \"\"\"\n",
+ " # Get content-based score\n",
+ " cb_score = cb_model(user_id, movie_id, merged_df_with_tfidf, unique_movies_reduced, user_profiles)\n",
+ "\n",
+ " # Get collaborative filtering predictions\n",
+ " # Surprise predict method takes inner user/item ids. We need to map external ids.\n",
+ " # For simplicity here, we'll assume the movie_id is the external id.\n",
+ " # We need to get the inner ids from the trainset if we want to use predict directly.\n",
+ " # A simpler approach for blending is to use the predicted ratings from the testset if available,\n",
+ " # or predict for specific user-movie pairs using the trained models.\n",
+ "\n",
+ " # Let's use the predict method from Surprise models for specific user-movie pairs\n",
+ " # We need to handle cases where a user/movie might not be in the training set for CF models\n",
+ " try:\n",
+ " cf1_pred = cf_model1.predict(str(user_id), str(movie_id)).est # Use .est to get the estimated rating\n",
+ " except Exception as e:\n",
+ " # Handle cases where user/movie is not in the training set for CF model 1\n",
+ " cf1_pred = 0 # Or a neutral rating like the global average\n",
+ "\n",
+ " try:\n",
+ " cf2_pred = cf_model2.predict(str(user_id), str(movie_id)).est # Use .est to get the estimated rating\n",
+ " except Exception as e:\n",
+ " # Handle cases where user/movie is not in the training set for CF model 2\n",
+ " cf2_pred = 0 # Or a neutral rating like the global average\n",
+ "\n",
+ " # Simple average of the two CF models for the CF component of the hybrid\n",
+ " cf_score = (cf1_pred + cf2_pred) / 2.0\n",
+ "\n",
+ " # Blend the scores\n",
+ " hybrid_score = alpha * cf_score + (1 - alpha) * cb_score\n",
+ "\n",
+ " return hybrid_score\n",
+ "\n",
+ "# Example of getting a hybrid prediction for a user and movie (replace with actual user_id and movie_id)\n",
+ "# You would typically do this for user-movie pairs you want to predict ratings for.\n",
+ "sample_user_id_for_hybrid = 1\n",
+ "sample_movie_id_for_hybrid = 50 # Example movie ID\n",
+ "\n",
+ "# We need the trained CF models (knn_user_based and svd_mf) from previous steps\n",
+ "# and the user_profiles, merged_df_with_tfidf, unique_movies_reduced dataframes.\n",
+ "\n",
+ "# Example prediction (using a placeholder alpha value for now)\n",
+ "alpha_example = 0.5\n",
+ "predicted_rating = hybrid_prediction(\n",
+ " sample_user_id_for_hybrid,\n",
+ " sample_movie_id_for_hybrid,\n",
+ " alpha_example,\n",
+ " get_content_based_score,\n",
+ " knn_user_based,\n",
+ " svd_mf,\n",
+ " merged_df_with_tfidf,\n",
+ " unique_movies_reduced,\n",
+ " user_profiles\n",
+ ")\n",
+ "\n",
+ "print(f\"\\nHybrid predicted rating for User {sample_user_id_for_hybrid} and Movie {sample_movie_id_for_hybrid} (with alpha={alpha_example}): {predicted_rating:.2f}\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "Hybrid predicted rating for User 1 and Movie 50 (with alpha=0.5): 2.27\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 261
+ },
+ "id": "7fc18bfe",
+ "outputId": "b4215d7f-7509-453e-f31c-629fe74fc5b6"
+ },
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "from surprise.model_selection import train_test_split as surprise_train_test_split\n",
+ "\n",
+ "# Split the merged data for the Content-Based model (if needed for evaluating CB separately)\n",
+ "# However, for tuning the hybrid, we mainly need the ratings split for CF and for filtering seen movies.\n",
+ "# We will use the Surprise trainset and testset (which we'll treat as validation set for tuning alpha)\n",
+ "# from the previous CF step. Let's ensure we have access to those.\n",
+ "\n",
+ "# Assuming 'data' (Surprise Dataset) and 'trainset', 'testset' (Surprise train/test splits)\n",
+ "# are available from the previous CF implementation steps.\n",
+ "\n",
+ "# For blending, we need a way to get actual ratings for the user-movie pairs in the testset (validation set).\n",
+ "# We can convert the Surprise testset back to a pandas DataFrame for easier merging and lookup.\n",
+ "testset_df = pd.DataFrame(testset, columns=['userId', 'movieId', 'rating'])\n",
+ "\n",
+ "print(\"Shape of the testset DataFrame (used for alpha tuning):\", testset_df.shape)\n",
+ "display(testset_df.head())\n",
+ "\n",
+ "# We already have the trained CF models (knn_user_based and svd_mf) from the previous steps,\n",
+ "# which were trained on the 'trainset'.\n",
+ "\n",
+ "# For the Content-Based model, the user profiles ('user_profiles') were created using the\n",
+ "# entire merged_df_with_tfidf. For a proper evaluation and tuning, we should ideally\n",
+ "# recreate user profiles using only the training data ratings.\n",
+ "\n",
+ "# Let's redefine the create_user_profile function to work with a specific set of ratings (e.g., the training set ratings)\n",
+ "def create_user_profile_from_ratings(user_ratings_df, unique_movies_reduced, movie_id_to_index):\n",
+ " \"\"\"\n",
+ " Creates a user profile vector based on the rating-weighted average of movie SVD features,\n",
+ " using ratings from a specified DataFrame.\n",
+ "\n",
+ " Args:\n",
+ " user_ratings_df (DataFrame): DataFrame containing ratings for multiple users (userId, movieId, rating).\n",
+ " unique_movies_reduced (DataFrame): DataFrame with unique movies and their SVD features.\n",
+ " movie_id_to_index (dict): Mapping from movieId to the index in unique_movies_reduced.\n",
+ "\n",
+ " Returns:\n",
+ " dict: Dictionary of user profile vectors, keyed by userId.\n",
+ " \"\"\"\n",
+ " user_profiles = {}\n",
+ " svd_features = unique_movies_reduced.filter(like='svd_')\n",
+ "\n",
+ " for user_id in user_ratings_df['userId'].unique():\n",
+ " single_user_ratings = user_ratings_df[user_ratings_df['userId'] == user_id]\n",
+ " user_profile = np.zeros(svd_features.shape[1])\n",
+ " total_weight = 0\n",
+ "\n",
+ " for index, row in single_user_ratings.iterrows():\n",
+ " movie_id = int(row['movieId'])\n",
+ " rating = row['rating']\n",
+ "\n",
+ " movie_index = movie_id_to_index.get(movie_id)\n",
+ "\n",
+ " if movie_index is not None:\n",
+ " movie_features = svd_features.loc[movie_index].values\n",
+ " user_profile += movie_features * rating\n",
+ " total_weight += rating\n",
+ "\n",
+ " if total_weight > 0:\n",
+ " user_profile /= total_weight\n",
+ " user_profiles[user_id] = user_profile\n",
+ "\n",
+ " return user_profiles\n",
+ "\n",
+ "# Get training ratings from the Surprise trainset\n",
+ "train_ratings_df = pd.DataFrame(trainset.all_ratings(), columns=['uid', 'iid', 'rating'])\n",
+ "# Map inner IDs back to external IDs (userId and movieId)\n",
+ "train_ratings_df['userId'] = train_ratings_df['uid'].apply(lambda x: trainset.to_raw_uid(x))\n",
+ "train_ratings_df['movieId'] = train_ratings_df['iid'].apply(lambda x: trainset.to_raw_iid(x))\n",
+ "train_ratings_df = train_ratings_df[['userId', 'movieId', 'rating']]\n",
+ "\n",
+ "# Recreate user profiles using only the training set ratings\n",
+ "# We need the movie_id_to_index mapping which was based on unique_movies_reduced\n",
+ "movie_id_to_index = pd.Series(unique_movies_reduced.index, index=unique_movies_reduced['movieId']).to_dict()\n",
+ "\n",
+ "user_profiles_train = create_user_profile_from_ratings(train_ratings_df, unique_movies_reduced, movie_id_to_index)\n",
+ "\n",
+ "print(f\"\\nCreated user profiles using training data for {len(user_profiles_train)} users.\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shape of the testset DataFrame (used for alpha tuning): (20001, 3)\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " userId movieId rating\n",
+ "0 15 95875 1.5\n",
+ "1 664 3081 4.0\n",
+ "2 171 2770 4.0\n",
+ "3 355 589 4.0\n",
+ "4 505 86882 5.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " userId \n",
+ " movieId \n",
+ " rating \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 15 \n",
+ " 95875 \n",
+ " 1.5 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 664 \n",
+ " 3081 \n",
+ " 4.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 171 \n",
+ " 2770 \n",
+ " 4.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 355 \n",
+ " 589 \n",
+ " 4.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 505 \n",
+ " 86882 \n",
+ " 5.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "summary": "{\n \"name\": \"print(f\\\"\\\\nCreated user profiles using training data for {len(user_profiles_train)} users\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"userId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 258,\n \"min\": 15,\n \"max\": 664,\n \"num_unique_values\": 5,\n \"samples\": [\n 664,\n 505,\n 171\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"movieId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 48987,\n \"min\": 589,\n \"max\": 95875,\n \"num_unique_values\": 5,\n \"samples\": [\n 3081,\n 86882,\n 2770\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.3038404810405297,\n \"min\": 1.5,\n \"max\": 5.0,\n \"num_unique_values\": 3,\n \"samples\": [\n 1.5,\n 4.0,\n 5.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "Created user profiles using training data for 671 users.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "b79273f9",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 749
+ },
+ "outputId": "8700dc1c-0b91-4934-e169-58e7c0c71a40"
+ },
+ "source": [
+ "from sklearn.metrics import mean_squared_error\n",
+ "\n",
+ "# Define a range of alpha values to test\n",
+ "alpha_values = np.arange(0, 1.01, 0.5) # 0, 0.5, 1.0\n",
+ "\n",
+ "# Dictionary to store RMSE for each alpha\n",
+ "rmse_scores = {}\n",
+ "\n",
+ "# Iterate through alpha values and evaluate the hybrid model\n",
+ "for alpha in alpha_values:\n",
+ " print(f\"Evaluating hybrid model with alpha = {alpha:.2f}\")\n",
+ " predictions_hybrid = []\n",
+ " actual_ratings = []\n",
+ "\n",
+ " # Calculate hybrid predictions for each user-movie pair in the testset_df (validation set)\n",
+ " for index, row in testset_df.iterrows():\n",
+ " user_id = int(row['userId'])\n",
+ " movie_id = int(row['movieId'])\n",
+ " actual_rating = row['rating']\n",
+ "\n",
+ " # Get the hybrid predicted rating\n",
+ " predicted_rating = hybrid_prediction(\n",
+ " user_id,\n",
+ " movie_id,\n",
+ " alpha,\n",
+ " get_content_based_score,\n",
+ " knn_user_based, # Trained k-NN model\n",
+ " svd_mf, # Trained SVD model\n",
+ " merged_df_with_tfidf, # Original merged data for filtering seen movies (used by get_content_based_score)\n",
+ " unique_movies_reduced, # Unique movies with SVD features (used by get_content_based_score)\n",
+ " user_profiles_train # User profiles created from training data\n",
+ " )\n",
+ "\n",
+ " predictions_hybrid.append(predicted_rating)\n",
+ " actual_ratings.append(actual_rating)\n",
+ "\n",
+ " # Calculate RMSE for the current alpha\n",
+ " rmse = np.sqrt(mean_squared_error(actual_ratings, predictions_hybrid))\n",
+ " rmse_scores[alpha] = rmse\n",
+ " print(f\"RMSE for alpha = {alpha:.2f}: {rmse:.4f}\\n\")\n",
+ "\n",
+ "# Find the best alpha value (the one with the lowest RMSE)\n",
+ "best_alpha = min(rmse_scores, key=rmse_scores.get)\n",
+ "min_rmse = rmse_scores[best_alpha]\n",
+ "\n",
+ "print(f\"Best alpha value: {best_alpha:.2f} with RMSE: {min_rmse:.4f}\")\n",
+ "\n",
+ "# Optional: Plot RMSE vs. alpha to visualize the tuning process\n",
+ "plt.figure(figsize=(10, 6))\n",
+ "plt.plot(list(rmse_scores.keys()), list(rmse_scores.values()), marker='o')\n",
+ "plt.xlabel(\"Alpha (Weight for CF)\")\n",
+ "plt.ylabel(\"RMSE on Validation Set\")\n",
+ "plt.title(\"Hybrid Model RMSE vs. Alpha\")\n",
+ "plt.grid(True)\n",
+ "plt.show()"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Evaluating hybrid model with alpha = 0.00\n",
+ "RMSE for alpha = 0.00: 2.9537\n",
+ "\n",
+ "Evaluating hybrid model with alpha = 0.50\n",
+ "RMSE for alpha = 0.50: 1.7347\n",
+ "\n",
+ "Evaluating hybrid model with alpha = 1.00\n",
+ "RMSE for alpha = 1.00: 1.0638\n",
+ "\n",
+ "Best alpha value: 1.00 with RMSE: 1.0638\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1cAAAIjCAYAAADvBuGTAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAmDFJREFUeJzs3WdYFdf+9vHvpoNSRKUpxd57xd57TWJNLKnGWJKYcqJJLDEnppkYS8xJs6PGJJbYe0Gx914BC2AFBBQp+3mRR/6Hgxq2gkO5P9e1rzgza2bugWXkx6xZYzKbzWZERERERETkiVgZHUBERERERCQvUHElIiIiIiKSBVRciYiIiIiIZAEVVyIiIiIiIllAxZWIiIiIiEgWUHElIiIiIiKSBVRciYiIiIiIZAEVVyIiIiIiIllAxZWIiIiIiEgWUHElIpIDhYaGYjKZ+Prrr5/oOAMHDiQgICDT55s5c+YTnS87BQQEMHDgwMfa12QyMXbs2CzNI09u7NixmEymJ9r3+vXrWZxKROTxqbgSEXkCM2fOxGQysXfv3gdub9asGZUrV37KqbLP5s2bMZlMmEwm5s6d+8A2DRs2xGQy5brrvl9g3v9YWVnh7u5O+/btCQkJydD+/g/3VlZWXLx4McP22NhYHB0dMZlMDB06NN22a9eu8eabb1K+fHkcHR3x8PCgbt26/Otf/yIuLi6t3cCBA9Nl+u+Pg4ND1n8RslBKSgo+Pj6YTCZWrVpldBwRkafCxugAIiKSfX766SdSU1Oz/LgODg4EBQXxwgsvpFsfGhrKjh07cvwP/o/Sp08fOnToQEpKCqdPn+b777+nefPm7NmzhypVqmRob29vz/z583n//ffTrf/zzz8fePybN29Su3ZtYmNjeemllyhfvjw3btzg8OHDTJ8+ncGDB1OwYMF0x//5558zHMfa2voJrzR7bdy4kYiICAICApg3bx7t27c3OpKISLZTcSUikgfFx8dToEABbG1ts+X4HTp0YNmyZVy/fp0iRYqkrQ8KCsLT05MyZcpw69atbDl3dqtZs2a6orFx48a0b9+e6dOn8/3332do36FDhwcWV0FBQXTs2JE//vgj3fpffvmF8PBwtm/fToMGDdJti42Nxc7OLt06GxubDEVsbjB37lxq1qzJgAEDGDVqVFqfFBHJyzQsUETkKWratCnVqlV74LZy5crRtm3bDOu//fZb/P39cXR0pGnTphw9ejTd9oEDB1KwYEHOnTtHhw4dcHZ25vnnn0/b9r/PXEVHRzNw4EBcXV1xc3NjwIABREdHW3QdXbt2xd7enkWLFqVbHxQURM+ePR94VyU5OZnx48dTqlQp7O3tCQgIYNSoUSQmJqZrZzab+fTTTylevDhOTk40b96cY8eOPTBHdHQ0b731Fr6+vtjb21O6dGm++OKLLL1b17hxYwDOnTv3wO19+/bl4MGDnDx5Mm1dZGQkGzdupG/fvhnanzt3Dmtra+rXr59hm4uLS5bc9UtKSsLd3Z0XX3wxw7bY2FgcHBx4991309ZNmTKFSpUq4eTkRKFChahduzZBQUGPff47d+6wePFievfuTc+ePblz5w5Lly7N1L73h1HOmzePcuXK4eDgQK1atdi6desD29/vz25ubri6uvLiiy+SkJCQrs2MGTNo0aIFHh4e2NvbU7FiRaZPn/7Y1yci8jAqrkREskBMTAzXr1/P8ElKSkrXrl+/fhw+fDhDgbRnzx5Onz6d4Q7F7NmzmTx5MkOGDGHkyJEcPXqUFi1aEBUVla5dcnIybdu2xcPDg6+//ppnn332gTnNZjNdu3Zlzpw5vPDCC3z66adcunSJAQMGWHS9Tk5OdO3alfnz56etO3ToEMeOHXtgQQHwyiuvMHr0aGrWrMm3335L06ZNmTBhAr17907XbvTo0Xz88cdUq1aNr776ipIlS9KmTRvi4+PTtUtISKBp06bMnTuX/v37M3nyZBo2bMjI
kSMZMWKERdfzKKGhoQAUKlTogdubNGlC8eLF0xUjCxcupGDBgnTs2DFDe39/f1JSUpgzZ06mMzyob8XGxj60va2tLd27d2fJkiXcu3cv3bYlS5aQmJiY9nX/6aefGD58OBUrVmTSpEmMGzeO6tWrs2vXrkzn+1/Lli0jLi6O3r174+XlRbNmzZg3b16m99+yZQtvvfUWL7zwAp988gk3btygXbt2Gf7eAPTs2ZPbt28zYcIEevbsycyZMxk3bly6NtOnT8ff359Ro0YxceJEfH19eeONN5g2bdpjX6OIyAOZRUTksc2YMcMMPPJTqVKltPbR0dFmBwcH87/+9a90xxk+fLi5QIEC5ri4OLPZbDZfuHDBDJgdHR3Nly5dSmu3a9cuM2B+++2309YNGDDADJg/+OCDDPkGDBhg9vf3T1tesmSJGTB/+eWXaeuSk5PNjRs3NgPmGTNmPPJ6N23aZAbMixYtMi9fvtxsMpnM4eHhZrPZbH7vvffMJUuWNJvNZnPTpk3TXffBgwfNgPmVV15Jd7x3333XDJg3btxoNpvN5qtXr5rt7OzMHTt2NKempqa1GzVqlBkwDxgwIG3d+PHjzQUKFDCfPn063TE/+OADs7W1dVous9lsBsxjxox55LXd/5qPGzfOfO3aNXNkZKR527Zt5jp16qRd838bM2aMGTBfu3bN/O6775pLly6dtq1OnTrmF198Me3cQ4YMSdsWGRlpLlq0qBkwly9f3vz666+bg4KCzNHR0Rky3f/ePujTtm3bR17PmjVrzID5r7/+Sre+Q4cOad8ns9ls7tq1a7rvVVbo1KmTuWHDhmnLP/74o9nGxsZ89erVdO3ufw3/2/3r27t3b9q6sLAws4ODg7l79+4Z9n3ppZfS7d+9e3dz4cKF061LSEjIkLFt27bpvg4iIllBd65ERLLAtGnTWLduXYZP1apV07VzdXVNu+NjNpuBv2dVW7hwId26dcvwTEq3bt0oVqxY2nLdunWpV68eK1euzJBh8ODB/5hz5cqV2NjYpGtrbW3NsGHDLLpegDZt2uDu7s6CBQswm80sWLCAPn36PPS8QIY7Su+88w4AK1asAGD9+vXcu3ePYcOGpZui+6233spwzEWLFtG4cWMKFSqU7o5Oq1atSElJeegwsn8yZswYihYtipeXF40bN+bEiRNMnDiR55577qH79O3bl7Nnz7Jnz560/z7sDp6npyeHDh3i9ddf59atW/zwww/07dsXDw8Pxo8fn9Yv7nNwcHhg3/r8888feR0tWrSgSJEiLFy4MG3drVu3WLduHb169Upb5+bmxqVLl9izZ09mvjz/6MaNG6xZsyZdX3j22WcxmUz89ttvmTpGYGAgtWrVSlv28/Oja9eurFmzhpSUlHRtX3/99XTLjRs35saNG+nu7Dk6Oqb9+f5d5qZNm3L+/HliYmIsuj4RkUfRhBYiIlmgbt261K5dO8P6+z/4/7f+/fuzcOFCtm3bRpMmTVi/fj1RUVH069cvw/5lypTJsK5s2bIZfki1sbGhePHi/5gzLCwMb2/vdLPRwd/Pe1nK1taWHj16EBQURN26dbl48eJDC4qwsDCsrKwoXbp0uvVeXl64ubkRFhaW1g4yXnfRokUzDMs7c+YMhw8fpmjRog8859WrVy2+JoDXXnuNHj16cPfuXTZu3MjkyZMz/ED/v2rUqEH58uUJCgrCzc0NLy8vWrRo8dD23t7eaRNknDlzhjVr1vDFF18wevRovL29eeWVV9LaWltb06pVK4uvw8bGhmeffZagoCASExOxt7fnzz//JCkpKV1x9a9//Yv169dTt25dSpcuTZs2bejbty8NGza0+Jzw95DIpKQkatSowdmzZ9PW16tXj3nz5jFkyJB/PMbD+n1CQgLXrl3Dy8srbb2fn1+6dvf7ya1bt3BxcQFg+/btjBkzhpCQkAzPY8XExODq6pr5CxQReQQVVyIiT1nbtm3x9PRk7ty5NGnShLlz5+Ll
5fVYP0DfZ29vj5XV0x+M0LdvX3744QfGjh1LtWrVqFix4iPbP+4LYx8kNTWV1q1bZ5il776yZcs+1nHLlCmT9r3o1KkT1tbWfPDBBzRv3vyBBfR9ffv2Zfr06Tg7O9OrV69MfT9MJhNly5albNmydOzYkTJlyjBv3rx0xdWT6N27N//5z39YtWoV3bp147fffqN8+fLpJlWpUKECp06dYvny5axevZo//viD77//ntGjR2d4dikz7j9b9bDi7Pz585QsWfLxLugBHjYl/f07gOfOnaNly5aUL1+eb775Bl9fX+zs7Fi5ciXffvtttryqQETyLw0LFBF5yqytrenbty+///47t27dYsmSJfTp0+eBPySeOXMmw7rTp09nmAEws/z9/YmIiEj3olqAU6dOPdbxGjVqhJ+fH5s3b37oXav7501NTc1wPVFRUURHR+Pv75/WDjJe97Vr1zJM7V6qVCni4uJo1arVAz//e0fjcX344Yc4Ozvz0UcfPbJd3759iYiI4PTp04/8WjxMyZIlKVSoEBEREY8bNYMmTZrg7e3NwoULuX79Ohs3bkx31+q+AgUK0KtXL2bMmEF4eDgdO3bk3//+N3fv3rXofBcuXGDHjh0MHTqURYsWpfssXLgQOzu7TM1C+LB+7+Tk9NA7lQ/z119/kZiYyLJlyxg0aBAdOnSgVatW6YYKiohkFRVXIiIG6NevH7du3WLQoEHExcU99D1GS5Ys4fLly2nLu3fvZteuXY/9QtYOHTqQnJycbhrqlJQUpkyZ8ljHM5lMTJ48mTFjxjxwWON/nxdg0qRJ6dZ/8803AGmz6rVq1QpbW1umTJmS7tmj/90P/p4lLiQkhDVr1mTYFh0dTXJysqWX80Bubm4MGjSINWvWcPDgwYe2K1WqFJMmTWLChAnUrVv3oe127dqVYeZD+Pt7e+PGjccaovkwVlZWPPfcc/z111/MmTOH5OTkDMXVjRs30i3b2dlRsWJFzGZz2myXCQkJnDx5MsMQ1/91/67V+++/z3PPPZfu07NnT5o2bZqpWQNDQkLYv39/2vLFixdZunQpbdq0sfjlyffb/3d/iomJYcaMGRYdR0QkMzQsUETEADVq1KBy5cosWrSIChUqULNmzQe2K126NI0aNWLw4MEkJiYyadIkChcu/NChcP+kc+fONGzYkA8++IDQ0FAqVqzIn3/++UQP9Xft2pWuXbs+sk21atUYMGAAP/74I9HR0TRt2pTdu3cza9YsunXrRvPmzYG/n6169913mTBhAp06daJDhw4cOHCAVatWpXtZMcB7773HsmXL6NSpEwMHDqRWrVrEx8dz5MgRfv/9d0JDQzPs87jefPNNJk2axOeff86CBQse2e6fzJkzh3nz5tG9e3dq1aqFnZ0dJ06c4Ndff8XBwYFRo0ala5+cnMzcuXMfeKzu3bv/44t5e/XqxZQpUxgzZgxVqlShQoUK6ba3adMGLy8vGjZsiKenJydOnGDq1Kl07NgRZ2dn4O/Cr3nz5owZM4axY8c+9Fzz5s2jevXq+Pr6PnB7ly5dGDZsGPv3739onweoXLkybdu2Zfjw4djb26e9vPlxhim2adMGOzs7OnfunPbLjJ9++gkPD48svUsoIgIqrkREDNO/f3/ef//9R97x6d+/P1ZWVkyaNImrV69St25dpk6dire392Od08rKimXLlvHWW28xd+5cTCYTXbp0YeLEidSoUeNxLyVTfv75Z0qWLMnMmTNZvHgxXl5ejBw5kjFjxqRr9+mnn+Lg4MAPP/zApk2bqFevHmvXrs3wzignJye2bNnCZ599xqJFi5g9ezYuLi6ULVuWcePGZekkBT4+PvTt25c5c+Zw7tw5SpUq9djHGjRoEE5OTmzYsIGlS5cSGxtL0aJFadOmDSNHjszwfUhMTHxoH7lw4cI/FlcNGjTA19eXixcvPnBI4KBBg5g3bx7ffPMNcXFxFC9enOHDh//jMMj/tX//fk6ePMnHH3/8
0DadO3dm2LBhzJ0795HFVdOmTQkMDGTcuHGEh4dTsWJFZs6cmWH2zcwoV64cv//+Ox999BHvvvsuXl5eDB48mKJFi/LSSy9ZfDwRkUcxmf93zlcREXkqvvvuO95++21CQ0Oz7PkgkdzOZDIxZMgQpk6danQUERGL6ZkrEREDmM1mfvnlF5o2barCSkREJI/QsEARkacoPj6eZcuWsWnTJo4cOcLSpUuNjiQiIiJZRMWViMhTdO3aNfr27YubmxujRo2iS5cuRkcSERGRLKJnrkRERERERLKAnrkSERERERHJAiquREREREREsoCeuXqA1NRUrly5grOzMyaTyeg4IiIiIiJiELPZzO3bt/Hx8cHK6tH3plRcPcCVK1ce+nZ5ERERERHJfy5evEjx4sUf2UbF1QM4OzsDf38BXVxcDM2SlJTE2rVradOmDba2toZmkdxBfUYspT4jllKfEUupz4ilclKfiY2NxdfXN61GeBQVVw9wfyigi4tLjiiunJyccHFxMbxjSe6gPiOWUp8RS6nPiKXUZ8RSObHPZOZxIU1oISIiIiIikgVUXImIiIiIiGQBFVciIiIiIiJZQMWViIiIiIhIFjC0uJo+fTpVq1ZNmzgiMDCQVatWPXKfRYsWUb58eRwcHKhSpQorV65Mt91sNjN69Gi8vb1xdHSkVatWnDlzJjsvQ0RERERExNjiqnjx4nz++efs27ePvXv30qJFC7p27cqxY8ce2H7Hjh306dOHl19+mQMHDtCtWze6devG0aNH09p8+eWXTJ48mR9++IFdu3ZRoEAB2rZty927d5/WZYmIiIiISD5k6FTsnTt3Trf873//m+nTp7Nz504qVaqUof13331Hu3bteO+99wAYP34869atY+rUqfzwww+YzWYmTZrERx99RNeuXQGYPXs2np6eLFmyhN69ez8wR2JiIomJiWnLsbGxwN9TQCYlJWXJtT6u++c3OofkHuozYin1GbGU+oxYSn1GLJWT+owlGXLMe65SUlJYtGgR8fHxBAYGPrBNSEgII0aMSLeubdu2LFmyBIALFy4QGRlJq1at0ra7urpSr149QkJCHlpcTZgwgXHjxmVYv3btWpycnB7zirLWunXrjI4guYz6jFhKfUYspT4jllKfEUvlhD6TkJCQ6baGF1dHjhwhMDCQu3fvUrBgQRYvXkzFihUf2DYyMhJPT8906zw9PYmMjEzbfn/dw9o8yMiRI9MVbfffwtymTZsc8RLhdevW0bp16xzzAjXJ2dRnxFLqM2Ip9RmxlPqMWCon9Zn7o9oyw/Diqly5chw8eJCYmBh+//13BgwYwJYtWx5aYGUHe3t77O3tM6y3tbU1/Jt5X07KIrmD+oxYSn1GLKU+I5ZSnxFL5YQ+Y8n5DZ+K3c7OjtKlS1OrVi0mTJhAtWrV+O677x7Y1svLi6ioqHTroqKi8PLyStt+f93D2oiIiIiIiGQHw4ur/5Wamppucon/FhgYyIYNG9KtW7duXdozWiVKlMDLyytdm9jYWHbt2vXQ57hERERERESygqHDAkeOHEn79u3x8/Pj9u3bBAUFsXnzZtasWQNA//79KVasGBMmTADgzTffpGnTpkycOJGOHTuyYMEC9u7dy48//giAyWTirbfe4tNPP6VMmTKUKFGCjz/+GB8fH7p162bUZYqIiIiISD5gaHF19epV+vfvT0REBK6urlStWpU1a9bQunVrAMLDw7Gy+r+baw0aNCAoKIiPPvqIUaNGUaZMGZYsWULlypXT2rz//vvEx8fz2muvER0dTaNGjVi9ejUODg5P/fqeVEqqmV0XbrLvuonCF24SWNoDayuT0bFEREREROQBDC2ufvnll0du37x5c4Z1PXr0oEePHg/dx2Qy8cknn/DJJ588aTxDrT4awbi/jhMRcxewZvaZvXi7OjCmc0XaVfY2Op6IiIiIiPyPHPfMlfxdWA2eu///F1b/JzLmLoPn7mf10QiDkomIiIiIyMOo
uMphUlLNjPvrOOYHbLu/btxfx0lJfVALERERERExioqrHGb3hZsZ7lj9NzMQEXOX3RduPr1QIiIiIiLyj1Rc5TBXbz+8sHqcdiIiIiIi8nSouMphPJwzN6thZtuJiIiIiMjToeIqh6lbwh1vVwf+acL1NcciSUpJfSqZRERERETkn6m4ymGsrUyM6VwRIEOB9d/LM3eE0vvHnUTE3Hlq2URERERE5OFUXOVA7Sp7M/2Fmni5ph/65+XqwA8v1OTHfrVwdrBhX9gtOk0OJvjMdYOSioiIiIjIfYa+RFgerl1lb1pX9CLk7FXWbttFm8b1CCztgbXV3/evlns5M3jufo5HxNLv112MaFWWIc1LY2X1TwMKRUREREQkO+jOVQ5mbWWiXgl3ahUxU6+Ee1phBeBfuAB/vtGAXrV9MZth4rrTvDxrD9EJ9wxMLCIiIiKSf6m4ysUcbK354rmqfPlsVextrNh06hodJwdz+FK00dFERERERPIdFVd5QM86vvz5RgP8CztxOfoOz00PYe7OMMxms9HRRERERETyDRVXeUQlH1eWDW1Em4qe3EtJ5aMlRxnx2yES7iUbHU1EREREJF9QcZWHuDra8p9+tRjZvjzWViYWH7hMt2nbOXctzuhoIiIiIiJ5noqrPMZkMjGoaSmCXqlHUWd7TkfF0WVKMCsORxgdTUREREQkT1NxlUfVK1mYFcMbUa+EO/H3UhgStJ9xfx3jXnKq0dFERERERPIkFVd5mIezA/NeqcfrTUsBMGN7KL1/DCEi5o7ByURERERE8h4VV3mcjbUVH7Qvz0/9a+PsYMP+8Gg6Tg4m+Mx1o6OJiIiIiOQpKq7yidYVPVk+rBEVvV24GX+Pfr/uYvKGM6Smarp2EREREZGsoOIqH/EvXIA/32hAr9q+mM3wzbrTvDRrD7fi7xkdTUREREQk11Nxlc842FrzxXNV+fK5qtjbWLH51DU6TQnm8KVoo6OJiIiIiORqKq7yqZ61ffnzjQb4F3bicvQdnpsewpydYZjNGiYoIiIiIvI4VFzlY5V8XPlrWCPaVPTkXkoqHy85ytsLD5JwL9noaCIiIiIiuY6Kq3zOxcGW//SrxagO5bG2MrHk4BW6TdvOuWtxRkcTEREREclVVFwJJpOJ15qUIuiVehR1tud0VBxdpgSz/PAVo6OJiIiIiOQaKq4kTb2ShVkxvBH1S7oTfy+FoUEHGLvsGPeSU42OJiIiIiKS46m4knQ8nB2Y+3I9BjcrBcDMHaH0/jGEiJg7BicTEREREcnZVFxJBjbWVvyrXXl+6l8bZwcb9odH03FyMNvOXDM6moiIiIhIjqXiSh6qdUVPVgxrTCUfF27G36P/r7uZvOEMqamarl1ERERE5H+puJJH8ivsxB+DG9C7ji9mM3yz7jQvztzDrfh7RkcTEREREclRVFzJP3KwtebzZ6vy1XNVsbexYsvpa3SaEsyhi9FGRxMRERERyTFUXEmm9ajty+I3GhJQ2InL0Xfo8UMIc3aGYTZrmKCIiIiIiIorsUhFHxeWDWtE20qe3EtJ5eMlR3l74UES7iUbHU1ERERExFAqrsRiLg62/PBCLT7sUAFrKxNLDl6h69TtnL0aZ3Q0ERERERHDqLiSx2IymXi1SUnmv1ofD2d7zlyNo+vUYJYfvmJ0NBERERERQ6i4kidSt4Q7y4c3on5Jd+LvpTA06ABjlx3jXnKq0dFERERERJ4qFVfyxDycHZj7cj3eaFYKgJk7Qun1YwhXou8YnExERERE5OlRcSVZwsbaivfblefn/rVxcbDhQHg0naYEs+3MNaOjiYiIiIg8FSquJEu1qujJ8mGNqVzMhZvx9+j/626+W3+G1FRN1y4iIiIieZuKK8lyfoWd+P31BvSp64vZDN+uP83AmXu4GX/P6GgiIiIiItlGxZVkCwdbayY8U5Wve1TD3saKraev0XlKMAcvRhsdTUREREQkW6i4kmz1XK3iLBnSkIDCTlyOvkOP
H3YwJyQUs1nDBEVEREQkb1FxJdmugrcLy4Y1ol0lL5JSzHy89BhvLTxIfGKy0dFERERERLKMiit5KlwcbJn+Qk0+6lgBaysTSw9eodu07Zy9etvoaCIiIiIiWcLQ4mrChAnUqVMHZ2dnPDw86NatG6dOnXrkPs2aNcNkMmX4dOzYMa3NwIEDM2xv165ddl+O/AOTycQrjUuy4LX6eDjbc+ZqHF2mbuevQ1eMjiYiIiIi8sQMLa62bNnCkCFD2LlzJ+vWrSMpKYk2bdoQHx//0H3+/PNPIiIi0j5Hjx7F2tqaHj16pGvXrl27dO3mz5+f3ZcjmVQnwJ0VwxsTWLIwCfdSGDb/AGOXHeNecqrR0UREREREHpuNkSdfvXp1uuWZM2fi4eHBvn37aNKkyQP3cXd3T7e8YMECnJycMhRX9vb2eHl5ZW1gyTJFne2Z83Jdvll3mu83n2PmjlAOXozm++dr4uPmaHQ8ERERERGLGVpc/a+YmBggYwH1KL/88gu9e/emQIEC6dZv3rwZDw8PChUqRIsWLfj0008pXLjwA4+RmJhIYmJi2nJsbCwASUlJJCUlWXoZWer++Y3OkV3eblmKasVdeO/3Ixy8GE3HyduY2KMKjUsXMTparpXX+4xkPfUZsZT6jFhKfUYslZP6jCUZTOYcMid2amoqXbp0ITo6muDg4Ezts3v3burVq8euXbuoW7du2vr7d7NKlCjBuXPnGDVqFAULFiQkJARra+sMxxk7dizjxo3LsD4oKAgnJ6fHvyjJtBt34dfT1lyKN2HCTNviqbQtbsbKZHQyEREREcnPEhIS6Nu3LzExMbi4uDyybY4prgYPHsyqVasIDg6mePHimdpn0KBBhISEcPjw4Ue2O3/+PKVKlWL9+vW0bNkyw/YH3bny9fXl+vXr//gFzG5JSUmsW7eO1q1bY2tra2iW7JaYlML4lSdZuPcyAI1LF+br56rgXsDO4GS5S37qM5I11GfEUuozYin1GbFUTuozsbGxFClSJFPFVY4YFjh06FCWL1/O1q1bM11YxcfHs2DBAj755JN/bFuyZEmKFCnC2bNnH1hc2dvbY29vn2G9ra2t4d/M+3JSluxia2vLF89Vp06JIny05Ajbzt6g+/SdfP9CLar7uhkdL9fJD31Gspb6jFhKfUYspT4jlsoJfcaS8xs6W6DZbGbo0KEsXryYjRs3UqJEiUzvu2jRIhITE3nhhRf+se2lS5e4ceMG3t7eTxJXnpLnahVn8RsNKVGkAFdi7tLjhx3MDgklh9xkFRERERF5IEOLqyFDhjB37lyCgoJwdnYmMjKSyMhI7ty5k9amf//+jBw5MsO+v/zyC926dcswSUVcXBzvvfceO3fuJDQ0lA0bNtC1a1dKly5N27Zts/2aJGtU8HZh6dCGtKvkRVKKmdFLj/HmgoPEJyYbHU1ERERE5IEMLa6mT59OTEwMzZo1w9vbO+2zcOHCtDbh4eFERESk2+/UqVMEBwfz8ssvZzimtbU1hw8fpkuXLpQtW5aXX36ZWrVqsW3btgcO/ZOcy8XBlukv1OSjjhWwtjKx7NAVuk7bztmrt42OJiIiIiKSgaHPXGVmmNfmzZszrCtXrtxD93V0dGTNmjVPGk1yCJPJxCuNS1LN142hQfs5ezWOLlO38/mzVelSzcfoeCIiIiIiaQy9cyWSWXUC3Fk+rDENShUm4V4Kw+cfYMzSo9xLTjU6moiIiIgIoOJKcpGizvbMebkeQ5qXAmBWSBg9/xPC5eg7/7CniIiIiEj2U3EluYq1lYn32pbnlwG1cXGw4eDFaDpN3saW09eMjiYiIiIi+ZyKK8mVWlbwZMXwxlQu5sKthCQGztjNt+tOk5Kq6dpFRERExBgqriTX8nV34vfXG9Cnrh9mM3y34QwDZ+zmZvw9o6OJiIiISD6k4kpyNQdbayY8U4WJParhYGvFtjPX6TR5GwfCbxkdTURERETyGRVXkic8W6s4S4Y0pESRAlyJ
uUvP/4Qwa0dopqb7FxERERHJCiquJM8o7+XCsqENaV/Zi6QUM2OWHWP4goPEJyYbHU1ERERE8gEVV5KnODvY8v3zNfmoYwVsrEz8degKXadt5+zV20ZHExEREZE8TsWV5Dkmk4lXGpdkwWv18XSx5+zVOLpM3c7Sg5eNjiYiIiIieZiKK8mzage4s2J4YxqUKkzCvRTeXHCQ0UuPkpicYnQ0EREREcmDVFxJnlakoD1zXq7H0OalAZgdEkbP/+zkcvQdg5OJiIiISF6j4kryPGsrE++2LcevA2vj6mjLoYvRdJq8jS2nrxkdTURERETyEBVXkm+0KO/J8mGNqFLMlVsJSQycsZtv1p0mJVXTtYuIiIjIk1NxJfmKr7sTi14PpG89P8xmmLzhDANn7OZm/D2jo4mIiIhILqfiSvIdB1trPutehW96VsPB1optZ67TcfI29offMjqaiIiIiORiKq4k33qmZnGWDGlIiSIFiIi5S6//hDBz+wXMZg0TFBERERHLqbiSfK28lwvLhjakQxUvklLMjP3rOMMXHCQ+MdnoaCIiIiKSy6i4knzP2cGWaX1r8nGnithYmfjr0BW6TA3mTNRto6OJiIiISC6i4koEMJlMvNyoBAteq4+niz3nrsXTddp2lh68bHQ0EREREcklVFyJ/JfaAe6sGN6YhqULk3AvhTcXHOTjJUdJTE4xOpqIiIiI5HAqrkT+R5GC9sx+qR7DWpQGYM7OMHr+ZyeXbiUYnExEREREcjIVVyIPYG1l4p025ZgxsA6ujrYcuhhNpynBbD511ehoIiIiIpJDqbgSeYTm5T1YPqwRVYq5Ep2QxIsz9/DNutOkpGq6dhERERFJT8WVyD/wdXdi0euBPF/PD7MZJm84w8AZu7kRl2h0NBERERHJQVRciWSCg601/+5ehW96VsPB1optZ67TaUow+8NvGR1NRERERHIIFVciFnimZnGWDmlEySIFiIi5S6//hDBz+wXMZg0TFBEREcnvVFyJWKiclzNLhzakQxUvklLMjP3rOMPmHyAuMdnoaCIiIiJiIBVXIo/B2cGWaX1rMrpTRWysTCw/HEHXqcGcjrptdDQRERERMYiKK5HHZDKZeKlRCRYOqo+XiwPnrsXTdep2lh68bHQ0ERERETGAiiuRJ1TL353lwxvRsHRh7iSl8OaCg3y85CiJySlGRxMRERGRp0jFlUgWKFLQntkv1WNYi9IAzNkZRs8fQrh0K8HgZCIiIiLytKi4Eski1lYm3mlTjhkv1sHNyZZDl2LoNCWYTaeuGh1NRERERJ4CFVciWax5OQ/+GtqIqsVdiU5I4qWZe/hm7SlSUjVdu4iIiEhepuJKJBv4ujux6PVAnq/nh9kMkzeeZeCM3dyISzQ6moiIiIhkExVXItnE3saaf3evwre9quFoa822M9fpNCWYfWG3jI4mIiIiItlAxZVINuteozhLhjSkZNECRMTcpdd/Qpix/QJms4YJioiIiOQlKq5EnoJyXs4sG9qIjlW8SU41M+6v4wydf4C4xGSjo4mIiIhIFlFxJfKUFLS3YWrfGozuVBEbKxMrDkfQZWowp6NuGx1NRERERLKAiiuRp8hkMvFSoxIsHBSIl4sD56/F03XqdpYcuGx0NBERERF5QiquRAxQy78QK4Y3olHpItxJSuGthQf5aMkREpNTjI4mIiIiIo9JxZWIQQoXtGfWS3UZ3qI0AHN3htPzhxAu3UowOJmIiIiIPA4VVyIGsrYyMaJNOWa8WAc3J1sOXYqh4+RgNp28anQ0EREREbGQiiuRHKB5OQ+WD2tE1eKuxNxJ4sWZe5i49hQpqZquXURERCS3UHElkkMUL+TEotcDeaG+HwBTNp5lwK+7uRGXaHAyEREREckMQ4urCRMmUKdOHZydnfHw8KBbt26cOnXqkfvMnDkTk8mU7uPg4JCujdlsZvTo0Xh7e+Po6EirVq04c+ZMdl6KSJawt7Hm025VmNSrOo621gSf
vU7HycHsC7tldDQRERER+QeGFldbtmxhyJAh7Ny5k3Xr1pGUlESbNm2Ij49/5H4uLi5ERESkfcLCwtJt//LLL5k8eTI//PADu3btokCBArRt25a7d+9m5+WIZJluNYqxdGhDShYtQGTsXXr9J4Rfgy9gNmuYoIiIiEhOZWPkyVevXp1ueebMmXh4eLBv3z6aNGny0P1MJhNeXl4P3GY2m5k0aRIfffQRXbt2BWD27Nl4enqyZMkSevfunXUXIJKNyno6s2xoI/71x2FWHI7gk+XH2Rd+iy+erUpBe0P/6oqIiIjIA+Son9BiYmIAcHd3f2S7uLg4/P39SU1NpWbNmnz22WdUqlQJgAsXLhAZGUmrVq3S2ru6ulKvXj1CQkIeWFwlJiaSmPh/z7XExsYCkJSURFJS0hNf15O4f36jc4gx7K3g2+cqU6O4C5+vPs2KwxGcuBLD1N7VKeNZ8IH7qM+IpdRnxFLqM2Ip9RmxVE7qM5ZkMJlzyDij1NRUunTpQnR0NMHBwQ9tFxISwpkzZ6hatSoxMTF8/fXXbN26lWPHjlG8eHF27NhBw4YNuXLlCt7e3mn79ezZE5PJxMKFCzMcc+zYsYwbNy7D+qCgIJycnLLmAkWe0IXbMPO0NdH3TNhZmelZMpU6RXPEX18RERGRPCshIYG+ffsSExODi4vLI9vmmOJq8ODBrFq1iuDgYIoXL57p/ZKSkqhQoQJ9+vRh/Pjxj1VcPejOla+vL9evX//HL2B2S0pKYt26dbRu3RpbW1tDs4jxbsTf451FR9h+7gYAfeoU58MO5bG3+b/HJ9VnxFLqM2Ip9RmxlPqMWCon9ZnY2FiKFCmSqeIqRwwLHDp0KMuXL2fr1q0WFVYAtra21KhRg7NnzwKkPYsVFRWVrriKioqievXqDzyGvb099vb2Dzy20d/M+3JSFjGOl5sts1+ux3cbzjBl4xnm77nEsYjbTOtbE1/39HdZ1WfEUuozYin1GbGU+oxYKif0GUvOb+hsgWazmaFDh7J48WI2btxIiRIlLD5GSkoKR44cSSukSpQogZeXFxs2bEhrExsby65duwgMDMyy7CJGsbYyMaJ1WWYMrIObky2HL8XQaUowm05eNTqaiIiISL5maHE1ZMgQ5s6dS1BQEM7OzkRGRhIZGcmdO3fS2vTv35+RI0emLX/yySesXbuW8+fPs3//fl544QXCwsJ45ZVXgL9nEnzrrbf49NNPWbZsGUeOHKF///74+PjQrVu3p32JItmmWTkPlg9rRLXirsTcSeLFmXv4es0pUlJzxEhfERERkXzH0GGB06dPB6BZs2bp1s+YMYOBAwcCEB4ejpXV/9WAt27d4tVXXyUyMpJChQpRq1YtduzYQcWKFdPavP/++8THx/Paa68RHR1No0aNWL16dYaXDYvkdsULOfHb64F8uvwEc3aGMXXTWfaH3aTDoyfcFBEREZFsYGhxlZm5NDZv3pxu+dtvv+Xbb7995D4mk4lPPvmETz755EniieQK9jbWjO9WmdoBhfjgjyPsOH+TY5esKVMjmnqlihodT0RERCTfMHRYoIhkna7Vi7F0aENKFnEi5p6J53/Zwy/BFzL1SwwREREReXIqrkTykLKezvzxen1qFE4lOdXM+OXHGRp0gNt3jX8Bn4iIiEhep+JKJI8paG/DgDKpfNyxPLbWJlYciaDr1O2cirxtdDQRERGRPE3FlUgeZDJB//p+LBwUiLerA+evx9Nt2nYWH7hkdDQRERGRPEvFlUgeVtOvEMuHNaJxmSLcSUrh7YWHGLX4CHeTUoyOJiIiIpLnqLgSyeMKF7Rn5ot1ebNlGUwmCNoVTo8fQrh4M8HoaCIiIiJ5ioorkXzA2srE263LMmNgHdycbDlyOYZOU4LZeDLK6GgiIiIieYaKK5F8pFk5D1YMb0w1Xzdi7iTx0sy9fL3mFCmpmq5dRERE5EmpuBLJZ4q5OfLboPr0D/QHYOqms/T7ZRfX4xINTiYi
IiKSu6m4EsmH7G2s+aRrZb7rXR1HW2t2nLtBx8nb2Bt60+hoIiIiIrmWiiuRfKxr9WIsG9qQUkULEBWbSO8fd/JL8AXMZg0TFBEREbGUiiuRfK6MpzNLhzaiU1VvklPNjF9+nCFB+7l9N8noaCIiIiK5ioorEaGgvQ1T+tRgXJdK2FqbWHkkkq5Tt3MyMtboaCIiIiK5hoorEQHAZDIxoEEACwcF4uPqwPnr8XSbtp0/918yOpqIiIhIrqDiSkTSqelXiOXDG9O4TBHuJqUy4rdDjFp8hLtJKUZHExEREcnRVFyJSAbuBeyY+WJd3mxZBpMJgnaF0+OHEC7eTDA6moiIiEiOpeJKRB7I2srE263LMvPFuhRysuXI5Rg6TQlm48koo6OJiIiI5EgqrkTkkZqWLcry4Y2p5utGzJ0kXpq5l6/WnCQlVdO1i4iIiPw3FVci8o+KuTny26D69A/0B2DapnP0+2UX124nGpxMREREJOdQcSUimWJvY80nXSvzXe/qONlZs+PcDTpN2cbe0JtGRxMRERHJEVRciYhFulYvxtIhDSntUZCo2ER6/7iTn7edx2zWMEERERHJ31RciYjFyng6s3RIQzpX8yE51cynK07wxrz93L6bZHQ0EREREcOouBKRx1LA3obJvaszrkslbK1NrDoaSZep2zkZGWt0NBERERFDWFxctWjRgujo6AzrY2NjadGiRVZkEpFcwmQyMaBBAL8NCsTH1YEL1+PpNm07f+y7ZHQ0ERERkafO4uJq8+bN3Lt3L8P6u3fvsm3btiwJJSK5Sw2/Qiwf3pgmZYtyNymVdxYdYuSfR7iblGJ0NBEREZGnxiazDQ8fPpz25+PHjxMZGZm2nJKSwurVqylWrFjWphORXMO9gB0zBtZhysYzfLfhDPN3h3PkcjTTn6+Fr7uT0fFEREREsl2mi6vq1atjMpkwmUwPHP7n6OjIlClTsjSciOQu1lYm3mpVlhp+hXhrwQGOXo6l4+RtfNurOi0reBodT0RERCRbZbq4unDhAmazmZIlS7J7926KFi2ats3Ozg4PDw+sra2zJaSI5C5NyxZl+fDGDJm3n4MXo3l51l7eaFaKEa3LYmOteXREREQkb8p0ceXv7w9AampqtoURkbyjmJsjvw0K5N8rjjMrJIzvN5/jQHg0k/vUoKizvdHxRERERLLcY/0Kec6cOTRs2BAfHx/CwsIA+Pbbb1m6dGmWhhOR3M3OxopxXSszuU8NnOysCTl/g46Tt7En9KbR0URERESynMXF1fTp0xkxYgQdOnQgOjqalJS/ZwMrVKgQkyZNyup8IpIHdKnmw7KhDSntUZCrtxPp/eNOft52HrPZbHQ0ERERkSxjcXE1ZcoUfvrpJz788MN0z1jVrl2bI0eOZGk4Eck7Sns4s3RIQ7pU8yEl1cynK07wxrz93L6bZHQ0ERERkSxhcXF14cIFatSokWG9vb098fHxWRJKRPKmAvY2fNe7Op90rYSttYlVRyPpMnU7JyNjjY4mIiIi8sQsLq5KlCjBwYMHM6xfvXo1FSpUyIpMIpKHmUwm+gcG8NugQHxcHbhwPZ5u07bz+75LRkcTEREReSKZni3wvhEjRjBkyBDu3r2L2Wxm9+7dzJ8/nwkTJvDzzz9nR0YRyYNq+BVixfDGvLXwIFtOX+PdRYfYF3aTMZ0r4WCr1zqIiIhI7mNxcfXKK6/g6OjIRx99REJCAn379sXHx4fvvvuO3r17Z0dGEcmjChWwY8bAOkzZeJZJG04zf/dFjlyOYfrztfB1dzI6noiIiIhFHmsq9ueff54zZ84QFxdHZGQkly5d4uWXX87qbCKSD1hZmXizVRlmvViXQk62HL0cS8fJ21h/PMroaCIiIiIWeazi6j4nJydOnDjBqlWruHXrVlZlEpF8qEnZoqwY3pjqvm7E3k3mldl7+WL1SZJT9OJyERERyR0yXVx98cUXfPzxx2nLZrOZdu3a0bx5czp27EiFChU4
duxYtoQUkfzBx82R3wYFMrBBAADTN5+j3y+7uXY70dhgIiIiIpmQ6eJq4cKFVK5cOW35999/Z+vWrWzbto3r169Tu3Ztxo0bly0hRST/sLOxYmyXSkzuUwMnO2tCzt+g4+Rt7L5w0+hoIiIiIo+U6eLqwoULVK1aNW155cqVPPfcczRs2BB3d3c++ugjQkJCsiWkiOQ/Xar5sGxoQ0p7FOTq7UT6/LSTn7aex2w2Gx1NRERE5IEyXVwlJydjb2+fthwSEkKDBg3Sln18fLh+/XrWphORfK20hzNLhzSka3UfUlLN/HvlCQbP3U/s3SSjo4mIiIhkkOniqlSpUmzduhWA8PBwTp8+TZMmTdK2X7p0icKFC2d9QhHJ1wrY2zCpV3XGd62ErbWJ1cci6TIlmBMRsUZHExEREUkn08XVkCFDGDp0KC+//DLt27cnMDCQihUrpm3fuHEjNWrUyJaQIpK/mUwm+gUGsOj1BhRzcyT0RgLdv9/O7/suGR1NREREJE2mi6tXX32VyZMnc/PmTZo0acIff/yRbvuVK1d46aWXLDr5hAkTqFOnDs7Oznh4eNCtWzdOnTr1yH1++uknGjduTKFChShUqBCtWrVi9+7d6doMHDgQk8mU7tOuXTuLsolIzlPd143lwxrRtGxR7ial8u6iQ3zwx2HuJqUYHU1ERETEsvdcvfTSSyxevJjp06fj5eWVbtv3339P9+7dLTr5li1bGDJkCDt37mTdunUkJSXRpk0b4uPjH7rP5s2b6dOnD5s2bSIkJARfX1/atGnD5cuX07Vr164dERERaZ/58+dblE1EcqZCBeyYMbAOI1qXxWSCBXsu8uz0HYTfSDA6moiIiORzNkaefPXq1emWZ86ciYeHB/v27Uv3PNd/mzdvXrrln3/+mT/++IMNGzbQv3//tPX29vYZCkARyRusrEwMb1mGGn5uvLngIMeuxNJpyja+6VmdVhU9jY4nIiIi+ZShxdX/iomJAcDd3T3T+yQkJJCUlJRhn82bN+Ph4UGhQoVo0aIFn3766UMn3EhMTCQx8f9eUhob+/eD8klJSSQlGTsr2f3zG51Dco/81GfqB7ixZHB9hi88xMGLMbwyey+DGpfgrZalsLG26MZ8vpaf+oxkDfUZsZT6jFgqJ/UZSzKYzDnkpTGpqal06dKF6OhogoODM73fG2+8wZo1azh27BgODg4ALFiwACcnJ0qUKMG5c+cYNWoUBQsWJCQkBGtr6wzHGDt27ANfgBwUFISTk9PjX5SIPBXJqbA0zIqtkX8XVKVdUhlQJhUXO4ODiYiISK6XkJBA3759iYmJwcXF5ZFtc0xxNXjwYFatWkVwcDDFixfP1D6ff/45X375JZs3b073guP/df78eUqVKsX69etp2bJlhu0PunPl6+vL9evX//ELmN2SkpJYt24drVu3xtbW1tAskjvk5z6z4kgkHy45Rvy9FDyc7ZnUsyp1AgoZHSvHy899Rh6P+oxYSn1GLJWT+kxsbCxFihTJVHGVI4YFDh06lOXLl7N169ZMF1Zff/01n3/+OevXr39kYQVQsmRJihQpwtmzZx9YXNnb26d7QfJ9tra2hn8z78tJWSR3yI99pltNXyoXL8Tgufs4czWOfjP28q925Xi1cUlMJpPR8XK8/Nhn5Mmoz4il1GfEUjmhz1hyfosfSoiPj+fjjz+mQYMGlC5dmpIlS6b7WMJsNjN06FAWL17Mxo0bKVGiRKb2+/LLLxk/fjyrV6+mdu3a/9j+0qVL3LhxA29vb4vyiUjuU9qjIEuGNKRrdR9SUs18tvIkr8/dR+xd48dsi4iISN5m8Z2rV155hS1bttCvXz+8vb2f6LfBQ4YMISgoiKVLl+Ls7ExkZCQArq6uODo6AtC/f3+KFSvGhAkTAPjiiy8YPXo0QUFBBAQEpO1TsGBBChYsSFxcHOPGjePZZ5/Fy8uLc+fO8f7771O6dGnatm372FlFJPcoYG/DpF7VqR3gzvi/
jrPmWBSnIoP5/vlaVPQxdqiviIiI5F0WF1erVq1ixYoVNGzY8IlPPn36dACaNWuWbv2MGTMYOHAgAOHh4VhZWaXb5969ezz33HPp9hkzZgxjx47F2tqaw4cPM2vWLKKjo/Hx8aFNmzaMHz/+gUP/RCRvMplM9KvvT9Virrwxbz+hNxLo/v12Pu1WmR61fY2OJyIiInmQxcVVoUKFLJoq/VEyM5fG5s2b0y2HhoY+sr2joyNr1qx5glQikpdU83Vj+bBGvP3bQTafusZ7vx9mX9gtxnaphINtxtlDRURERB6Xxc9cjR8/ntGjR5OQkJAdeUREslyhAnb8OqAO77Qui8kEC/Zc5NnpOwi/of+PiYiISNax+M7VxIkTOXfuHJ6engQEBGSYPWP//v1ZFk5EJKtYWZkY1rIMNfwKMXzBAY5diaXjlG1807M6rSt6Gh1PRERE8gCLi6tu3bplQwwRkaejUZkirBjeiCHz9rM/PJpXZ+/l9aaleLdNWWysLb6ZLyIiIpLG4uJqzJgx2ZFDROSp8XZ1ZMFrgUxYdYIZ20P5Ycs5DoTfYkrfGng4OxgdT0RERHKpx/417b59+5g7dy5z587lwIEDWZlJRCTb2dlYMaZzJab2rUEBO2t2XbhJx8nB7Dp/w+hoIiIikktZfOfq6tWr9O7dm82bN+Pm5gZAdHQ0zZs3Z8GCBRQtWjSrM4qIZJtOVX0o7+XCG/P2cToqjr4/7+Jf7crxauOST/QePxEREcl/LL5zNWzYMG7fvs2xY8e4efMmN2/e5OjRo8TGxjJ8+PDsyCgikq1KexRkyZCGdKvuQ0qqmc9WnmTQnH3E3k0yOpqIiIjkIhYXV6tXr+b777+nQoUKaesqVqzItGnTWLVqVZaGExF5WpzsbPi2V3U+7VYZO2sr1h6PovOUYI5fiTU6moiIiOQSFhdXqampGaZfB7C1tSU1NTVLQomIGMFkMvFCfX8WvR5IMTdHwm4k0P377fy296LR0URERCQXsLi4atGiBW+++SZXrlxJW3f58mXefvttWrZsmaXhRESMUM3XjeXDGtGsXFESk1N5//fD/Ov3w9xNSjE6moiIiORgFhdXU6dOJTY2loCAAEqVKkWpUqUoUaIEsbGxTJkyJTsyiog8dYUK2PHrgDq807osJhMs3HuRZ77fQdiNeKOjiYiISA5l8WyBvr6+7N+/n/Xr13Py5EkAKlSoQKtWrbI8nIiIkaysTAxrWYYafoV4c8EBjkfE0mlKMBN7VKNNJS+j44mIiEgOY3FxBX8/l9C6dWtat26d1XlERHKcRmWKsHx4I4bM28/+8Ghem7OPQU1L8l6bcthYP/brAkVERCSPyVRxNXnyZF577TUcHByYPHnyI9tqOnYRyYu8XR1Z8FogE1adYMb2UP6z5TwHw6OZ0rcGHs4ORscTERGRHCBTxdW3337L888/j4ODA99+++1D25lMJhVXIpJn2dlYMaZzJWr7u/P+74fYdeEmHScHM6VPDeqXLGx0PBERETFYpoqrCxcuPPDPIiL5Uceq3pT3dmbw3H2cjorj+Z938X7bcrzWpCQmk8noeCIiImIQix8W+OSTT0hISMiw/s6dO3zyySdZEkpEJKcrVbQgS4Y0pHuNYqSkmpmw6iSD5uwj5k6S0dFERETEIBYXV+PGjSMuLi7D+oSEBMaNG5cloUREcgMnOxu+6VmNT7tVxs7airXHo+gyNZhjV2KMjiYiIiIGsLi4MpvNDxz2cujQIdzd3bMklIhIbmEymXihvj+/Dw6kmJsjYTcSeOb7Hfy256LR0UREROQpy/RU7IUKFcJkMmEymShbtmy6AislJYW4uDhef/31bAkpIpLTVS3uxorhjXh74UE2nbrG+38cZm/YTT7pWhkHW2uj44mIiMhTkOniatKkSZjNZl566SXGjRuHq6tr2jY7OzsCAgIIDAzMlpAiIrmBm5Mdvwyow/ebz/LNutP8tvcSRy/HMv2FmvgXLmB0PBER
EclmmS6uBgwYAECJEiVo0KABtra22RZKRCS3srIyMbRFGWr4FWL4/AMcj4il05Rgvu5RjbaVvIyOJyIiItnI4meumjZtmlZY3b17l9jY2HQfERGBhqWLsGJ4Y2r5F+L23WQGzdnHhJUnSE5JNTqaiIiIZBOLi6uEhASGDh2Kh4cHBQoUoFChQuk+IiLyNy9XBxa8Vp+XGpYA4D9bz9P3511cjb1rcDIRERHJDhYXV++99x4bN25k+vTp2Nvb8/PPPzNu3Dh8fHyYPXt2dmQUEcm1bK2tGN25ItP61qSAnTW7L9ykw+Rgdp6/YXQ0ERERyWIWF1d//fUX33//Pc8++yw2NjY0btyYjz76iM8++4x58+ZlR0YRkVyvY1Vvlg1rRFnPglyPS+T5n3fxw5ZzmM1mo6OJiIhIFrG4uLp58yYlS5YEwMXFhZs3bwLQqFEjtm7dmrXpRETykFJFC7JkSEOeqVGMlFQzn686yWtz9hFzJ8noaCIiIpIFLC6uSpYsyYULFwAoX748v/32G/D3HS03N7csDSciktc42dkwsWc1/t29MnbWVqw7HkWXqcEcuxJjdDQRERF5QhYXVy+++CKHDh0C4IMPPmDatGk4ODjw9ttv895772V5QBGRvMZkMvF8PX9+HxxIMTdHwm4k0P37HSzcE250NBEREXkCmX7P1X1vv/122p9btWrFyZMn2bdvH6VLl6Zq1apZGk5EJC+rWtyNFcMbMeK3Q2w8eZV//XGEvaG3GN+tMg621kbHExEREQtZXFz9L39/f/z9/bMii4hIvuPmZMfP/Wszfcs5Jq49xaJ9lzh6JZbpz9ckoEgBo+OJiIiIBTJVXE2ePDnTBxw+fPhjhxERyY+srEwMaV6a6r5uDJ9/gBMRsXSeEszXPavRtpKX0fFEREQkkzJVXH377bfplq9du0ZCQkLaBBbR0dE4OTnh4eGh4kpE5DE1LF2EFcMbMyRoP/vCbjFozj5ea1KS99uWw8ba4kdkRURE5CnL1L/WFy5cSPv8+9//pnr16pw4cYKbN29y8+ZNTpw4Qc2aNRk/fnx25xURydO8XB1Y8Fp9Xm5UAoAft56n70+7uBp71+BkIiIi8k8s/lXoxx9/zJQpUyhXrlzaunLlyvHtt9/y0UcfZWk4EZH8yNbaio87VeT752tS0N6G3aE36TA5mJBzN4yOJiIiIo9gcXEVERFBcnJyhvUpKSlERUVlSSgREYEOVbxZNrQh5TyduR6XyPM/72T65nOYzWajo4mIiMgDWFxctWzZkkGDBrF///60dfv27WPw4MG0atUqS8OJiOR3JYsWZPGQBjxToxipZvhi9Ulenb2PmDtJRkcTERGR/2FxcfXrr7/i5eVF7dq1sbe3x97enrp16+Lp6cnPP/+cHRlFRPI1JzsbJvasxmfdq2BnbcX6E1F0nhLM0csxRkcTERGR/2Lxe66KFi3KypUrOX36NCdPngSgfPnylC1bNsvDiYjI30wmE33r+VGlmCuD5+0j/GYCz0zfwfiulehVx8/oeCIiIsITvES4bNmyKqhERJ6yKsVdWT6sESN+O8TGk1f51x9H2Bt6i0+6VsbRztroeCIiIvlapoqrESNGMH78eAoUKMCIESMe2fabb77JkmAiIvJgbk52/Ny/NtO3nGPi2lMs2neJI5dj+OGFWgQUKWB0PBERkXwrU8XVgQMHSEpKSvvzw5hMpqxJJSIij2RlZWJI89LU8HVj+IIDnIy8TecpwXzVoxrtKnsZHU9ERCRfylRxtWnTpgf+WUREjNWgdBGWD2vM0KD97A27xetz9/Fak5K817YcttYWz1kkIiIiT0D/8oqI5HJerg7Mf60+rzQqAcCPW8/z/E+7iIq9a3AyERGR/CVTd66eeeaZTB/wzz//fOwwIiLyeGytrfioU0Vq+Rfivd8Pszv0Jh0nBzOlTw0CSxU2Op6IiEi+kKk7V66urpn+WGLChAnUqVMHZ2dnPDw86NatG6dOnfrH
/RYtWkT58uVxcHCgSpUqrFy5Mt12s9nM6NGj8fb2xtHRkVatWnHmzBmLsomI5Ebtq3izbGhDyns5cz0uked/3sn3m8+Smmo2OpqIiEiel6k7VzNmzMiWk2/ZsoUhQ4ZQp04dkpOTGTVqFG3atOH48eMUKPDgGa927NhBnz59mDBhAp06dSIoKIhu3bqxf/9+KleuDMCXX37J5MmTmTVrFiVKlODjjz+mbdu2HD9+HAcHh2y5FhGRnKJk0YIsfqMhHy45wp/7L/Pl6lPsD4tmYo9quDrZGh1PREQkzzL0mavVq1czcOBAKlWqRLVq1Zg5cybh4eHs27fvoft89913tGvXjvfee48KFSowfvx4atasydSpU4G/71pNmjSJjz76iK5du1K1alVmz57NlStXWLJkyVO6MhERYznaWTOxRzUmPFMFOxsr1p+IotPUbRy9HGN0NBERkTzrsV4i/Pvvv/Pbb78RHh7OvXv30m3bv3//Y4eJifn7H313d/eHtgkJCcnwrq22bdumFU4XLlwgMjKSVq1apW13dXWlXr16hISE0Lt37wzHTExMJDExMW05NjYWgKSkpLQp6I1y//xG55DcQ31G/ttzNbwp71GAYQsOcvHmHZ6ZvoMxHcvTo1axtNdnqM+IpdRnxFLqM2KpnNRnLMlgcXE1efJkPvzwQwYOHMjSpUt58cUXOXfuHHv27GHIkCGWHi5Namoqb731Fg0bNkwb3vcgkZGReHp6plvn6elJZGRk2vb76x7W5n9NmDCBcePGZVi/du1anJycLLqO7LJu3TqjI0guoz4j/21IGZh71opjt+DDpcdZuuMoPUqkYmf9f23UZ8RS6jNiKfUZsVRO6DMJCQmZbmtxcfX999/z448/0qdPH2bOnMn7779PyZIlGT16NDdv3rT0cGmGDBnC0aNHCQ4OfuxjPK6RI0emuxsWGxuLr68vbdq0wcXF5ann+W9JSUmsW7eO1q1bY2urZyXkn6nPyMM8k2rmx20X+HbDWXZfsyLWyoUpfapRzMVOfUYsov/PiKXUZ8RSOanP3B/VlhkWF1fh4eE0aNAAAEdHR27fvg1Av379qF+/ftqzT5YYOnQoy5cvZ+vWrRQvXvyRbb28vIiKikq3LioqCi8vr7Tt99d5e3una1O9evUHHtPe3h57e/sM621tbQ3/Zt6Xk7JI7qA+Iw8yrFU5apUozPD5BzgZFccz03cxoXslQH1GLKc+I5ZSnxFL5YQ+Y8n5LZ7QwsvLK+0OlZ+fHzt37gT+ftbJbLZsql+z2czQoUNZvHgxGzdupESJEv+4T2BgIBs2bEi3bt26dQQGBgJQokQJvLy80rWJjY1l165daW1ERPKzBqWKsGJ4Y2r7F+J2YjJDFxxiSagVSSmpRkcTERHJ1Swurlq0aMGyZcsAePHFF3n77bdp3bo1vXr1onv37hYda8iQIcydO5egoCCcnZ2JjIwkMjKSO3fupLXp378/I0eOTFt+8803Wb16NRMnTuTkyZOMHTuWvXv3MnToUABMJhNvvfUWn376KcuWLePIkSP0798fHx8funXrZunliojkSZ4uDsx/rT6vNPr7l1qbIqzoP2MvUbF3DU4mIiKSe2V6WODy5cvp0KEDP/74I6mpf/92c8iQIRQuXJgdO3bQpUsXBg0aZNHJp0+fDkCzZs3SrZ8xYwYDBw4E/h6GaGX1fzVggwYNCAoK4qOPPmLUqFGUKVOGJUuWpJsE4/333yc+Pp7XXnuN6OhoGjVqxOrVq/WOKxGR/2JrbcVHnSpSvbgL7yw6yN6waDpO3sbkPjVoUKqI0fFERERynUwXV926dcPT05OBAwfy0ksvUapUKQB69+79wOnNMyMzwwg3b96cYV2PHj3o0aPHQ/cxmUx88sknfPLJJ4+VS0QkP2lbyZOIkyn8HuHGqag4Xvh5F++2LcfrTUphZWUyOp6IiEiukelhgRcuXGDQoEEsWLCAsmXL0rRpU+bMmZNuCJ+IiORO
Ho6w6LV6PFuzOKlm+HL1KV6bs5eYBOPfLyIiIpJbZLq48vX1ZfTo0Zw7d47169cTEBDA4MGD8fb25vXXX2fPnj3ZmVNERLKZo501X/eoyoRnqmBnY8X6E1fpNHUbRy/HGB1NREQkV7B4QguA5s2bM2vWLCIiIvjqq684cuQI9evXp1q1almdT0REniKTyUSfun78ObgBvu6OXLx5h2em72D+7nCLZ4QVERHJbx6ruLrP2dmZli1b0rx5c9zc3Dh+/HhW5RIREQNVLubK8qGNaVXBg3vJqYz88wjvLjrMnXspRkcTERHJsR6ruLpz5w6zZ8+mWbNmlClThgULFjBixAhCQ0OzOJ6IiBjF1cmWH/vV5v125bAywR/7L9H9++1cuB5vdDQREZEcKdOzBQLs3LmTX3/9ld9++4179+7xzDPPsH79epo3b55d+URExEBWVibeaFaa6r5uDJ9/gJORt+k8JZive1SlXWVvo+OJiIjkKJm+c1WxYkUaNmzI/v37mTBhAhEREcydO1eFlYhIPtCgVBFWDG9MnYBCxCUm8/rc/Xy6/DhJKalGRxMREckxMl1ctWrViv3797N3714GDx6Mq6trduYSEZEcxtPFgaBX6/Nq4xIA/Bx8gb4/7SQq9q7ByURERHKGTBdXkydP1myAIiL5nK21FR92rMgPL9TE2d6GPaG36Dh5GzvOXTc6moiIiOGeaLZAERHJn9pV9mbZsEaU93Lmetw9Xvh5F9M2nSU1VdO1i4hI/qXiSkREHkuJIgVY/EZDnqtVnFQzfLXmFK/O3ktMQpLR0URERAyh4kpERB6bo501Xz1Xlc+fqYKdjRUbTl6l09RtHL0cY3Q0ERGRp07FlYiIPBGTyUTvun78ObgBvu6OXLx5h2em7yBoVzhms4YJiohI/mHRe67u27BhAxs2bODq1aukpqafhvfXX3/NkmAiIpK7VC7myvKhjXln0SHWn4hi1OIj7A27yb+7VcHRztroeCIiItnO4jtX48aNo02bNmzYsIHr169z69atdB8REcm/XJ1s+bFfLf7VrjxWJvhz/2W6f7+d89fijI4mIiKS7Sy+c/XDDz8wc+ZM+vXrlx15REQkl7OyMjG4WSmq+7oxbP4BTkbepsvU7Xz1XFXaV/E2Op6IiEi2sfjO1b1792jQoEF2ZBERkTwksFRhVg5vRN0Ad+ISkxk8bz/jlx8nKSX1n3cWERHJhSwurl555RWCgoKyI4uIiOQxHi4OzHu1Hq81KQnAL8EX6PPjTiJj7hqcTEREJOtZPCzw7t27/Pjjj6xfv56qVatia2ubbvs333yTZeFERCT3s7W2YlSHCtT0K8R7iw6xN+wWnaZsY3LvGjQoXcToeCIiIlnG4uLq8OHDVK9eHYCjR4+m22YymbIklIiI5D3tKntRzsuZwXP3cTLyNi/8sot32pRjcNNSWFnp3w8REcn9LC6uNm3alB05REQkHyhRpABLhjTk4yVHWbTvEl+tOcX+sFt807M6rk62/3wAERGRHOyJXiJ86dIlLl26lFVZREQkH3CwtearHtX44tkq2NlYseHkVTpO2caRSzFGRxMREXkiFhdXqampfPLJJ7i6uuLv74+/vz9ubm6MHz8+wwuFRUREHqZXHT/+HNwAP3cnLt26w7PTdxC0Kxyz2Wx0NBERkcdicXH14YcfMnXqVD7//HMOHDjAgQMH+Oyzz5gyZQoff/xxdmQUEZE8qnIxV/4a1ohWFTy5l5LKqMVHeOe3Q9y5l2J0NBEREYtZ/MzVrFmz+Pnnn+nSpUvauqpVq1KsWDHeeOMN/v3vf2dpQBERydtcHW35qX8t/rP1PF+uPsmfBy5z7Eos01+oScmiBY2OJyIikmkW37m6efMm5cuXz7C+fPny3Lx5M0tCiYhI/mIymXi9aSmCXq1PkYL2nIq6TZep21l5JMLoaCIiIplmcXFVrVo1pk6dmmH91KlTqVatWpaEEhGR/Kl+ycKsHN6IugHuxCUm
88a8/YxffpykFD3TKyIiOZ/FwwK//PJLOnbsyPr16wkMDAQgJCSEixcvsnLlyiwPKCIi+YuHiwNBr9bjqzWn+M/W8/wSfIGDF6OZ1rcmXq4ORscTERF5KIvvXDVt2pTTp0/TvXt3oqOjiY6O5plnnuHUqVM0btw4OzKKiEg+Y2NtxcgOFfhPv1o429uwL+wWHSdvY/vZ60ZHExEReSiL71wB+Pj4aOIKERHJdm0reVFumDOD5+3nREQs/X7ZxYjWZXmjWWmsrExGxxMREUnniV4iLCIikt0CihRg8RsN6FGrOKlm+HrtaV6ZvZfohHtGRxMREUlHxZWIiOR4DrbWfNWjGl8+WxV7Gys2nrxKpynBHL4UbXQ0ERGRNCquREQk1+hZx5c/32iAn7sTl27d4bnpIczbFYbZbDY6moiIiIorERHJXSr5uPLXsEa0rujJvZRUPlx8lHd+O0TCvWSjo4mISD6X6eLq6tWrj9yenJzM7t27nziQiIjIP3F1tOXHfrX4oH15rK1M/HngMt2n7eDctTijo4mISD6W6eLK29s7XYFVpUoVLl68mLZ848aNtPdeiYiIZDeTycTrTUsx75V6FHW251TUbbpO3c7KIxFGRxMRkXwq08XV/45nDw0NJSkp6ZFtREREslv9koVZMawRdUu4E5eYzBvz9vPJX8dJSkk1OpqIiOQzWfrMlcmkd46IiMjT5+HiQNAr9RjUpCQAv26/QO8fdxIRc8fgZCIikp9oQgsREckTbKytGNmhAv/pVwtnBxv2hd2i0+Rggs9cNzqaiIjkE5kurkwmE7dv3yY2NpaYmBhMJhNxcXHExsamfURERIzWtpIXy4c1ooK3Czfi79Hv111M2XCG1FQNXRcRkexlk9mGZrOZsmXLpluuUaNGumUNCxQRkZzAv3ABFr/RgNFLj/Lb3ktMXHea/eG3+LZXddyc7IyOJyIieVSmi6tNmzZlZw4REZEs5WBrzZfPVaO2vzsfLz3KplPX6Dg5mOkv1KRqcTej44mISB6U6eKqadOm2ZlDREQkW/Ss40ulYi68MW8/YTcSeG56CKM7V+T5en4acSEiIlkq089cJScnk5iYmG5dVFQU48aN4/333yc4ODjLw4mIiGSFSj6uLBvaiDYVPbmXkspHS44y4rdDJNxLNjqaiIjkIZkurl599VWGDx+etnz79m3q1KnDtGnTWLNmDc2bN2flypUWnXzr1q107twZHx8fTCYTS5YseWT7gQMHYjKZMnwqVaqU1mbs2LEZtpcvX96iXCIikve4Otryn361GNm+PNZWJhYfuEy3ads5dy3O6GgiIpJHZLq42r59O88++2za8uzZs0lJSeHMmTMcOnSIESNG8NVXX1l08vj4eKpVq8a0adMy1f67774jIiIi7XPx4kXc3d3p0aNHunaVKlVK10531UREBP6e+XZQ01IEvVKPos72nI6Ko8uUYFYcjjA6moiI5AGZfubq8uXLlClTJm15w4YNPPvss7i6ugIwYMAAZsyYYdHJ27dvT/v27TPd3tXVNe18AEuWLOHWrVu8+OKL6drZ2Njg5eVlURYREck/6pUszIrhjRgadIDdF24yJGg/e8MCGNm+AnY2egWkiIg8nkwXVw4ODty5839vut+5c2e6O1UODg7ExT3doRW//PILrVq1wt/fP936M2fO4OPjg4ODA4GBgUyYMAE/P7+HHicxMTHd82T339mVlJREUlJS9oTPpPvnNzqH5B7qM2Kp/NpnCjlYM2tATb5Zf5afgkOZsT2Ug+G3+K5XNbxdHUhJNbM37BZXbyfi4WxPbf9CWFtpAgzIv31GHp/6jFgqJ/UZSzKYzGZzpt6q2LJlS+rWrcuECRPYtm0bzZo149KlS3h7ewOwbt06Bg8ezNmzZx8rtMlkYvHixXTr1i1T7a9cuYKfnx9BQUH07Nkzbf2qVauIi4ujXLlyREREMG7cOC5fvszRo0dxdnZ+4LHGjh3LuHHjMqwPCgrC
ycnpsa5HRERyjyM3Tcw7a8WdFBMFbMw08kxl1zUrou/9XzHlZmfmmYBUqhXWy4hFRPKThIQE+vbtS0xMDC4uLo9sm+niasuWLbRv3x5vb28iIiLo06cPv/zyS9r2N954g/j4eGbNmvVYoS0triZMmMDEiRO5cuUKdnYPfyFkdHQ0/v7+fPPNN7z88ssPbPOgO1e+vr5cv379H7+A2S0pKYl169bRunVrbG1tDc0iuYP6jFhKfeZvYTcTGDb/ECcibz9w+/0ya0rvarSt5Pn0guVA6jNiKfUZsVRO6jOxsbEUKVIkU8WVRe+52rdvH2vXrsXLyyvDJBLVq1enbt26j5fYQmazmV9//ZV+/fo9srACcHNzo2zZso+8o2Zvb4+9vX2G9ba2toZ/M+/LSVkkd1CfEUvl9z5T2tOV3wc3oPan67mTlJJhu5m/C6x/rzpF+6rFNEQQ9RmxnPqMWCon9BlLzp/p4gqgQoUKVKhQ4YHbXnvtNUsO9US2bNnC2bNnH3on6r/FxcVx7tw5+vXr9xSSiYhIbnb4UswDC6v7zEBEzF12X7hJYKnCTy+YiIjkCpkurrZu3Zqpdk2aNMn0yePi4tLdUbpw4QIHDx7E3d0dPz8/Ro4cyeXLl5k9e3a6/X755Rfq1atH5cqVMxzz3XffpXPnzvj7+3PlyhXGjBmDtbU1ffr0yXQuERHJn67evpupdhdvJRCIiisREUkv08VVs2bNMJn+HgLxsMe0TCYTKSkP/43f/9q7dy/NmzdPWx4xYgTw97TuM2fOJCIigvDw8HT7xMTE8Mcff/Ddd9898JiXLl2iT58+3Lhxg6JFi9KoUSN27txJ0aJFM51LRETyJw9nh0y1G7P0KCciYukfGECJIgWyOZWIiOQWmS6uChUqhLOzMwMHDqRfv34UKVLkiU/erFmzhxZqADNnzsywztXVlYSEhIfus2DBgifOJSIi+VPdEu54uzoQGXOXh/3rZG1l4k5SKjO2/z19e9OyRRnYIICmZYtipeewRETytUy/KTEiIoIvvviCkJAQqlSpwssvv8yOHTtwcXFJe7nvf7/gV0REJLextjIxpnNF4P9mB7zP9P8/U3rXYOaLdWhR3gOTCbacvsaLM/fQfOJmft52npgE49/JIiIixsh0cWVnZ0evXr1Ys2YNJ0+epGrVqgwdOhRfX18+/PBDkpOTszOniIjIU9GusjfTX6iJl2v6IYJerg5Mf6EmHap606ycB78OrMOmd5rxSqMSuDjYEHYjgU9XnKD+hA2M/PMIJyNjDboCERExikWzBd7n5+fH6NGj6devHy+//DKff/4577zzDu7u7lmdT0RE5KlrV9mb1hW92H3hJldv38XD2YG6JdwzTL8eUKQAH3WqyIg2ZVly4AqzQ0I5GXmb+bvDmb87nHol3BnQIIDWFT2xtc707zNFRCSXsri4SkxM5I8//uDXX38lJCSEjh07smLFChVWIiKSp1hbmTI93bqTnQ196/nRp64vuy/cZFZIKGuORbHrwk12XbiJl4sDL9T3o3ddP4oUzPheRRERyRsyXVzt3r2bGTNmsGDBAgICAnjxxRf57bffVFSJiIj8fyaTiXolC1OvZGEiYu4wb+ffd7AiY+/y9drTTN5wlo5VvRnQIIDqvm5GxxURkSyW6eKqfv36+Pn5MXz4cGrVqgVAcHBwhnZdunTJunQiIiK5lLerI++2LcewlqVZeSSCmTvCOHQxmsUHLrP4wGWqFXelf2AAnap5Y29jbXRcERHJAhYNCwwPD2f8+PEP3W7pe65ERETyOnsba7rXKE73GsU5eDGa2SGhLD8UwaFLMbyz6BCfrTxB77q+PF/PHx83R6PjiojIE8h0cZWampqdOURERPK86r5uVPetzqgOFVi45yJzd4YREXOXaZvO8cOW87Sp6En/wADql3THZNI7s0REcpvHmi3wYe7cuYOjo37rJiIi8ihFCtozpHlpBjUpybrjUcwKCWXn+ZusOhrJ
qqORlPN0pn8Df7rXKIaTXZb+Uy0iItkoS+aFTUxMZOLEiZQoUSIrDiciIpIv2Fhb0b6KNwteC2TNW03oW88PR1trTkXd5sPFR6n32QbGLz9O6PV4o6OKiEgmZLq4SkxMZOTIkdSuXZsGDRqwZMkSAGbMmEGJEiWYNGkSb7/9dnblFBERydPKeTnzWfcq7BzVko87VcS/sBO37ybzS/AFmn29mYEzdrPp1FVSU81GRxURkYfI9FiD0aNH85///IdWrVqxY8cOevTowYsvvsjOnTv55ptv6NGjB9bWmu1IRETkSbg62vJyoxK82CCALWeuMWtHKJtPXUv7+Bd2ol99f3rU9sXV0dbouCIi8l8yXVwtWrSI2bNn06VLF44ePUrVqlVJTk7m0KFDeuhWREQki1lZmWhezoPm5TwIvR7PnJ1h/Lb3ImE3Evh0xQkmrj1N95rFGBAYQDkvZ6PjiogIFgwLvHTpUtr7rSpXroy9vT1vv/22CisREZFsFlCkAB93qsjOkS35d/fKlPN05k5SCkG7wmk7aSu9fwxh1ZEIklM0s6+IiJEyfecqJSUFOzu7/9vRxoaCBQtmSygRERHJqIC9Dc/X86dvXT92nr/J7JBQ1h6PYuf5m+w8fxNvVwdeqO9Przq+FClob3RcEZF8J9PFldlsZuDAgdjb//0/67t37/L6669ToECBdO3+/PPPrE0oIiIi6ZhMJgJLFSawVGGuRN9h3q4w5u++SETMXb5ac4rv1p+hUzVvBgQGUM3Xzei4IiL5RqaLqwEDBqRbfuGFF7I8jIiIiFjGx82R99qWZ1iLMqw8EsGsHaEcuhTDn/sv8+f+y1TzdWNgA386VPHG3kYTT4mIZKdMF1czZszIzhwiIiLyBBxsrXmmZnGeqVmcgxejmbUjlBWHIzh0MZq3F0bz6fIT9Knrx/P1/fB2dTQ6rohInpQlLxEWERGRnKO6rxvf9qrO9g9a8G6bsni5OHAj/h5TN52l0RebeGPePnadv4HZrHdmiYhkpUzfuRIREZHcpaizPUNblGFQ01KsOx7FrB2h7Lpwk5VHIll5JJLyXs70DwygWw0fnOz0I4GIyJPS/0lFRETyOFtrKzpU8aZDFW9ORMQyOySMJQcuczLyNqMWH+HzVSfoWduXfoH++Bcu8M8HFBGRB9KwQBERkXykgrcLE56pws6RLfmoYwX8CzsRezeZn4Mv0Ozrzbw0cw+bT10lNVVDBkVELKU7VyIiIvmQq5MtrzQuyUsNS7Dl9DVmhYSy+dQ1Np68ysaTVwko7ES/wAB61C6Oi4Ot0XFFRHIFFVciIiL5mJWVieblPWhe3oML1+OZExLGor0XCb2RwPjlx5m49hTdaxRjQIMAyno6Gx1XRCRHU3ElIiIiAJQoUoDRnSvyTpuyLD5wmdkhoZyOimPernDm7QonsGRhBjQIoGnpQkZHFRHJkVRciYiISDoF7G14ob4/z9fzI+T8DWbvCGPt8UhCzt8g5PwNvF0dqOVqol78PbzcNGRQROQ+FVciIiLyQCaTiQalitCgVBEuR99h3s4wFuy5SETMXZbHWLP26610rurDgAb+VC3uZnRcERHDabZAERER+UfF3Bx5v115dnzQgi+fqYxvATP3klP5Y/8lukzdTrdp21ly4DKJySlGRxURMYyKKxEREck0B1trutfw4Z0qKSx6rS7daxTD1trEwYvRvLXwIA0/38jEtaeIjLlrdFQRkadOwwJFRETEYiYTVPd1o07JoozqUIEFu8OZuyuMqNhEpmw8y/ebz9GukhcDGgRQJ6AQJpPJ6MgiItlOxZWIiIg8kaLO9gxrWYbXm5Vi7bEoZoWEsvvCTVYciWDFkQjKezkzoEEA3aoXw9HO2ui4IiLZRsWViIiIZAlbays6VvWmY1Vvjl+JZc7OUBYfuMzJyNuM/PMIE1aeoFcdX/rVD8CvsJPRcUVEspyeuRIREZEsV9HHhQnPVGXXyFZ82KECvu6OxN5N5qdt
F2j69SZenrmHLaevkZpqNjqqiEiW0Z0rERERyTauTra82qQkLzUqwZbTV5m5I4ytp6+x4eRVNpy8SskiBegX6M+ztYrj4qB3ZolI7qbiSkRERLKdtZWJFuU9aVHek/PX4pizM4zf917i/PV4xv11nK/WnOKZmsUYEBhAGU9no+OKiDwWDQsUERGRp6pk0YKM6VyJkFEtGd+tMmU8CpJwL4W5O8Np/e1W+v60k9VHI0lOSTU6qoiIRXTnSkRERAxR0N6GfvX9eaGeHyHnbjArJJR1x6PYce4GO87doJibI8/X96N3HT/cC9gZHVdE5B+puBIRERFDmUwmGpQuQoPSRbgcfYe5O8NYsDucy9F3+HL1KSatP0OXaj4MCAygSnFXo+OKiDyUiisRERHJMYq5OfKvduV5s2UZ/jp0hVkhoRy9HMvv+y7x+75L1PRzY0CDANpX9sbORk83iEjOouJKREREchwHW2t61PbluVrFOXAxmlk7Qll5JIL94dHsDz/I+IIn6FvPj+fr+eHp4mB0XBERQMWViIiI5GAmk4mafoWo6VeIDztWYMHui8zbFUZUbCKTN5zh+01naVvZi4ENAqjtXwiTyWR0ZBHJx1RciYiISK7g4ezA8JZlGNysFGuORTJ7Rxi7Q2+y4nAEKw5HUMHbhQGB/nStXgxHO2uj44pIPqTiSkRERHIVW2srOlX1oVNVH45diWFOSBhLDl7mREQsH/x5hAmrTtKrji/96vvj6+5kdFwRyUf0JKiIiIjkWpV8XPn82arsHNmSUR3K4+vuSMydJH7cep4mX23ilVl72Hr6GqmpZqOjikg+oDtXIiIikuu5OdnxWpNSvNyoJJtPXWXmjlC2nbnO+hNXWX/iKiWLFqB/fX+erVUcZwdbo+OKSB5l6J2rrVu30rlzZ3x8fDCZTCxZsuSR7Tdv3ozJZMrwiYyMTNdu2rRpBAQE4ODgQL169di9e3c2XoWIiIjkFNZWJlpW8GTOy/XY8E5TBjYIoKC9DeevxTP2r+PU/2wDo5ce5ezV20ZHFZE8yNDiKj4+nmrVqjFt2jSL9jt16hQRERFpHw8Pj7RtCxcuZMSIEYwZM4b9+/dTrVo12rZty9WrV7M6voiIiORgpYoWZGyXSuwc1ZLxXStR2qMg8fdSmB0SRqtvtvL8zztZeyySFA0ZFJEsYuiwwPbt29O+fXuL9/Pw8MDNze2B27755hteffVVXnzxRQB++OEHVqxYwa+//soHH3zwJHFFREQkFypob0O/wABeqO/PjnM3mLUjlPUnoth+9gbbz96gmJsjL9T3p3cdXwoVsDM6rojkYrnymavq1auTmJhI5cqVGTt2LA0bNgTg3r177Nu3j5EjR6a1tbKyolWrVoSEhDz0eImJiSQmJqYtx8bGApCUlERSUlI2XUXm3D+/0Tkk91CfEUupz4ilcnOfqevvSl3/aly6dYf5ey7y297LXI6+wxerTzJp/Wk6VfWiXz0/Kvm4GB01T8nNfUaMkZP6jCUZclVx5e3tzQ8//EDt2rVJTEzk559/plmzZuzatYuaNWty/fp1UlJS8PT0TLefp6cnJ0+efOhxJ0yYwLhx4zKsX7t2LU5OOWMK13Xr1hkdQXIZ9RmxlPqMWCq395lKwIdVYP8NE9sirbgUn8of+6/wx/4rlHA209grlWruZmw0t3KWye19Rp6+nNBnEhISMt02VxVX5cqVo1y5cmnLDRo04Ny5c3z77bfMmTPnsY87cuRIRowYkbYcGxuLr68vbdq0wcXF2N9cJSUlsW7dOlq3bo2trWY3kn+mPiOWUp8RS+W1PtMNMJvNHLwYw+yd4aw+FsWF23DhtjVFC9rRu05xetUujqeLg9FRc6281mck++WkPnN/VFtm5Kri6kHq1q1LcHAwAEWKFMHa2pqoqKh0baKiovDy8nroMezt7bG3t8+w3tbW1vBv5n05KYvkDuozYin1GbFUXuszdUsVpW6polyNvcv83ReZtyuM
q7cTmbLpPNO3XKB9FW8GBPpTy78QJpPJ6Li5Ul7rM5L9ckKfseT8uf5G98GDB/H29gbAzs6OWrVqsWHDhrTtqampbNiwgcDAQKMiioiISC7i4eLAm63KEPyvFkzpU4Pa/oVITjXz16ErPPdDCB0nB/PbnovcTUoxOqqI5DCG3rmKi4vj7NmzacsXLlzg4MGDuLu74+fnx8iRI7l8+TKzZ88GYNKkSZQoUYJKlSpx9+5dfv75ZzZu3MjatWvTjjFixAgGDBhA7dq1qVu3LpMmTSI+Pj5t9kARERGRzLCzsaJzNR86V/Ph6OUY5oSEseTgZY5HxPL+H4f5bNUJetX25YX6/vi654xntEXEWIYWV3v37qV58+Zpy/efexowYAAzZ84kIiKC8PDwtO337t3jnXfe4fLlyzg5OVG1alXWr1+f7hi9evXi2rVrjB49msjISKpXr87q1aszTHIhIiIiklmVi7nyxXNV+aB9eX7be5E5O8O4dOsO/9l6nh+3nadleU8GNPCnUekiGjIoko8ZWlw1a9YMs/nhL+6bOXNmuuX333+f999//x+PO3ToUIYOHfqk8URERETSKVTAjkFNS/FK45JsPHmV2SGhbDtznfUnolh/IopSRQvQPzCAZ2sVp6B9rn+0XUQspL/1IiIiIhaytjLRuqInrSt6cvZqHHN3hvH7vkucuxbPmGXH+GrNKZ6tWYx+gQGU9ihodFwReUpy/YQWIiIiIkYq7VGQsV0qETKyBZ90rUSpogWIS0xmVkgYrb7ZQr9fdrHueBQpqQ8frSMieYPuXImIiIhkAWcHW/oHBtCvvj/bz95gVkgo609Ese3MdbaduU7xQo70q+9Pz9q+FCpgZ3RcEckGKq5EREREspDJZKJRmSI0KlOEizcTmLsrjIV7LnLp1h0mrDrJN+tO07W6D/0DA6hczNXouCKShTQsUERERCSb+Lo7MbJ9BXaObMmXz1alorcLicmp/Lb3Ep2mBPPc9B0sO3SFpJRUo6OKSBbQnSsRERGRbOZga03POr70qF2c/eG3mLkjjFVHItgbdou9YbfwcLanbz0/+tbzw8PZwei4IvKYVFyJiIiIPCUmk4la/u7U8nfnascKBO0OZ96ucK7eTmTS+jNM23SW9pW9GdDAn5p+hfTOLJFcRsWViIiIiAE8XBx4q1VZ3mhWmtXHIpm9I5S9YbdYdugKyw5doXIxF/oHBtClmg8OttZGxxWRTNAzVyIiIiIGsrOxoks1H34f3IDlwxrRs3Zx7G2sOHo5lvd/P0zghA18vuokl24lGB1VRP6BiisRERGRHKJyMVe+fK4aO0e25IP25Snm5sithCR+2HKOJl9u4tXZewk+cx2zWe/MEsmJNCxQREREJIcpVMCO15uW4tXGJdlwIorZIWEEn73OuuNRrDseRamiBRjQIIBnahanoL1+nBPJKfS3UURERCSHsrYy0aaSF20qeXH26m1mh4Txx75LnLsWz+ilx/hy9Smeq1WcfoH+lCpa0Oi4IvmehgWKiIiI5AKlPZz5pGtldo5qybgulShZtABxicnM3BFKy4lb6PfLLtYfjyIlVUMGRYyiO1ciIiIiuYizgy0DGgTQr74/289dZ9aOMDacjGLbmetsO3MdX3dH+tX3p2dtX9yc7IyOK5KvqLgSERERyYWsrEw0LlOUxmWKcvFmAnN3hrFgz0Uu3rzDZytPMnHtabpVL8aABgFU9HExOq5IvqBhgSIiIiK5nK+7EyM7VGDnyJZ88WwVKni7kJicysK9F+kweRs9ftjBX4eukJSSanRUkTxNd65ERERE8ghHO2t61fGjZ21f9obdYtaOUFYfjWRP6C32hN7Cw9me5+v506eeLx7ODkbHFclzVFyJiIiI5DEmk4k6Ae7UCXAnKvYu83aFE7QrnKu3E/l2/WmmbjpDhyre9A8MoKafGyaTyejIInmCiisRERGRPMzTxYERrcsytHlpVh2NYNaOUPaHR7P04BWWHrxC
lWKu9A/0p3M1HxxsrY2OK5Kr6ZkrERERkXzAzsaKrtWL8ecbDflraCN61CqOnY0VRy7H8N7vhwmcsIEvVp/k0q0Eo6OK5FoqrkRERETymSrFXfmqRzV2jmzJv9qVp5ibI7cSkpi++RxNvtzEa7P3suPsdcxmvTNLxBIaFigiIiKST7kXsGNws1K81qQk609EMTsklO1nb7D2eBRrj0dR2qMgAwL9eaZmcQrY68dGkX+ivyUiIiIi+Zy1lYm2lbxoW8mLM1G3mR0Sxh/7L3H2ahwfLz3Gl6tP8Wyt4vQP9Kdk0YJGxxXJsTQsUERERETSlPF0Zny3yuwc1ZIxnStSokgBbicmM3NHKC0mbqH/r7vZcCKKlFQNGRT5X7pzJSIiIiIZuDjY8mLDEgwIDCD47HVm7Qhl46mrbD19ja2nr+Hn7kS/+v70rO2Lq5Ot0XFFcgQVVyIiIiLyUFZWJpqULUqTskUJv5HA3F1hLNxzkfCbCfx75QkmrjtF9xrF6B8YQAVvF6PjihhKwwJFREREJFP8CjsxqkMFdo5syefPVKG8lzN3k1KZv/si7b/bRs8fQlhxOIKklFSjo4oYQneuRERERMQijnbW9K7rR686vuwJvcWskFBWH41kd+hNdofexNPFnufr+dOnrh9Fne2Njivy1Ki4EhEREZHHYjKZqFvCnbol3ImMuUvQrjCCdocTFZvIN+tOM2XjGTpW8aZv3eLolVmSH6i4EhEREZEn5uXqwIg25RjSojSrj0Yyc0coB8KjWXLwCksOXsG3gDWJPpfpWsMXB1tro+OKZAs9cyUiIiIiWcbexpqu1Yux+I2GLBvakGdrFsfOxoqL8Sb+9ecxGny+kS9Xn+Ry9B2jo4pkORVXIiIiIpItqhZ3Y2LPamx9twmd/FLwdnXgZvw9vt98jsZfbOT1OfvYce46Zo0ZlDxCwwJFREREJFsVLmBH62JmvnqpEVvO3mJ2SCg7zt1g9bFIVh+LpKxnQfoHBtC9RjEK2OvHU8m91HtFRERE5KmwsbaiXWUv2lX24nTUbWaHhPLn/sucjorjoyVH+WLVSZ6rXZz+gQGUKFLA6LgiFtOwQBERERF56sp6OvNptyrsHNWS0Z0qUqJIAW4nJjNjeyjNv97MgF93s/FkFKmpGjIouYfuXImIiIiIYVwcbHmpUQkGNghg29nrzN4RysZTV9ly+hpbTl/Dz92J/oH+9Kjli6uTrdFxRR5JxZWIiIiIGM7KykTTskVpWrYoYTfimbszjIV7LhJ+M4FPV5xg4trTdKtRjAEN/Cnv5WJ0XJEH0rBAEREREclR/AsX4MOOFdk5qiUTnqlCeS9n7iSlMH93OO0mbaPXf0JYeSSCpJRUo6OKpKM7VyIiIiKSIznZ2dCnrh+96/iy+8JNZoeEsfpYJLsu3GTXhZt4uTjwQn0/etf1o0hBe6Pjiqi4EhEREZGczWQyUa9kYeqVLExEzB2CdoUzf3c4kbF3+XrtaSZvOEvHqt4MaBBAdV83o+NKPqbiSkRERERyDW9XR95pU46hLUqz6kgkM3eEcvBiNIsPXGbxgctUK+5K/8AAOlb1xsHW2ui4ks/omSsRERERyXXsbazpVqMYS4Y0ZOmQhjxbszh21lYcuhTDO4sO0fDzjXy15iRXou8YHVXyERVXIiIiIpKrVfN1Y2LPaoSMbMF7bcvh4+rAjfh7TNt0jsZfbmLw3H2EnLuB2ax3Zkn20rBAEREREckTChe0Z0jz0gxqUpL1J6KYtSOMkPM3WHU0klVHIynn6Uz/Bv50r1EMJzv9GCxZz9A7V1u3bqVz5874+PhgMplYsmTJI9v/+eeftG7dmqJFi+Li4kJgYCBr1qxJ12bs2LGYTKZ0n/Lly2fjVYiIiIhITmJjbUW7yt7Mf60+a95qwvP1/HC0teZU1G0+XHyUep9tYPzy44Rejzc6quQxhhZX8fHxVKtWjWnTpmWq/datW2ndujUrV65k3759NG/enM6dO3Pg
wIF07SpVqkRERETaJzg4ODvii4iIiEgOV87LmX93r8LOUS35uFNFAgo7cftuMr8EX6DZ15sZOGM3m05dJTVVQwblyRl6P7R9+/a0b98+0+0nTZqUbvmzzz5j6dKl/PXXX9SoUSNtvY2NDV5eXlkVU0RERERyOVdHW15uVIIXGwSw9cw1Zu0IZfPpa2w+9ffHv7AT/er706O2L66OtkbHlVwqVw82TU1N5fbt27i7u6dbf+bMGXx8fHBwcCAwMJAJEybg5+f30OMkJiaSmJiYthwbGwtAUlISSUlJ2RM+k+6f3+gcknuoz4il1GfEUuozYqmc1mcalixEw5KFCLuRQNDuiyzaf5mwGwl8uuIEE9eeomt1H16o60s5L2ejo+ZbOanPWJLBZM4h06aYTCYWL15Mt27dMr3Pl19+yeeff87Jkyfx8PAAYNWqVcTFxVGuXDkiIiIYN24cly9f5ujRozg7P/gvyNixYxk3blyG9UFBQTg5OT3W9YiIiIhI7pCYAvuum9gaYUXEHVPa+tIuqTT2MlPF3Yy16REHkDwtISGBvn37EhMTg4uLyyPb5triKigoiFdffZWlS5fSqlWrh7aLjo7G39+fb775hpdffvmBbR5058rX15fr16//4xcwuyUlJbFu3Tpat26Nra1uUcs/U58RS6nPiKXUZ8RSuaXPmM1mdofeYu6ui6w7cZWU//8clpeLPX3r+tKzVjEKF7Q3OGX+kJP6TGxsLEWKFMlUcZUrhwUuWLCAV155hUWLFj2ysAJwc3OjbNmynD179qFt7O3tsbfP+BfF1tbW8G/mfTkpi+QO6jNiKfUZsZT6jFgqN/SZRmU9aVTWk4iYO8zbGc783eFExibyzfqzTN10nk7VvBkQGEA1Xzejo+YLOaHPWHL+XPcS4fnz5/Piiy8yf/58Onbs+I/t4+LiOHfuHN7e3k8hnYiIiIjkBd6ujrzbthw7Rrbgm57VqObrxr2UVP7cf5mu07bTddp2Fh+4RGJyitFRJQcx9M5VXFxcujtKFy5c4ODBg7i7u+Pn58fIkSO5fPkys2fPBv4eCjhgwAC+++476tWrR2RkJACOjo64uroC8O6779K5c2f8/f25cuUKY8aMwdramj59+jz9CxQRERGRXM3exppnahbnmZrFOXgxmtk7Qll+OIJDF6N5e2E0ny4/QZ+6fjxf3w9vV0ej44rBDL1ztXfvXmrUqJE2jfqIESOoUaMGo0ePBiAiIoLw8PC09j/++CPJyckMGTIEb2/vtM+bb76Z1ubSpUv06dOHcuXK0bNnTwoXLszOnTspWrTo0704EREREclTqvu68U2v6uwY2YL32pbD29WBG/H3mLrpLI2+2MQb8/ax8/wNcsiUBmIAQ+9cNWvW7JGdb+bMmemWN2/e/I/HXLBgwROmEhERERF5uCIF7RnSvDSDmpRk3fEoZoWEsvP8TVYeiWTlkUjKeznTPzCAbjV8cLLLlVMcyGPSd1tERERE5DHYWFvRvoo37at4czIyltkhYSzef5mTkbcZtfgIn686Qc/avvQL9Me/cAGj48pTkOsmtBARERERyWnKe7nwWfcq7BzVko86VsC/sBOxd5P5OfgCzb7ezEsz97D51FVSUzVkMC/TnSsRERERkSzi6mjLK41L8lLDEmw5fY1ZIaFsPnWNjSevsvHkVQIKO9EvMIAetYvj4pCzp6UXy6m4EhERERHJYlZWJpqX96B5eQ8uXI9nTkgYi/ZdJPRGAuOXH2fi2lN0r1GM/oEBlPNyNjquZBENCxQRERERyUYlihRgdOeK7BzZkn93r0xZz4Ik3Eth3q5w2k7aSp8fd7L6aATJKalGR5UnpDtXIiIiIiJPQQF7G56v50/fun7sPH+T2SGhrD0eRcj5G4Scv4GPqwPP1/endx1fChe0NzquPAYVVyIiIiIiT5HJZCKwVGECSxXmSvQd5u0KY/7ui1yJuctXa07x3YYzdK7qw4AG/lQt7mZ0XLGAiisR
EREREYP4uDnyXtvyDGtRhhWHI5gVEsrhSzH8sf8Sf+y/RHVfNwY2CKB9FS/sbayNjiv/QM9ciYiIiIgYzMHWmmdrFWfZ0EYsGdKQ7jWKYWtt4uDFaN5aeJCGn29k4tpTRMbcNTqqPIKKKxERERGRHKS6rxvf9qrOjg9a8k7rsni5OHA97h5TNp6l4RcbGTJvP7vO38Bs1juzchoNCxQRERERyYGKOtszrGUZXm9WinXHo5i5I5TdF26y4kgEK45EUN7LmQENAuha3QcnO/1YnxPouyAiIiIikoPZWlvRoYo3Hap4cyIiltkhYSw+cImTkbcZ+ecRJqw8Qa86vvSrH4BfYSej4+ZrGhYoIiIiIpJLVPB2YcIzVdg1shUfdayAn7sTsXeT+WnbBZp+vYmXZ+5hy+lrpKZqyKARdOdKRERERCSXcXWy5ZXGJXmpYQk2n77KrB1hbDl9jQ0nr7Lh5FVKFilAv0B/nq1VHBcHW6Pj5hsqrkREREREcikrKxMtynvSorwnF67HMzsklN/3XuL89XjG/XWcr9ac4pmaxRgQGEAZT2ej4+Z5GhYoIiIiIpIHlChSgDGdK7FzVEs+7VaZMh4FSbiXwtyd4bT+dit9f9rJ6qORJKekGh01z9KdKxERERGRPKSAvQ0v1Pfn+Xp+hJy/wawdoaw7HsWOczfYce4Gxdwceb6+H73r+OFewM7ouHmKiisRERERkTzIZDLRoFQRGpQqwuXoO8zbGcaCPRe5HH2HL1efYtL6M3Sp5sOAwACqFHc1Om6eoOJKRERERCSPK+bmyPvtyjO8ZRmWH45g1o5QjlyO4fd9l/h93yVq+rkxoEEA7St7Y2ejJ4cel4orEREREZF8wsHWmudqFefZmsU4cDGa2TtCWXEkgv3h0ewPP8j4gifoW8+P5+v54eniYHTcXEfFlYiIiIhIPmP6f+3de1RU9doH8O9wmQEVBDMuoyMjgoKCKJQEpoiiHOOQ+trrNTSPZiX6huSF1ELlaJ6Tmp6iOhqFnrdCS3OVGl4QjkoIimBeEC+ImoJXjJvK7ff+0WLeJkDZnIE9yPez1l7L+e3nt/ez8VnI4977h0IB72628O5mi8UhvZGQcQX/m34ZN4of4h9J5/Fx8gUEezjgFX8tnnGyhUKhkDvlVoHNFRERERFRG/a0lQpzhrni9SE9sPf0DWxKy0fGpbvY9XMBdv1cAHdHa0z1c8Kofl1gqTSVO12jxuaKiIiIiIhgbmqCkL6OCOnriDPXi/GvI/n4LusacgqKEbX9JN778SzGP6vBy75O6PZUO7nTNUp8W42IiIiIiPT0Vlvjvf/qiyNvD8PiF9yh6WSJX+9XYsPBPASsTsaMTUdx8Nwt1NQIuVM1KrxzRURERERE9bJpp8Srg53xl+e7IyX3JjalXcbBc7ewP+cm9ufchHPn9pji54SxPl1hZWEud7qyY3NFRERERESPZGqiwDB3ewxzt0ferVJsTruMbzN/Qd7tMiz94Qze35OLsT5dMcXPCS52VnKnKxs+FkhERERERI3m/HQHLH2xD44sGoaY0R5wteuAsopqbE67jKC1BzH5syPYe7oQ1W3wkUHeuSIiIiIiIsk6qMwQ9pwTXvbthrSLd7ApLR/7ztxA6oU7SL1wB11sLPHyc06Y8KwGtu2VcqfbIthcERERERFRkykUCvi7dIa/S2f8UlSOL9OvICHjCq7du4+/JZ7Fuv3n8KKXGlP9tfDo0lHudJsVmysiIiIiIjKIrrbtsPBPbnhzmCt+OHEdm9LycepaMb7J/AXfZP4CHydbTPFzwkgPRyjNnrw3lNhcERERERGRQVmYm+K/n9HgJZ+uOH7lHjan5WP3yQJkXi5C5uUi/NUqB5MGdMMk326wt7bQm1tdI5B+6S4ybyvw1KW78HOxg6mJQqYrkYbNFRERERERNQuFQgEfJ1v4ONlicYg7vk6/ii/TL+NmyUOsTzqP2OQLGOnpiKl+TvBxssWe04VY9sMZ
FPz6AIApNp8/BseOFogO7Y0/eTjKfTmPxeaKiIiIiIianZ2VBd4McsWswB7Yc7oQm37Kx9H8Ivxw4jp+OHEdXW0s8cu9+3XmFf76AG/873F88rK30TdYT96DjkREREREZLTMTU3w575qfPO6P3b9z/OY8KwGKjNFvY0VANQu6L7shzNGv7w7mysiIiIiIpJFH3VHrBrbFx9N8n5knABQ8OsDZFy62zKJNRGbKyIiIiIiklV5RXWj4m6WPGjmTP4zbK6IiIiIiEhWdlYWjw+SECcXNldERERERCSrAd07wbGjBRpacF0BwLGjBQZ079SSaUnG5oqIiIiIiGRlaqJAdGhvAKjTYNV+jg7tbfS/74rNFRERERERye5PHo745GVvOHTUf/TPoaNFq1iGHeDvuSIiIiIiIiPxJw9HDO/tgLQLN7H3UDpGDPKFn4ud0d+xqsXmioiIiIiIjIapiQK+3TvhTo6Ab/dOraaxAvhYIBERERERkUGwuSIiIiIiIjIANldEREREREQGIGtzdfDgQYSGhkKtVkOhUGDHjh2PnZOSkgJvb2+oVCq4uLggPj6+TkxsbCy0Wi0sLCzg6+uLjIwMwydPRERERET0O7I2V2VlZfDy8kJsbGyj4i9duoSQkBAEBgYiOzsbERERmDFjBvbs2aOL2bJlCyIjIxEdHY3jx4/Dy8sLwcHBuHnzZnNdBhERERERkbyrBY4cORIjR45sdPynn36K7t27Y82aNQAAd3d3HD58GB988AGCg4MBAGvXrsWrr76KadOm6ebs2rULn3/+OaKiogx/EURERERERGhlS7GnpaUhKChIbyw4OBgREREAgIqKCmRmZuLtt9/W7TcxMUFQUBDS0tIaPO7Dhw/x8OFD3efi4mIAQGVlJSorKw14BdLVnl/uPKj1YM2QVKwZkoo1Q1KxZkgqY6oZKTm0quaqsLAQ9vb2emP29vYoLi7G/fv3UVRUhOrq6npjzp492+Bx33vvPSxbtqzO+N69e9GuXTvDJP8f2rdvn9wpUCvDmiGpWDMkFWuGpGLNkFTGUDPl5eWNjm1VzVVzefvttxEZGan7XFxcDI1GgxEjRsDa2lrGzH7rlPft24fhw4fD3Nxc1lyodWDNkFSsGZKKNUNSsWZIKmOqmdqn2hqjVTVXDg4OuHHjht7YjRs3YG1tDUtLS5iamsLU1LTeGAcHhwaPq1KpoFKp6oybm5vL/pdZy5hyodaBNUNSsWZIKtYMScWaIamMoWaknL9V/Z4rPz8/JCUl6Y3t27cPfn5+AAClUgkfHx+9mJqaGiQlJeliiIiIiIiImoOszVVpaSmys7ORnZ0N4Lel1rOzs3HlyhUAvz2uN2XKFF3866+/jry8PCxYsABnz57Fxx9/jK1bt2Lu3Lm6mMjISGzcuBGbNm1CTk4O3njjDZSVlelWDyQiIiIiImoOsj4WeOzYMQQGBuo+1773NHXqVMTHx6OgoEDXaAFA9+7dsWvXLsydOxfr169H165d8dlnn+mWYQeA8ePH49atW3j33XdRWFiIfv36ITExsc4iF0RERERERIYka3M1ZMgQCCEa3B8fH1/vnKysrEced/bs2Zg9e3aT86rNScrLa82lsrIS5eXlKC4ulv15U2odWDMkFWuGpGLNkFSsGZLKmGqmtid4VN9Sq1UtaNFSSkpKAAAajUbmTIiIiIiIyBiUlJSgY8eOj4xRiMa0YG1MTU0Nrl+/DisrKygUCllzqV0W/urVq7IvC0+tA2uGpGLNkFSsGZKKNUNSGVPNCCFQUlICtVoNE5NHL1nBO1f1MDExQdeuXeVOQ4+1tbXshUWtC2uGpGLNkFSsGZKKNUNSGUvNPO6OVa1WtRQ7ERERERGRsWJzRUREREREZABsroycSqVCdHQ0VCqV3KlQK8GaIalYMyQVa4akYs2QVK21ZrigBRERERERkQHwzhUREREREZEBsLkiIiIiIiIyADZXREREREREBsDmioiIiIiIyADY
XBmB2NhYaLVaWFhYwNfXFxkZGY+M/+abb+Dm5gYLCwt4enpi9+7dLZQpGQspNbNx40YMGjQItra2sLW1RVBQ0GNrjJ48Ur/P1EpISIBCocDo0aObN0EyOlJr5t69ewgPD4ejoyNUKhV69uzJf5/aGKk1s27dOvTq1QuWlpbQaDSYO3cuHjx40ELZkpwOHjyI0NBQqNVqKBQK7Nix47FzUlJS4O3tDZVKBRcXF8THxzd7nk3B5kpmW7ZsQWRkJKKjo3H8+HF4eXkhODgYN2/erDf+p59+wsSJEzF9+nRkZWVh9OjRGD16NE6dOtXCmZNcpNZMSkoKJk6ciOTkZKSlpUGj0WDEiBG4du1aC2dOcpFaM7Xy8/Mxb948DBo0qIUyJWMhtWYqKiowfPhw5Ofn49tvv0Vubi42btyILl26tHDmJBepNfPVV18hKioK0dHRyMnJQVxcHLZs2YJFixa1cOYkh7KyMnh5eSE2NrZR8ZcuXUJISAgCAwORnZ2NiIgIzJgxA3v27GnmTJtAkKwGDBggwsPDdZ+rq6uFWq0W7733Xr3x48aNEyEhIXpjvr6+4rXXXmvWPMl4SK2ZP6qqqhJWVlZi06ZNzZUiGZmm1ExVVZXw9/cXn332mZg6daoYNWpUC2RKxkJqzXzyySfC2dlZVFRUtFSKZGSk1kx4eLgYOnSo3lhkZKQYOHBgs+ZJxgeA+O677x4Zs2DBAtGnTx+9sfHjx4vg4OBmzKxpeOdKRhUVFcjMzERQUJBuzMTEBEFBQUhLS6t3Tlpaml48AAQHBzcYT0+WptTMH5WXl6OyshKdOnVqrjTJiDS1ZpYvXw47OztMnz69JdIkI9KUmvn+++/h5+eH8PBw2Nvbw8PDAytXrkR1dXVLpU0yakrN+Pv7IzMzU/foYF5eHnbv3o0XXnihRXKm1qU1/fxrJncCbdnt27dRXV0Ne3t7vXF7e3ucPXu23jmFhYX1xhcWFjZbnmQ8mlIzf7Rw4UKo1eo636ToydSUmjl8+DDi4uKQnZ3dAhmSsWlKzeTl5eHAgQOYPHkydu/ejQsXLmDWrFmorKxEdHR0S6RNMmpKzUyaNAm3b9/G888/DyEEqqqq8Prrr/OxQKpXQz//FhcX4/79+7C0tJQps7p454qoDVm1ahUSEhLw3XffwcLCQu50yAiVlJQgLCwMGzduROfOneVOh1qJmpoa2NnZYcOGDfDx8cH48eOxePFifPrpp3KnRkYqJSUFK1euxMcff4zjx49j+/bt2LVrF2JiYuROjeg/wjtXMurcuTNMTU1x48YNvfEbN27AwcGh3jkODg6S4unJ0pSaqbV69WqsWrUK+/fvR9++fZszTTIiUmvm4sWLyM/PR2hoqG6spqYGAGBmZobc3Fz06NGjeZMmWTXl+4yjoyPMzc1hamqqG3N3d0dhYSEqKiqgVCqbNWeSV1Nq5p133kFYWBhmzJgBAPD09ERZWRlmzpyJxYsXw8SE//9P/6+hn3+tra2N6q4VwDtXslIqlfDx8UFSUpJurKamBklJSfDz86t3jp+fn148AOzbt6/BeHqyNKVmAODvf/87YmJikJiYiGeeeaYlUiUjIbVm3NzccPLkSWRnZ+u2F198UbdCk0ajacn0SQZN+T4zcOBAXLhwQdeIA8C5c+fg6OjIxqoNaErNlJeX12mgaptzIUTzJUutUqv6+VfuFTXauoSEBKFSqUR8fLw4c+aMmDlzprCxsRGFhYVCCCHCwsJEVFSULj41NVWYmZmJ1atXi5ycHBEdHS3Mzc3FyZMn5boEamFSa2bVqlVCqVSKb7/9VhQUFOi2kpISuS6BWpjUmvkjrhbY9kitmStXrggrKysxe/ZskZubK3bu3Cns7OzEX//6V7kugVqY1JqJjo4WVlZW4uuvvxZ5eXli7969okePHmLcuHFyXQK1oJKSEpGVlSWysrIEALF27VqRlZUlLl++LIQQIioqSoSFheni8/LyRLt27cT8+fNFTk6O
iI2NFaampiIxMVGuS2gQmysj8OGHH4pu3boJpVIpBgwYII4cOaLbFxAQIKZOnaoXv3XrVtGzZ0+hVCpFnz59xK5du1o4Y5KblJpxcnISAOps0dHRLZ84yUbq95nfY3PVNkmtmZ9++kn4+voKlUolnJ2dxYoVK0RVVVULZ01yklIzlZWVYunSpaJHjx7CwsJCaDQaMWvWLFFUVNTyiVOLS05Orvdnk9oamTp1qggICKgzp1+/fkKpVApnZ2fxxRdftHjejaEQgvdeiYiIiIiI/lN854qIiIiIiMgA2FwREREREREZAJsrIiIiIiIiA2BzRUREREREZABsroiIiIiIiAyAzRUREREREZEBsLkiIiIiIiIyADZXREREREREBsDmioiIGpSSkgKFQoF79+41es7SpUvRr1+/Zsnnzp07sLOzQ35+frMcv9aQIUMQEREhaY5CocCOHTsMcv4dO3bAxcUFpqamkvNoCRUVFdBqtTh27JjcqRARGRU2V0REbVxaWhpMTU0REhIidyqPtWLFCowaNQparRYA4OjoiFWrVunFREVFQaFQICUlRW98yJAhCAsLa9R5tm/fjpiYGEOkrCOlUX3ttdfw0ksv4erVqwbPAwAKCwsxZ84cODs7Q6VSQaPRIDQ0FElJSboYrVYLhUKht3Xt2hUAoFQqMW/ePCxcuNDguRERtWZsroiI2ri4uDjMmTMHBw8exPXr1+VOp0Hl5eWIi4vD9OnTdWNDhgyp00QlJydDo9HojT948ABHjhzB0KFDG3WuTp06wcrKyhBpS1ZaWoqbN28iODgYarW6yXlUVFTUO56fnw8fHx8cOHAA77//Pk6ePInExEQEBgYiPDxcL3b58uUoKCjQbVlZWbp9kydPxuHDh3H69Okm5UdE9CRic0VE1IaVlpZiy5YteOONNxASEoL4+PhHxsfHx8PGxgY7duyAq6srLCwsEBwcjKtXr9aJ/de//gWtVouOHTtiwoQJKCkp0e1LTEzE888/DxsbGzz11FP485//jIsXLz7y3Lt374ZKpcJzzz2nGwsMDERqaiqqqqoAACUlJcjKysLChQv1mqu0tDQ8fPgQgYGBAIBTp05h5MiR6NChA+zt7REWFobbt2/r4v/4WGBBQQFCQkJgaWmJ7t2746uvvoJWq8W6dev0crx9+zbGjBmDdu3awdXVFd9//z2A3xqa2nPb2tpCoVDglVdeqXONKSkpumZq6NChenfgtm3bhj59+kClUkGr1WLNmjV6c7VaLWJiYjBlyhRYW1tj5syZ9X4dZ82aBYVCgYyMDIwdOxY9e/ZEnz59EBkZiSNHjujFWllZwcHBQbc9/fTTun22trYYOHAgEhIS6j0PEVFbxOaKiKgN27p1K9zc3NCrVy+8/PLL+PzzzyGEeOSc8vJyrFixAps3b0Zqairu3buHCRMm6MVcvHgRO3bswM6dO7Fz5078+9//1nt8r6ysDJGRkTh27BiSkpJgYmKCMWPGoKampsHzHjp0CD4+PnpjgYGBKC0txdGjR3UxPXv2xNixY5Geno4HDx4A+O1ullarhVarxb179zB06FD0798fx44dQ2JiIm7cuIFx48Y1eO4pU6bg+vXrSElJwbZt27BhwwbcvHmzTtyyZcswbtw4/Pzzz3jhhRcwefJk3L17FxqNBtu2bQMA5ObmoqCgAOvXr68z39/fH7m5uQB+a6YKCgrg7++PzMxMjBs3DhMmTMDJkyexdOlSvPPOO3Wa4dWrV8PLywtZWVl455136hz/7t27SExMRHh4ONq3b19nv42NTYNfg/oMGDAAhw4dkjSHiOiJJoiIqM3y9/cX69atE0IIUVlZKTp37iySk5N1+5OTkwUAUVRUJIQQ4osvvhAAxJEjR3QxOTk5AoBIT08XQggRHR0t2rVrJ4qLi3Ux8+fPF76+vg3mcevWLQFAnDx5ssGYUaNGib/85S91xrt06SJWrlypO8+sWbOEEEL07NlTHDhwQAghxKBB
g8S0adOEEELExMSIESNG6B3j6tWrAoDIzc0VQggREBAg3nzzTb3rO3r0qC7+/PnzAoD44IMPdGMAxJIlS3SfS0tLBQDx448/CiHqfi0bUlRUJADo/T1MmjRJDB8+XC9u/vz5onfv3rrPTk5OYvTo0Y88dnp6ugAgtm/f/si42uMplUrRvn173bZ+/Xq9mPXr1wutVvvYYxERtRW8c0VE1Ebl5uYiIyMDEydOBACYmZlh/PjxiIuLe+Q8MzMzPPvss7rPbm5usLGxQU5Ojm5Mq9XqvSvk6Oiod6fn/PnzmDhxIpydnWFtba1boOLKlSsNnvf+/fuwsLCoM/77965SUlIwZMgQAEBAQABSUlJw//59pKen6x7LO3HiBJKTk9GhQwfd5ubmBgD1PpqYm5sLMzMzeHt768ZcXFxga2tbJ7Zv3766P7dv3x7W1tb13uGSKicnBwMHDtQbGzhwIM6fP4/q6mrd2DPPPPPI44jH3JX8o/nz5yM7O1u3TZkyRW+/paUlysvLJR2TiOhJZiZ3AkREJI+4uDhUVVVBrVbrxoQQUKlU+Oijj9CxY8cmH9vc3Fzvs0Kh0HvkLzQ0FE5OTti4cSPUajVqamrg4eHR4CIMANC5c2cUFRXVGQ8MDMSbb76JO3fuICsrCwEBAQB+a67++c9/YvDgwaioqNAtZlFaWorQ0FD87W9/q3MsR0fHJl1vrcddd3Or71G/33N1dYVCocDZs2cbdbzOnTvDxcWlwf13797Vew+LiKit450rIqI2qKqqCps3b8aaNWv07kycOHECarUaX3/99SPn/v73G+Xm5uLevXtwd3dv1Lnv3LmD3NxcLFmyBMOGDYO7u3u9TdMf9e/fH2fOnKkzHhgYiLKyMqxduxaurq6ws7MDAAwePBgZGRn48ccf4erqii5dugAAvL29cfr0aWi1Wri4uOht9TUnvXr1QlVVld5KeRcuXGhUzr+nVCoBQO9OU2O5u7sjNTVVbyw1NRU9e/aEqalpo4/TqVMnBAcHIzY2FmVlZXX2S/l9ZsBvC4P0799f0hwioicZmysiojZo586dKCoqwvTp0+Hh4aG3jR079pGPBpqbm2POnDlIT09HZmYmXnnlFTz33HMYMGBAo85ta2uLp556Chs2bMCFCxdw4MABREZGPnZecHAwTp8+XaepcXZ2Rrdu3fDhhx/q7loBgEajgVqtxoYNG3SPBAJAeHg47t69i4kTJ+Lo0aO4ePEi9uzZg2nTptXb+Li5uSEoKAgzZ85ERkYGsrKyMHPmTFhaWkKhUDTqmgHAyckJCoUCO3fuxK1bt1BaWtrouW+99RaSkpIQExODc+fOYdOmTfjoo48wb968Rh+jVmxsLKqrqzFgwABs27YN58+fR05ODv7xj3/Az89P0rEOHTqEESNGSM6BiOhJxeaKiKgNiouLQ1BQUL2P/o0dOxbHjh3Dzz//XO/cdu3aYeHChZg0aRIGDhyIDh06YMuWLY0+t4mJCRISEpCZmQkPDw/MnTsX77///mPneXp6wtvbG1u3bq2zLzAwECUlJbr3rWoFBASgpKREr7lSq9VITU1FdXU1RowYAU9PT0RERMDGxgYmJvX/s7h582bY29tj8ODBGDNmDF599VVYWVnV+w5YQ7p06YJly5YhKioK9vb2mD17dqPn1l53QkICPDw88O6772L58uX1Luf+OM7Ozjh+/DgCAwPx1ltvwcPDA8OHD0dSUhI++eSTRh8nLS0Nv/76K1566SXJORARPakUQurbrURE1GbFx8cjIiJC8uNjhrJr1y7Mnz8fp06darARagm//PILNBoN9u/fj2HDhsmWh5zGjx8PLy8vLFq0SO5UiIiMBhe0ICKiViMkJATnz5/HtWvXoNFoWuy8Bw4cQGlpKTw9PVFQUIAFCxZAq9Vi8ODBLZaDMamoqICnpyfmzp0rdypEREaFzRUREbUqERERLX7OyspKLFq0CHl5ebCysoK/vz++/PLLOqsDthVK
pRJLliyROw0iIqPDxwKJiIiIiIgMgAtaEBERERERGQCbKyIiIiIiIgNgc0VERERERGQAbK6IiIiIiIgMgM0VERERERGRAbC5IiIiIiIiMgA2V0RERERERAbA5oqIiIiIiMgA/g9Fv5Dyt0eaoAAAAABJRU5ErkJggg==\n"
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ba3de877",
+ "outputId": "3837ad0a-0200-4f83-96ca-2bb17907776a"
+ },
+ "source": [
+ "from surprise.model_selection import GridSearchCV\n",
+ "from surprise import SVD\n",
+ "from surprise import Dataset, Reader\n",
+ "import pandas as pd\n",
+ "from surprise.model_selection import train_test_split\n",
+ "from surprise import accuracy\n",
+ "\n",
+ "# Assuming 'data' (Surprise Dataset) is available from earlier steps.\n",
+ "# If not, you might need to reload or recreate it:\n",
+ "# reader = Reader(rating_scale=(1, 5))\n",
+ "# data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)\n",
+ "\n",
+ "\n",
+ "# Define the parameter grid to search\n",
+ "param_grid = {\n",
+ " 'n_factors': [50, 100, 150], # Number of latent factors\n",
+ " 'lr_all': [0.002, 0.005, 0.01], # Learning rate for all parameters\n",
+ " 'reg_all': [0.02, 0.05, 0.1] # Regularization term for all parameters\n",
+ "}\n",
+ "\n",
+ "# Use GridSearchCV to find the best parameters\n",
+ "gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3) # Using 3-fold cross-validation\n",
+ "\n",
+ "# Fit the grid search to the data\n",
+ "gs.fit(data) # 'data' is the Surprise Dataset loaded earlier\n",
+ "\n",
+ "# Print the best RMSE score and the corresponding parameters\n",
+ "print(\"Best RMSE score:\", gs.best_score['rmse'])\n",
+ "print(\"Best parameters:\", gs.best_params['rmse'])\n",
+ "\n",
+ "# Train the SVD model with the best parameters found on the full dataset\n",
+ "svd_mf_tuned = SVD(**gs.best_params['rmse'])\n",
+ "svd_mf_tuned.fit(data.build_full_trainset())\n",
+ "\n",
+ "# Optional: Evaluate this tuned model on the original testset if needed for comparison\n",
+ "predictions_tuned = svd_mf_tuned.test(testset)\n",
+ "rmse_tuned = accuracy.rmse(predictions_tuned)\n",
+ "print(f\"Tuned Matrix Factorization (SVD) RMSE on testset: {rmse_tuned}\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Best RMSE score: 0.8902395116656673\n",
+ "Best parameters: {'n_factors': 150, 'lr_all': 0.01, 'reg_all': 0.1}\n",
+ "RMSE: 0.7304\n",
+ "Tuned Matrix Factorization (SVD) RMSE on testset: 0.7303515538617539\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "df51a06d",
+ "outputId": "9cce979b-7f98-49f0-b7f3-cbf912d2ba96"
+ },
+ "source": [
+ "# Assuming 'best_alpha' is available from the previous tuning step\n",
+ "# If not, retrieve it from the rmse_scores dictionary: best_alpha = min(rmse_scores, key=rmse_scores.get)\n",
+ "\n",
+ "print(f\"Evaluating the hybrid model with the best alpha = {best_alpha:.2f}\")\n",
+ "\n",
+ "predictions_hybrid_testset = []\n",
+ "actual_ratings_testset = []\n",
+ "\n",
+ "# Calculate hybrid predictions for each user-movie pair in the testset_df\n",
+ "for index, row in testset_df.iterrows():\n",
+ " user_id = int(row['userId'])\n",
+ " movie_id = int(row['movieId'])\n",
+ " actual_rating = row['rating']\n",
+ "\n",
+ " # Get the hybrid predicted rating\n",
+ " predicted_rating = hybrid_prediction(\n",
+ " user_id,\n",
+ " movie_id,\n",
+ " best_alpha, # Use the best alpha found\n",
+ " get_content_based_score,\n",
+ " knn_user_based, # Trained k-NN model\n",
+ " svd_mf, # Trained SVD model (or svd_mf_tuned if you want to use the tuned one)\n",
+ " merged_df_with_tfidf, # Original merged data for filtering seen movies\n",
+ " unique_movies_reduced, # Unique movies with SVD features\n",
+ " user_profiles_train # User profiles created from training data\n",
+ " )\n",
+ "\n",
+ " predictions_hybrid_testset.append(predicted_rating)\n",
+ " actual_ratings_testset.append(actual_rating)\n",
+ "\n",
+ "# Calculate RMSE for the hybrid model on the testset\n",
+ "rmse_hybrid_testset = np.sqrt(mean_squared_error(actual_ratings_testset, predictions_hybrid_testset))\n",
+ "\n",
+ "print(f\"Hybrid Model (alpha={best_alpha:.2f}) RMSE on testset: {rmse_hybrid_testset:.4f}\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Evaluating the hybrid model with the best alpha = 1.00\n",
+ "Hybrid Model (alpha=1.00) RMSE on testset: 1.0638\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 801
+ },
+ "id": "e9a4cb90",
+ "outputId": "71cbee11-d00b-448d-e194-591a16ca2503"
+ },
+ "source": [
+ "def get_hybrid_recommendations(user_id, alpha, cb_model, cf_model1, cf_model2, merged_df_with_tfidf, unique_movies_reduced, user_profiles, top_n=10):\n",
+ " \"\"\"\n",
+ " Generates hybrid recommendations for a given user by blending content-based and collaborative filtering predictions.\n",
+ "\n",
+ " Args:\n",
+ " user_id (int): The ID of the user for whom to generate recommendations.\n",
+ " alpha (float): The blending weight (0 <= alpha <= 1).\n",
+ " cb_model (function): Function to get content-based score (get_content_based_score).\n",
+ " cf_model1 (surprise.prediction_algorithms.algo_base.AlgoBase): Trained Surprise CF model 1 (e.g., k-NN).\n",
+ " cf_model2 (surprise.prediction_algorithms.algo_base.AlgoBase): Trained Surprise CF model 2 (e.g., SVD).\n",
+ " merged_df_with_tfidf (DataFrame): Original DataFrame with movie data and user ratings.\n",
+ " unique_movies_reduced (DataFrame): DataFrame with unique movies and their SVD features.\n",
+ " user_profiles (dict): Dictionary of user profile vectors.\n",
+ " top_n (int): The number of recommendations to generate.\n",
+ "\n",
+ " Returns:\n",
+ " DataFrame: A DataFrame containing the top N recommended movies based on hybrid predictions.\n",
+ " \"\"\"\n",
+ " # Handle cold-start users: If the user has no profile (based on training data), fall back to popularity baseline\n",
+ " if user_id not in user_profiles or np.all(user_profiles[user_id] == 0):\n",
+ " print(f\"User {user_id} is a cold-start user. Falling back to popularity baseline.\")\n",
+ " # Assuming 'popular_movies_unique' from the popularity baseline is available\n",
+ " if 'popular_movies_unique' in globals():\n",
+ " return popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(top_n)\n",
+ " else:\n",
+ " print(\"Popularity baseline not available. Cannot provide recommendations for cold-start user.\")\n",
+ " return pd.DataFrame()\n",
+ "\n",
+ " # Get a list of all unique movie IDs from unique_movies_reduced\n",
+ " all_movie_ids = unique_movies_reduced['movieId'].tolist()\n",
+ "\n",
+ " # Get the list of movies the user has already rated from the original merged DataFrame\n",
+ " rated_movie_ids = merged_df_with_tfidf[merged_df_with_tfidf['userId'] == user_id]['movieId'].tolist()\n",
+ "\n",
+ " # Get the list of movies the user has NOT rated\n",
+ " movies_to_predict = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]\n",
+ "\n",
+ " # Calculate hybrid predictions for the unrated movies\n",
+ " hybrid_predictions = []\n",
+ " for movie_id in movies_to_predict:\n",
+ " predicted_rating = hybrid_prediction(\n",
+ " user_id,\n",
+ " movie_id,\n",
+ " alpha,\n",
+ " get_content_based_score,\n",
+ " cf_model1,\n",
+ " cf_model2,\n",
+ " merged_df_with_tfidf,\n",
+ " unique_movies_reduced,\n",
+ " user_profiles\n",
+ " )\n",
+ " hybrid_predictions.append((movie_id, predicted_rating))\n",
+ "\n",
+ " # Sort predictions by predicted rating in descending order\n",
+ " hybrid_predictions.sort(key=lambda x: x[1], reverse=True)\n",
+ "\n",
+ " # Get the top N movie IDs\n",
+ " top_n_movie_ids = [movie[0] for movie in hybrid_predictions[:top_n]]\n",
+ "\n",
+ " # Retrieve the movie details for the top N recommendations from unique_movies_reduced\n",
+ " # Include 'movieId' here\n",
+ " recommended_movies = unique_movies_reduced[unique_movies_reduced['movieId'].isin(top_n_movie_ids)][['movieId', 'title', 'vote_average', 'vote_count', 'release_year']].copy()\n",
+ "\n",
+ " # Add the predicted rating to the recommendations DataFrame (optional, but useful for inspection)\n",
+ " # Create a dictionary from the top_n_movie_ids and their predicted ratings\n",
+ " predicted_ratings_dict = dict(hybrid_predictions[:top_n])\n",
+ " recommended_movies['predicted_rating'] = recommended_movies['movieId'].map(predicted_ratings_dict)\n",
+ "\n",
+ " # Sort by predicted rating to match the order of hybrid_predictions\n",
+ " recommended_movies = recommended_movies.sort_values('predicted_rating', ascending=False).reset_index(drop=True)\n",
+ "\n",
+ " return recommended_movies\n",
+ "\n",
+ "# Example usage: Get hybrid recommendations for a sample user (e.g., user ID 1)\n",
+ "sample_user_id_for_recommendations = 1\n",
+ "\n",
+ "# Assuming 'best_alpha', 'knn_user_based', 'svd_mf' (or 'svd_mf_tuned'),\n",
+ "# 'merged_df_with_tfidf', 'unique_movies_reduced', and 'user_profiles_train' are available\n",
+ "\n",
+ "hybrid_recommendations = get_hybrid_recommendations(\n",
+ " sample_user_id_for_recommendations,\n",
+ " best_alpha, # Use the best alpha found during tuning\n",
+ " get_content_based_score,\n",
+ " knn_user_based, # Or svd_mf_tuned if you prefer the tuned SVD\n",
+ " svd_mf, # You can use both CF models in the blend or choose the better one\n",
+ " merged_df_with_tfidf,\n",
+ " unique_movies_reduced,\n",
+ " user_profiles_train, # Use user profiles trained on the training data\n",
+ " top_n=10\n",
+ ")\n",
+ "\n",
+ "print(f\"\\nHybrid Recommendations for User {sample_user_id_for_recommendations} (using alpha={best_alpha:.2f}):\")\n",
+ "display(hybrid_recommendations)\n",
+ "\n",
+ "# Example usage for a cold-start user (assuming a user ID that doesn't exist in user_profiles_train)\n",
+ "cold_start_user_id_for_recommendations = 9999 # Replace with a user ID not in user_profiles_train\n",
+ "cold_start_hybrid_recommendations = get_hybrid_recommendations(\n",
+ " cold_start_user_id_for_recommendations,\n",
+ " best_alpha,\n",
+ " get_content_based_score,\n",
+ " knn_user_based,\n",
+ " svd_mf,\n",
+ " merged_df_with_tfidf,\n",
+ " unique_movies_reduced,\n",
+ " user_profiles_train,\n",
+ " top_n=10\n",
+ ")\n",
+ "\n",
+ "print(f\"\\nHybrid Recommendations for Cold-Start User {cold_start_user_id_for_recommendations}:\")\n",
+ "display(cold_start_hybrid_recommendations)"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "Hybrid Recommendations for User 1 (using alpha=1.00):\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " movieId title vote_average vote_count \\\n",
+ "0 1 Toy Story 7.7 5415.0 \n",
+ "1 2 Jumanji 6.9 2413.0 \n",
+ "2 3 Grumpier Old Men 6.5 92.0 \n",
+ "3 4 Waiting to Exhale 6.1 34.0 \n",
+ "4 5 Father of the Bride Part II 5.7 173.0 \n",
+ "5 6 Heat 7.7 1886.0 \n",
+ "6 7 Sabrina 6.2 141.0 \n",
+ "7 8 Tom and Huck 5.4 45.0 \n",
+ "8 9 Sudden Death 5.5 174.0 \n",
+ "9 10 GoldenEye 6.6 1194.0 \n",
+ "\n",
+ " release_year predicted_rating \n",
+ "0 1995 3.545905 \n",
+ "1 1995 3.545905 \n",
+ "2 1995 3.545905 \n",
+ "3 1995 3.545905 \n",
+ "4 1995 3.545905 \n",
+ "5 1995 3.545905 \n",
+ "6 1995 3.545905 \n",
+ "7 1995 3.545905 \n",
+ "8 1995 3.545905 \n",
+ "9 1995 3.545905 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " movieId \n",
+ " title \n",
+ " vote_average \n",
+ " vote_count \n",
+ " release_year \n",
+ " predicted_rating \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " Toy Story \n",
+ " 7.7 \n",
+ " 5415.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 2 \n",
+ " Jumanji \n",
+ " 6.9 \n",
+ " 2413.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 3 \n",
+ " Grumpier Old Men \n",
+ " 6.5 \n",
+ " 92.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 4 \n",
+ " Waiting to Exhale \n",
+ " 6.1 \n",
+ " 34.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 5 \n",
+ " Father of the Bride Part II \n",
+ " 5.7 \n",
+ " 173.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " 6 \n",
+ " Heat \n",
+ " 7.7 \n",
+ " 1886.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " 7 \n",
+ " Sabrina \n",
+ " 6.2 \n",
+ " 141.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " 8 \n",
+ " Tom and Huck \n",
+ " 5.4 \n",
+ " 45.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " 9 \n",
+ " Sudden Death \n",
+ " 5.5 \n",
+ " 174.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " 10 \n",
+ " GoldenEye \n",
+ " 6.6 \n",
+ " 1194.0 \n",
+ " 1995 \n",
+ " 3.545905 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "hybrid_recommendations",
+ "summary": "{\n \"name\": \"hybrid_recommendations\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"movieId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 1,\n \"max\": 10,\n \"num_unique_values\": 10,\n \"samples\": [\n 9,\n 2,\n 6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Sudden Death\",\n \"Jumanji\",\n \"Heat\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.82334008094401,\n \"min\": 5.4,\n \"max\": 7.7,\n \"num_unique_values\": 9,\n \"samples\": [\n 5.5,\n 6.9,\n 6.2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1726.898571942712,\n \"min\": 34.0,\n \"max\": 5415.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 174.0,\n 2413.0,\n 1886.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"release_year\",\n \"properties\": {\n \"dtype\": \"int32\",\n \"num_unique_values\": 1,\n \"samples\": [\n 1995\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.681111291435602e-16,\n \"min\": 3.5459045285801785,\n \"max\": 3.5459045285801785,\n \"num_unique_values\": 1,\n \"samples\": [\n 3.5459045285801785\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "User 9999 is a cold-start user. Falling back to popularity baseline.\n",
+ "\n",
+ "Hybrid Recommendations for Cold-Start User 9999:\n"
+ ]
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ " title vote_count vote_average weighted_rating\n",
+ "0 Toy Story 5415.0 7.7 7.475872\n",
+ "1 Jumanji 2413.0 6.9 6.733342\n",
+ "2 Heat 1886.0 7.7 7.210355\n",
+ "3 GoldenEye 1194.0 6.6 6.487877\n",
+ "4 Casino 1343.0 7.8 7.156339\n",
+ "5 Ace Ventura: When Nature Calls 1128.0 6.1 6.230593\n",
+ "6 Twelve Monkeys 2470.0 7.4 7.082636\n",
+ "7 Se7en 5915.0 8.1 7.829482\n",
+ "8 Pocahontas 1509.0 6.7 6.559447\n",
+ "9 The Usual Suspects 3334.0 8.1 7.671762"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " title \n",
+ " vote_count \n",
+ " vote_average \n",
+ " weighted_rating \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Toy Story \n",
+ " 5415.0 \n",
+ " 7.7 \n",
+ " 7.475872 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Jumanji \n",
+ " 2413.0 \n",
+ " 6.9 \n",
+ " 6.733342 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " Heat \n",
+ " 1886.0 \n",
+ " 7.7 \n",
+ " 7.210355 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " GoldenEye \n",
+ " 1194.0 \n",
+ " 6.6 \n",
+ " 6.487877 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " Casino \n",
+ " 1343.0 \n",
+ " 7.8 \n",
+ " 7.156339 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Ace Ventura: When Nature Calls \n",
+ " 1128.0 \n",
+ " 6.1 \n",
+ " 6.230593 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " Twelve Monkeys \n",
+ " 2470.0 \n",
+ " 7.4 \n",
+ " 7.082636 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Se7en \n",
+ " 5915.0 \n",
+ " 8.1 \n",
+ " 7.829482 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Pocahontas \n",
+ " 1509.0 \n",
+ " 6.7 \n",
+ " 6.559447 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " The Usual Suspects \n",
+ " 3334.0 \n",
+ " 8.1 \n",
+ " 7.671762 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "cold_start_hybrid_recommendations",
+ "summary": "{\n \"name\": \"cold_start_hybrid_recommendations\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Pocahontas\",\n \"Jumanji\",\n \"Ace Ventura: When Nature Calls\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1728.550327104575,\n \"min\": 1128.0,\n \"max\": 5915.0,\n \"num_unique_values\": 10,\n \"samples\": [\n 1509.0,\n 2413.0,\n 1128.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6919376979018976,\n \"min\": 6.1,\n \"max\": 8.1,\n \"num_unique_values\": 8,\n \"samples\": [\n 6.9,\n 7.4,\n 7.7\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"weighted_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.531410019489584,\n \"min\": 6.23059349726819,\n \"max\": 7.829481741760431,\n \"num_unique_values\": 10,\n \"samples\": [\n 6.55944698031551,\n 6.73334239370288,\n 6.23059349726819\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0a8ca7cb",
+ "outputId": "cb89b2ad-dedb-4ed6-cbe7-9969c352f284"
+ },
+ "source": [
+ "from collections import defaultdict\n",
+ "from surprise import Dataset, Reader, KNNBasic, SVD\n",
+ "from surprise.model_selection import train_test_split as surprise_train_test_split\n",
+ "from surprise import accuracy\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Sort ratings by timestamp for each user\n",
+ "ratings_df_sorted = ratings_df.sort_values(['userId', 'timestamp'])\n",
+ "\n",
+ "# Implement leave-one-out-by-timestamp split\n",
+ "trainset_time = defaultdict(list)\n",
+ "testset_time = []\n",
+ "\n",
+ "# Group by user and split\n",
+ "for userId, user_ratings in ratings_df_sorted.groupby('userId'):\n",
+ " # Ensure there is at least one rating for training\n",
+ " if len(user_ratings) > 1:\n",
+ " # The last rating is for testing\n",
+ " test_rating = user_ratings.iloc[-1]\n",
+ " testset_time.append((test_rating['userId'], test_rating['movieId'], test_rating['rating']))\n",
+ "\n",
+ " # The rest are for training\n",
+ " train_ratings = user_ratings.iloc[:-1]\n",
+ " for index, row in train_ratings.iterrows():\n",
+ " trainset_time[row['userId']].append((row['movieId'], row['rating']))\n",
+ " elif len(user_ratings) == 1:\n",
+ " # If a user only has one rating, use it for testing and exclude from training\n",
+ " test_rating = user_ratings.iloc[-1]\n",
+ " testset_time.append((test_rating['userId'], test_rating['movieId'], test_rating['rating']))\n",
+ "\n",
+ "\n",
+ "# Create a pandas DataFrame from the time-aware training data\n",
+ "train_ratings_list = []\n",
+ "for userId, ratings in trainset_time.items():\n",
+ " for movieId, rating in ratings:\n",
+ " train_ratings_list.append({'userId': userId, 'movieId': movieId, 'rating': rating})\n",
+ "train_ratings_df_time = pd.DataFrame(train_ratings_list)\n",
+ "\n",
+ "\n",
+ "# Load the time-aware training data into Surprise's format\n",
+ "reader = Reader(rating_scale=(1, 5))\n",
+ "data_train_time = Dataset.load_from_df(train_ratings_df_time[['userId', 'movieId', 'rating']], reader)\n",
+ "trainset_time_surprise = data_train_time.build_full_trainset()\n",
+ "\n",
+        "# Train k-NN and SVD models on the time-aware training set\n",
+        "# NOTE(review): sim_options below uses the key 'similarity', but Surprise expects 'name' —\n",
+        "# the cell output shows the default MSD similarity matrix being computed, so 'cosine' was silently ignored.\n",
+ "knn_user_based_time = KNNBasic(sim_options={'user_based': True, 'similarity': 'cosine'}, k=40)\n",
+ "knn_user_based_time.fit(trainset_time_surprise)\n",
+ "\n",
+ "svd_mf_time = SVD(random_state=42)\n",
+ "svd_mf_time.fit(trainset_time_surprise)\n",
+ "\n",
+ "\n",
+ "print(\"Time-aware data splitting complete.\")\n",
+ "print(f\"Number of training ratings: {len(train_ratings_df_time)}\")\n",
+ "print(f\"Number of testing ratings: {len(testset_time)}\")\n",
+ "\n",
+ "# The testset_time is already in the format needed for evaluation [(user, movie, rating)]\n",
+ "# We can now use these trained models to predict on this test set in the subsequent steps."
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Computing the msd similarity matrix...\n",
+ "Done computing similarity matrix.\n",
+ "Time-aware data splitting complete.\n",
+ "Number of training ratings: 99333\n",
+ "Number of testing ratings: 671\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "a6776d7a"
+ },
+ "source": [
+ "from collections import defaultdict\n",
+ "import numpy as np\n",
+ "\n",
+ "# Function to get top N recommendations for a user from predictions\n",
+ "def get_top_n(predictions, n=10):\n",
+ " \"\"\"Return the top-N recommendation for each user from a set of predictions.\n",
+ "\n",
+ " Args:\n",
+ " predictions (list of Prediction objects): The list of predictions, as returned by the test method of an algorithm.\n",
+        "        n (int): The number of recommendations to output.\n",
+ " Returns:\n",
+ " A dict where keys are user (raw) ids and values are lists of tuples:\n",
+ " [(raw item id, predicted rating), ...]\n",
+ " \"\"\"\n",
+ " # First map the predictions to each user.\n",
+ " top_n = defaultdict(list)\n",
+ " for uid, iid, true_r, est, _ in predictions:\n",
+ " top_n[uid].append((iid, est))\n",
+ "\n",
+ " # Then sort the predictions for each user and retrieve the top N.\n",
+ " for uid, user_ratings in top_n.items():\n",
+ " user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
+ " top_n[uid] = user_ratings[:n]\n",
+ "\n",
+ " return top_n\n",
+ "\n",
+ "# Function to calculate Precision@K and Recall@K\n",
+ "def precision_recall_at_k(predictions, k=10, threshold=3.5):\n",
+ " \"\"\"Return precision and recall at k metrics for each user.\n",
+ "\n",
+ " Args:\n",
+ " predictions (list of Prediction objects): The list of predictions, as returned by the test method of an algorithm.\n",
+        "        k (int): The number of recommendations to output.\n",
+ " threshold (float): The minimum rating considered relevant.\n",
+ " Returns:\n",
+ " A tuple of (precision, recall) at k.\n",
+ " \"\"\"\n",
+ " user_est_true = defaultdict(list)\n",
+ " for uid, _, true_r, est, _ in predictions:\n",
+ " user_est_true[uid].append((est, true_r))\n",
+ "\n",
+ " precisions = dict()\n",
+ " recalls = dict()\n",
+ " for uid, user_ratings in user_est_true.items():\n",
+ " # Sort user ratings by estimated value\n",
+ " user_ratings.sort(key=lambda x: x[0], reverse=True)\n",
+ "\n",
+ " # Number of relevant items\n",
+ " n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)\n",
+ "\n",
+ " # Number of recommended items in top k\n",
+ " n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])\n",
+ "\n",
+ " # Number of relevant and recommended items in top k\n",
+ " n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])\n",
+ "\n",
+ " # Precision@K: Proportion of recommended items that are relevant\n",
+ " precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1\n",
+ "\n",
+ " # Recall@K: Proportion of relevant items that are recommended\n",
+ " recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1\n",
+ "\n",
+ " # Return average precision and recall\n",
+ " return np.mean(list(precisions.values())), np.mean(list(recalls.values()))\n",
+ "\n",
+ "# Function to calculate NDCG@K (Normalized Discounted Cumulative Gain)\n",
+ "def ndcg_at_k(predictions, k=10, threshold=3.5):\n",
+ " \"\"\"Calculate NDCG@K for each user and return the average.\n",
+ "\n",
+ " Args:\n",
+ " predictions (list of Prediction objects): The list of predictions, as returned by the test method of an algorithm.\n",
+        "        k (int): The number of recommendations to output.\n",
+ " threshold (float): The minimum rating considered relevant.\n",
+ " Returns:\n",
+ " The average NDCG@K.\n",
+ " \"\"\"\n",
+ " user_ratings = defaultdict(list)\n",
+ " for uid, _, true_r, est, _ in predictions:\n",
+ " user_ratings[uid].append((est, true_r))\n",
+ "\n",
+ " ndcgs = []\n",
+ " for uid, ratings in user_ratings.items():\n",
+ " # Sort user ratings by estimated value to get the recommended list order\n",
+ " ratings.sort(key=lambda x: x[0], reverse=True)\n",
+ " recommended_list = ratings[:k]\n",
+ "\n",
+ " # Calculate DCG (Discounted Cumulative Gain)\n",
+ " dcg = 0\n",
+ " for i, (est, true_r) in enumerate(recommended_list):\n",
+ " # Use relevance score based on the threshold (1 if relevant, 0 otherwise)\n",
+ " relevance = 1 if true_r >= threshold else 0\n",
+        "            dcg += relevance / np.log2(i + 2)  # rank = i + 1 (0-based index), so the discount is log2(rank + 1)\n",
+ "\n",
+ " # Calculate IDCG (Ideal Discounted Cumulative Gain)\n",
+ " # Sort true ratings to get the ideal list order\n",
+ " true_relevant_ratings = sorted([true_r for (_, true_r) in ratings if true_r >= threshold], reverse=True)\n",
+ " idcg = 0\n",
+ " for i, true_r in enumerate(true_relevant_ratings[:k]):\n",
+ " # Relevance is 1 for ideal ranking of relevant items\n",
+ " idcg += 1 / np.log2(i + 2)\n",
+ "\n",
+ " # Calculate NDCG\n",
+ " ndcg = dcg / idcg if idcg != 0 else 0\n",
+ " ndcgs.append(ndcg)\n",
+ "\n",
+ " # Return average NDCG\n",
+ " return np.mean(ndcgs)\n",
+ "\n",
+ "\n",
+ "# Function to calculate Catalog Coverage\n",
+ "def catalog_coverage(top_n_recommendations, all_movie_ids):\n",
+ " \"\"\"Calculate the percentage of the catalog that is ever recommended.\n",
+ "\n",
+ " Args:\n",
+ " top_n_recommendations (dict): Dictionary of top N recommendations for each user.\n",
+ " all_movie_ids (list): List of all unique movie IDs in the catalog.\n",
+ " Returns:\n",
+ " float: Catalog coverage percentage.\n",
+ " \"\"\"\n",
+ " recommended_items = set()\n",
+ " for user, items in top_n_recommendations.items():\n",
+ " for (iid, _) in items:\n",
+ " recommended_items.add(iid)\n",
+ "\n",
+ " return len(recommended_items) / len(all_movie_ids) * 100 if len(all_movie_ids) > 0 else 0\n",
+ "\n",
+ "# Function to calculate Novelty\n",
+ "def novelty(top_n_recommendations, item_popularity):\n",
+ " \"\"\"Calculate the average novelty of recommended items.\n",
+ "\n",
+        "    Novelty is measured as the negative log2 of an item's share of total popularity, so rarer items score higher.\n",
+ " A higher score means more novel recommendations.\n",
+ "\n",
+ " Args:\n",
+ " top_n_recommendations (dict): Dictionary of top N recommendations for each user.\n",
+ " item_popularity (dict): Dictionary where keys are item IDs and values are their popularity (e.g., number of ratings).\n",
+ " Returns:\n",
+ " float: Average novelty score.\n",
+ " \"\"\"\n",
+ " novelty_sum = 0\n",
+ " count = 0\n",
+ " for user, items in top_n_recommendations.items():\n",
+ " for (iid, _) in items:\n",
+ " # Avoid log(0) by adding a small epsilon or handling items not in popularity\n",
+ " item_pop = item_popularity.get(iid, 0) + 1e-6 # Add small epsilon for safety\n",
+ " novelty_sum += -np.log2(item_pop / sum(item_popularity.values()))\n",
+ " count += 1\n",
+ "\n",
+ " return novelty_sum / count if count > 0 else 0\n",
+ "\n",
+ "\n",
+ "# Function to calculate Intra-List Diversity\n",
+ "def intra_list_diversity(top_n_recommendations, item_similarity_matrix):\n",
+ " \"\"\"Calculate the average intra-list diversity of recommended lists.\n",
+ "\n",
+ " Diversity is measured as the average pairwise dissimilarity between items in a recommendation list.\n",
+ " Dissimilarity can be 1 - similarity.\n",
+ "\n",
+ " Args:\n",
+ " top_n_recommendations (dict): Dictionary of top N recommendations for each user.\n",
+ " item_similarity_matrix (dict of dict): Dictionary representing item-item similarity.\n",
+ " item_similarity_matrix[i][j] is the similarity between item i and item j.\n",
+ " Returns:\n",
+ " float: Average intra-list diversity score.\n",
+ " \"\"\"\n",
+ " diversity_sum = 0\n",
+ " num_lists = 0\n",
+ " for user, items in top_n_recommendations.items():\n",
+ " item_ids = [iid for (iid, _) in items]\n",
+ " if len(item_ids) > 1:\n",
+ " list_diversity = 0\n",
+ " num_pairs = 0\n",
+ " for i in range(len(item_ids)):\n",
+ " for j in range(i + 1, len(item_ids)):\n",
+ " item1 = item_ids[i]\n",
+ " item2 = item_ids[j]\n",
+ " # Get similarity, handle cases where similarity is not defined (e.g., new items)\n",
+ " similarity = item_similarity_matrix.get(item1, {}).get(item2, 0) # Default to 0 similarity\n",
+ " dissimilarity = 1 - similarity\n",
+ " list_diversity += dissimilarity\n",
+ " num_pairs += 1\n",
+ " diversity_sum += list_diversity / num_pairs if num_pairs > 0 else 0\n",
+ " num_lists += 1\n",
+ "\n",
+ " return diversity_sum / num_lists if num_lists > 0 else 0\n",
+ "\n",
+ "# We'll need item popularity and item similarity for beyond-accuracy metrics later.\n",
+ "# Item popularity can be calculated from the training data (number of ratings for each item).\n",
+ "# Item similarity can be calculated based on item features (e.g., cosine similarity of SVD features)."
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "91cb5652",
+ "outputId": "03647ff8-291b-4abf-ceb9-b4791c17db1f"
+ },
+ "source": [
+ "from surprise.prediction_algorithms.predictions import Prediction\n",
+ "# Evaluate Popularity Baseline on the time-aware test set\n",
+ "# The popularity baseline ranks movies based on weighted rating.\n",
+ "# For evaluation on the testset_time, we need to predict a rating for each (user, movie) pair in testset_time\n",
+ "# using the popularity model. The popularity model doesn't predict per-user ratings, it provides a global ranking.\n",
+ "# To evaluate its ranking performance on the test set, we can rank the movies the user *hasn't* seen in the training set\n",
+ "# based on popularity and see where the test movie falls in that ranking.\n",
+ "# However, for simplicity in using the defined metrics (which expect Surprise-like predictions),\n",
+ "# we can assign the weighted rating as the \"predicted rating\" for the popularity model.\n",
+ "\n",
+ "popularity_predictions_time = []\n",
+ "# Assuming 'popular_movies_unique' is available from the popularity baseline step\n",
+ "# Create a dictionary for quick lookup of weighted ratings\n",
+ "popular_movies_dict = popular_movies_unique.set_index('movieId')['weighted_rating'].to_dict()\n",
+ "\n",
+ "for userId, movieId, actualRating in testset_time:\n",
+ " # Get the weighted rating for the movie from the popularity model\n",
+ " # If the movie is not in the popular movies list, assign a default low rating\n",
+ " predicted_rating = popular_movies_dict.get(movieId, 0.0) # Assign 0 if not in popular list\n",
+ " popularity_predictions_time.append(Prediction(uid=str(userId), iid=str(movieId), r_ui=actualRating, est=predicted_rating, details={}))\n",
+ "\n",
+ "\n",
+ "# Evaluate Content-Based Recommender on the time-aware test set\n",
+ "# We need to generate predictions for each (user, movie) pair in testset_time\n",
+ "# using the get_content_based_score function and the user profiles from training data.\n",
+ "\n",
+ "content_based_predictions_time = []\n",
+ "# Assuming 'user_profiles_train', 'merged_df_with_tfidf', 'unique_movies_reduced' are available\n",
+ "# and 'get_content_based_score' function is defined\n",
+ "\n",
+ "for userId, movieId, actualRating in testset_time:\n",
+ " # Get the content-based score\n",
+ " predicted_rating = get_content_based_score(\n",
+ " userId,\n",
+ " movieId,\n",
+ " merged_df_with_tfidf, # Use the original merged df for filtering seen movies\n",
+ " unique_movies_reduced, # Use unique movies with SVD features\n",
+ " user_profiles_train # Use user profiles trained on training data\n",
+ " )\n",
+ " content_based_predictions_time.append(Prediction(uid=str(userId), iid=str(movieId), r_ui=actualRating, est=predicted_rating, details={}))\n",
+ "\n",
+ "\n",
+ "# Evaluate Collaborative Filtering (k-NN) on the time-aware test set\n",
+ "# We use the trained knn_user_based_time model and the testset_time list.\n",
+ "\n",
+ "knn_predictions_time = knn_user_based_time.test(testset_time)\n",
+ "\n",
+ "\n",
+ "# Evaluate Collaborative Filtering (SVD) on the time-aware test set\n",
+ "# We use the trained svd_mf_time model and the testset_time list.\n",
+ "\n",
+ "svd_predictions_time = svd_mf_time.test(testset_time)\n",
+ "\n",
+ "\n",
+ "# Evaluate Hybrid Recommender on the time-aware test set\n",
+ "# We use the hybrid_prediction function with the best_alpha and the trained models.\n",
+ "\n",
+ "hybrid_predictions_time = []\n",
+ "# Assuming 'best_alpha', 'get_content_based_score', 'knn_user_based_time', 'svd_mf_time',\n",
+ "# 'merged_df_with_tfidf', 'unique_movies_reduced', 'user_profiles_train' are available\n",
+ "\n",
+ "for userId, movieId, actualRating in testset_time:\n",
+ " # Get the hybrid predicted rating\n",
+ " predicted_rating = hybrid_prediction(\n",
+ " userId,\n",
+ " movieId,\n",
+ " best_alpha, # Use the best alpha found during tuning\n",
+ " get_content_based_score,\n",
+ " knn_user_based_time, # Use k-NN trained on time-aware data\n",
+ " svd_mf_time, # Use SVD trained on time-aware data\n",
+ " merged_df_with_tfidf,\n",
+ " unique_movies_reduced,\n",
+ " user_profiles_train\n",
+ " )\n",
+ " hybrid_predictions_time.append(Prediction(uid=str(userId), iid=str(movieId), r_ui=actualRating, est=predicted_rating, details={}))\n",
+ "\n",
+ "print(\"Predictions generated for all models on the time-aware test set.\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Predictions generated for all models on the time-aware test set.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from collections import defaultdict\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 1. Helper: Convert predictions to ranked lists\n",
+ "# ----------------------------\n",
+ "def predictions_to_ranked_lists(predictions, k=10):\n",
+ " user_items = defaultdict(list)\n",
+ " for pred in predictions:\n",
+ " user_items[pred.uid].append((pred.iid, pred.est, pred.r_ui))\n",
+ " ranked_lists = {}\n",
+ " for uid, items in user_items.items():\n",
+ " ranked = sorted(items, key=lambda x: x[1], reverse=True)[:k]\n",
+ " ranked_lists[uid] = ranked\n",
+ " return ranked_lists\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 2. Ranking Metrics\n",
+ "# ----------------------------\n",
+ "def precision_at_k(ranked_lists, k=10):\n",
+ " precisions = []\n",
+ " for uid, items in ranked_lists.items():\n",
+ " relevant = [iid for iid, _, r_ui in items if r_ui >= 4]\n",
+ " precisions.append(len(relevant) / k)\n",
+ " return np.mean(precisions)\n",
+ "\n",
+ "def recall_at_k(ranked_lists, testset, k=10):\n",
+ " recalls = []\n",
+ " test_relevant = defaultdict(set)\n",
+ " for uid, iid, r_ui in testset:\n",
+ " if r_ui >= 4:\n",
+ " test_relevant[str(uid)].add(str(iid))\n",
+ " for uid, items in ranked_lists.items():\n",
+ " recommended = {iid for iid, _, _ in items}\n",
+ " relevant = test_relevant.get(uid, set())\n",
+ " if relevant:\n",
+ " recalls.append(len(recommended & relevant) / len(relevant))\n",
+ " return np.mean(recalls)\n",
+ "\n",
+ "def ndcg_at_k(ranked_lists, k=10):\n",
+ " ndcgs = []\n",
+ " for uid, items in ranked_lists.items():\n",
+ " dcg = 0.0\n",
+ " idcg = 0.0\n",
+ " rels = [1 if r_ui >= 4 else 0 for _, _, r_ui in items]\n",
+ " for i, rel in enumerate(rels):\n",
+ " dcg += (2**rel - 1) / np.log2(i + 2)\n",
+ " ideal_rels = sorted(rels, reverse=True)\n",
+ " for i, rel in enumerate(ideal_rels):\n",
+ " idcg += (2**rel - 1) / np.log2(i + 2)\n",
+ " if idcg > 0:\n",
+ " ndcgs.append(dcg / idcg)\n",
+ " return np.mean(ndcgs)\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 3. Beyond-Accuracy Metrics\n",
+ "# ----------------------------\n",
+ "def catalog_coverage(ranked_lists, all_items):\n",
+ " recommended_items = {iid for items in ranked_lists.values() for iid, _, _ in items}\n",
+ " return len(recommended_items) / len(all_items)\n",
+ "\n",
+ "def novelty(ranked_lists, item_popularity):\n",
+ " novelties = []\n",
+ " for items in ranked_lists.values():\n",
+ " for iid, _, _ in items:\n",
+ " novelties.append(-np.log2(item_popularity.get(iid, 1e-9)))\n",
+ " return np.mean(novelties)\n",
+ "\n",
+ "def intra_list_diversity(ranked_lists, item_features):\n",
+ " diversities = []\n",
+ " for items in ranked_lists.values():\n",
+ " iids = [iid for iid, _, _ in items]\n",
+ " features = [item_features[iid] for iid in iids if iid in item_features]\n",
+ " if len(features) > 1:\n",
+ " sims = cosine_similarity(features)\n",
+ " upper_tri = sims[np.triu_indices_from(sims, k=1)]\n",
+ " diversities.append(1 - np.mean(upper_tri))\n",
+ " return np.mean(diversities)\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 4. Evaluation Function\n",
+ "# ----------------------------\n",
+ "def evaluate_model(predictions, testset, all_items, item_popularity, item_features, k=10):\n",
+ " ranked_lists = predictions_to_ranked_lists(predictions, k=k)\n",
+ " return {\n",
+ " \"Precision@K\": precision_at_k(ranked_lists, k),\n",
+ " \"Recall@K\": recall_at_k(ranked_lists, testset, k),\n",
+ " \"NDCG@K\": ndcg_at_k(ranked_lists, k),\n",
+ " \"Coverage\": catalog_coverage(ranked_lists, all_items),\n",
+ " \"Novelty\": novelty(ranked_lists, item_popularity),\n",
+ " \"Diversity\": intra_list_diversity(ranked_lists, item_features)\n",
+ " }\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 5. Prepare inputs (adapted to your data)\n",
+ "# ----------------------------\n",
+ "all_items = set(merged_df_with_tfidf['movieId'].astype(str).unique())\n",
+ "item_popularity = merged_df_with_tfidf['movieId'].value_counts().to_dict()\n",
+ "item_popularity = {str(k): v for k, v in item_popularity.items()}\n",
+ "\n",
+ "# use ALL svd_* columns as item features\n",
+ "svd_cols = [col for col in unique_movies_reduced.columns if col.startswith(\"svd_\")]\n",
+ "item_features = {\n",
+ " str(row.movieId): row[svd_cols].values\n",
+ " for _, row in unique_movies_reduced.iterrows()\n",
+ "}\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 6. Run Evaluation for All Models\n",
+ "# ----------------------------\n",
+ "results = {}\n",
+ "for name, preds in [\n",
+ " (\"Popularity\", popularity_predictions_time),\n",
+ " (\"Content-Based\", content_based_predictions_time),\n",
+ " (\"kNN\", knn_predictions_time),\n",
+ " (\"SVD\", svd_predictions_time),\n",
+ " (\"Hybrid\", hybrid_predictions_time)\n",
+ "]:\n",
+ " results[name] = evaluate_model(preds, testset_time, all_items, item_popularity, item_features, k=10)\n",
+ "\n",
+ "results_df = pd.DataFrame(results).T\n",
+ "print(results_df)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6uziHYK-E9JN",
+ "outputId": "e1512e44-5ac0-4840-ded1-450f160af106"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " Precision@K Recall@K NDCG@K Coverage Novelty Diversity\n",
+ "Popularity 0.055738 1.0 1.0 0.064311 29.897353 NaN\n",
+ "Content-Based 0.055738 1.0 1.0 0.064311 29.897353 NaN\n",
+ "kNN 0.055738 NaN 1.0 0.064311 29.897353 NaN\n",
+ "SVD 0.055738 NaN 1.0 0.064311 29.897353 NaN\n",
+ "Hybrid 0.055738 1.0 1.0 0.064311 29.897353 NaN\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from collections import defaultdict\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 1. Leave-One-Out Ranking Evaluation\n",
+ "# ----------------------------\n",
+ "def evaluate_model(predictions, trainset, testset, all_items, item_popularity, item_features, k=10):\n",
+ " \"\"\"\n",
+ " predictions: list of Surprise Prediction objects\n",
+ " trainset: list of (uid, iid, rating) from training data\n",
+ " testset: list of (uid, iid, rating) from test data\n",
+ " all_items: set of all item IDs\n",
+ " item_popularity: dict of item -> count\n",
+ " item_features: dict of item -> feature vector\n",
+ " k: cutoff for metrics\n",
+ " \"\"\"\n",
+ " # Test items (only relevant ones, e.g. r_ui >= 4)\n",
+ " test_items = {str(uid): str(iid) for uid, iid, r_ui in testset if r_ui >= 4}\n",
+ "\n",
+ " # Build dictionary of predictions\n",
+ " pred_dict = defaultdict(dict)\n",
+ " for p in predictions:\n",
+ " pred_dict[p.uid][p.iid] = p.est\n",
+ "\n",
+ " precisions, recalls, ndcgs = [], [], []\n",
+ " recommended_items = set()\n",
+ " novelty_scores, diversity_scores = [], []\n",
+ "\n",
+ " for uid in test_items:\n",
+ " # Candidate items = all unseen in training\n",
+ " seen_items = {str(iid) for u, iid, _ in trainset if str(u) == uid}\n",
+ " candidates = all_items - seen_items\n",
+ "\n",
+ " # Score all candidates\n",
+ " scored = [(iid, pred_dict[uid].get(iid, 0.0)) for iid in candidates]\n",
+ " ranked = sorted(scored, key=lambda x: x[1], reverse=True)[:k]\n",
+ " rec_items = [iid for iid, _ in ranked]\n",
+ "\n",
+ " recommended_items.update(rec_items)\n",
+ "\n",
+ " # Precision, Recall, NDCG\n",
+ " test_item = test_items[uid]\n",
+ " if test_item in rec_items:\n",
+ " precisions.append(1.0 / k)\n",
+ " recalls.append(1.0) # one relevant item per user\n",
+ " ndcgs.append(1.0 / np.log2(rec_items.index(test_item) + 2))\n",
+ " else:\n",
+ " precisions.append(0.0)\n",
+ " recalls.append(0.0)\n",
+ " ndcgs.append(0.0)\n",
+ "\n",
+ " # Novelty\n",
+ " for iid in rec_items:\n",
+ " novelty_scores.append(-np.log2(item_popularity.get(iid, 1e-9)))\n",
+ "\n",
+ " # Diversity\n",
+ " feats = [item_features[iid] for iid in rec_items if iid in item_features]\n",
+ " if len(feats) > 1:\n",
+ " sims = cosine_similarity(feats)\n",
+ " upper_tri = sims[np.triu_indices_from(sims, k=1)]\n",
+ " diversity_scores.append(1 - np.mean(upper_tri))\n",
+ "\n",
+ " results = {\n",
+ " \"Precision@K\": np.mean(precisions),\n",
+ " \"Recall@K\": np.mean(recalls),\n",
+ " \"NDCG@K\": np.mean(ndcgs),\n",
+ " \"Coverage\": len(recommended_items) / len(all_items),\n",
+ " \"Novelty\": np.mean(novelty_scores),\n",
+ " \"Diversity\": np.mean(diversity_scores) if diversity_scores else np.nan\n",
+ " }\n",
+ " return results\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 2. Prepare inputs\n",
+ "# ----------------------------\n",
+ "all_items = set(merged_df_with_tfidf['movieId'].astype(str).unique())\n",
+ "item_popularity = merged_df_with_tfidf['movieId'].value_counts().to_dict()\n",
+ "item_popularity = {str(k): v for k, v in item_popularity.items()}\n",
+ "\n",
+ "# use all svd_* columns as features\n",
+ "svd_cols = [c for c in unique_movies_reduced.columns if c.startswith(\"svd_\")]\n",
+ "item_features = {\n",
+ " str(row.movieId): row[svd_cols].values\n",
+ " for _, row in unique_movies_reduced.iterrows()\n",
+ "}\n",
+ "\n",
+ "# Convert trainset_time into a list of (uid, iid, rating)\n",
+ "try:\n",
+ " # If it's a Surprise Trainset\n",
+ " trainset_list = [(str(trainset_time.to_raw_uid(u)),\n",
+ " str(trainset_time.to_raw_iid(i)),\n",
+ " r)\n",
+ " for (u, i, r) in trainset_time.all_ratings()]\n",
+ "except:\n",
+ " # If it's already list/df\n",
+ " trainset_list = [(str(row[0]), str(row[1]), row[2]) for row in trainset_time]\n",
+ "\n",
+ "# ----------------------------\n",
+ "# 3. Run Evaluation for All Models\n",
+ "# ----------------------------\n",
+ "results = {}\n",
+ "for name, preds in [\n",
+ " (\"Popularity\", popularity_predictions_time),\n",
+ " (\"Content-Based\", content_based_predictions_time),\n",
+ " (\"kNN\", knn_predictions_time),\n",
+ " (\"SVD\", svd_predictions_time),\n",
+ " (\"Hybrid\", hybrid_predictions_time)\n",
+ "]:\n",
+ " results[name] = evaluate_model(preds, trainset_list, testset_time, all_items, item_popularity, item_features, k=10)\n",
+ "\n",
+ "results_df = pd.DataFrame(results).T\n",
+ "print(results_df)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 406
+ },
+ "id": "TRcS7UYEF2Qx",
+ "outputId": "284817a5-cbf7-4a39-b412-d8a014991b5d"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "error",
+ "ename": "IndexError",
+ "evalue": "invalid index to scalar variable.",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipython-input-2271698075.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 94\u001b[0m r)\n\u001b[0;32m---> 95\u001b[0;31m for (u, i, r) in trainset_time.all_ratings()]\n\u001b[0m\u001b[1;32m 96\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mAttributeError\u001b[0m: 'collections.defaultdict' object has no attribute 'all_ratings'",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipython-input-2271698075.py\u001b[0m in \u001b[0;36m| \u001b[0;34m()\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0;32mexcept\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;31m# If it's already list/df\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0mtrainset_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtrainset_time\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0;31m# ----------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mIndexError\u001b[0m: invalid index to scalar variable."
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/report/images/budget_vs_revenue.png b/report/images/budget_vs_revenue.png
new file mode 100644
index 0000000000000000000000000000000000000000..d65b0b4db34bb82ae2d72afe4c044906f01534ec
Binary files /dev/null and b/report/images/budget_vs_revenue.png differ
diff --git a/report/images/budget_vs_revenue_filtered.png b/report/images/budget_vs_revenue_filtered.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa35ce86ee4bdaa4054fce4664bf257aa880938b
Binary files /dev/null and b/report/images/budget_vs_revenue_filtered.png differ
diff --git a/report/images/df_missing.png b/report/images/df_missing.png
new file mode 100644
index 0000000000000000000000000000000000000000..efa496ab0c727fc41904a880c653b7c1dae54027
--- /dev/null
+++ b/report/images/df_missing.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a62aab31beaa4b2578bb615811000c865e963e48f875fb32be1a907faad33b6
+size 473576
diff --git a/report/images/movies_by_decade_pie.png b/report/images/movies_by_decade_pie.png
new file mode 100644
index 0000000000000000000000000000000000000000..f6313e9102aba1d3d21fa11bc664bca53a8475ad
Binary files /dev/null and b/report/images/movies_by_decade_pie.png differ
diff --git a/report/images/popularity_distribution.png b/report/images/popularity_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..d46a52a8fc3124099f8e31e194f8f6b30530f482
Binary files /dev/null and b/report/images/popularity_distribution.png differ
diff --git a/report/images/popularity_distribution_lt10.png b/report/images/popularity_distribution_lt10.png
new file mode 100644
index 0000000000000000000000000000000000000000..8a1ce6fd3550139c555726addac772c60fce0f58
Binary files /dev/null and b/report/images/popularity_distribution_lt10.png differ
diff --git a/report/images/popularity_distribution_lt100.png b/report/images/popularity_distribution_lt100.png
new file mode 100644
index 0000000000000000000000000000000000000000..06ab8ebf3afcead8073b5b5e790aac59550625b2
Binary files /dev/null and b/report/images/popularity_distribution_lt100.png differ
diff --git a/report/images/rating_distribution.png b/report/images/rating_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..5761e64cc2033c93d7bccee884d6e20841f5c335
Binary files /dev/null and b/report/images/rating_distribution.png differ
diff --git a/report/images/release_year_distribution.png b/report/images/release_year_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c88df34d6716a9b0f94cd4cdd6744a27201e51e
Binary files /dev/null and b/report/images/release_year_distribution.png differ
diff --git a/report/images/runtime_distribution.png b/report/images/runtime_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..688de284ae798e98cb39a663f700f710614f830c
Binary files /dev/null and b/report/images/runtime_distribution.png differ
diff --git a/report/images/top_genres.png b/report/images/top_genres.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa3ccdf0190a7e213b4d1b55b84a1883e5fb78d1
Binary files /dev/null and b/report/images/top_genres.png differ
diff --git a/report/images/top_languages.png b/report/images/top_languages.png
new file mode 100644
index 0000000000000000000000000000000000000000..5902c8b35984bc2bc527a112bbb13f37b98a713e
Binary files /dev/null and b/report/images/top_languages.png differ
diff --git a/report/images/top_production_companies.png b/report/images/top_production_companies.png
new file mode 100644
index 0000000000000000000000000000000000000000..6a3e11836a6ba028d705ddc97f50193bb129f546
Binary files /dev/null and b/report/images/top_production_companies.png differ
diff --git a/report/images/top_production_countries.png b/report/images/top_production_countries.png
new file mode 100644
index 0000000000000000000000000000000000000000..25ea077cbaeb3b74e1dffb9103090a458ccbe153
Binary files /dev/null and b/report/images/top_production_countries.png differ
diff --git a/report/images/vote_average_distribution.png b/report/images/vote_average_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..8db2ccf3a949518473f47a77e54af0558e859bab
Binary files /dev/null and b/report/images/vote_average_distribution.png differ
diff --git a/report/images/vote_count_distribution.png b/report/images/vote_count_distribution.png
new file mode 100644
index 0000000000000000000000000000000000000000..e7c90d320c536ac19f9b0e66226d97b9985f807a
Binary files /dev/null and b/report/images/vote_count_distribution.png differ
diff --git a/report/images/vote_count_vs_average.png b/report/images/vote_count_vs_average.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a6c74d24a1d205f54c253f277d6e18f3d9cff08
Binary files /dev/null and b/report/images/vote_count_vs_average.png differ
diff --git a/report/images/wordcloud_overview.png b/report/images/wordcloud_overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..dec4d35786ca054b83421405962351eb001b1393
--- /dev/null
+++ b/report/images/wordcloud_overview.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9ee4349be9564ed7c5161b5ab442159fa67031589eb87b84742a5f71a8be378
+size 616788
diff --git a/report/images/wordcloud_title.png b/report/images/wordcloud_title.png
new file mode 100644
index 0000000000000000000000000000000000000000..05a6fc8cd05ad67ebbc7fc95ef9e4175c0f23500
--- /dev/null
+++ b/report/images/wordcloud_title.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96d11bd23abcb7b66230c0fb596c74e6a5a65d03398b939d7ee9c352d7435a4a
+size 626140
diff --git a/report/images/world_production_map.png b/report/images/world_production_map.png
new file mode 100644
index 0000000000000000000000000000000000000000..a611a12864e692e1ca58e871977234ca8fffff1d
Binary files /dev/null and b/report/images/world_production_map.png differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eeb14eb450938f4e0559109f06c1bcac8def61a8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,12 @@
+pandas
+numpy
+matplotlib
+seaborn
+missingno
+wordcloud
+plotly
+pycountry
+kaleido
+scikit-learn
+scikit-surprise
+gradio
\ No newline at end of file
diff --git a/src/__pycache__/collaborative.cpython-310.pyc b/src/__pycache__/collaborative.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..443635898e3d3e1ce3f675ff83e37d682facd2e6
Binary files /dev/null and b/src/__pycache__/collaborative.cpython-310.pyc differ
diff --git a/src/__pycache__/collaborative.cpython-313.pyc b/src/__pycache__/collaborative.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eeedbdc4da3da62a3fa6ab0d9a20c6dfc6671337
Binary files /dev/null and b/src/__pycache__/collaborative.cpython-313.pyc differ
diff --git a/src/__pycache__/content_based.cpython-310.pyc b/src/__pycache__/content_based.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7471b228f8dd8c7e27d5f198192f173ea721382
Binary files /dev/null and b/src/__pycache__/content_based.cpython-310.pyc differ
diff --git a/src/__pycache__/content_based.cpython-313.pyc b/src/__pycache__/content_based.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fccdc08736a696dfbfa6d335c0d917d346a7af87
Binary files /dev/null and b/src/__pycache__/content_based.cpython-313.pyc differ
diff --git a/src/__pycache__/eda.cpython-310.pyc b/src/__pycache__/eda.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86fba4ddf830b423ab445769981aff4eb5d2937e
Binary files /dev/null and b/src/__pycache__/eda.cpython-310.pyc differ
diff --git a/src/__pycache__/eda.cpython-313.pyc b/src/__pycache__/eda.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53ac89aec5220986567badc0e872ef4b6170bcbc
Binary files /dev/null and b/src/__pycache__/eda.cpython-313.pyc differ
diff --git a/src/__pycache__/evaluation.cpython-310.pyc b/src/__pycache__/evaluation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..511d92467e40a0087c5ef9fdeacf34feb5fcc1d2
Binary files /dev/null and b/src/__pycache__/evaluation.cpython-310.pyc differ
diff --git a/src/__pycache__/feature_engineering.cpython-310.pyc b/src/__pycache__/feature_engineering.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..638c63c53c71530ef13b617b1ac7e703f278008b
Binary files /dev/null and b/src/__pycache__/feature_engineering.cpython-310.pyc differ
diff --git a/src/__pycache__/feature_engineering.cpython-313.pyc b/src/__pycache__/feature_engineering.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00f6cceff7373702894b94d72f475b9f49cd5ff1
Binary files /dev/null and b/src/__pycache__/feature_engineering.cpython-313.pyc differ
diff --git a/src/__pycache__/hybrid.cpython-310.pyc b/src/__pycache__/hybrid.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8de2370eb1e12b236d9f02591718187ea688645f
Binary files /dev/null and b/src/__pycache__/hybrid.cpython-310.pyc differ
diff --git a/src/__pycache__/modeling.cpython-310.pyc b/src/__pycache__/modeling.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb60e14284c5f5e50525bc3cf780f6e8931b77b5
Binary files /dev/null and b/src/__pycache__/modeling.cpython-310.pyc differ
diff --git a/src/__pycache__/modeling.cpython-313.pyc b/src/__pycache__/modeling.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a5484915826f360ef3452fd349ca16892c3b938
Binary files /dev/null and b/src/__pycache__/modeling.cpython-313.pyc differ
diff --git a/src/__pycache__/preprocessing.cpython-310.pyc b/src/__pycache__/preprocessing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c22bba71aff7d75475d66514a941e3ccf914b77
Binary files /dev/null and b/src/__pycache__/preprocessing.cpython-310.pyc differ
diff --git a/src/__pycache__/preprocessing.cpython-313.pyc b/src/__pycache__/preprocessing.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8369c60e9bb34d93999c24935b2604319ded83e
Binary files /dev/null and b/src/__pycache__/preprocessing.cpython-313.pyc differ
diff --git a/src/eda.py b/src/eda.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ffd804382dc55d5e35db4c1cd13883901ec37c3
--- /dev/null
+++ b/src/eda.py
@@ -0,0 +1,327 @@
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+import pandas as pd
+from wordcloud import WordCloud, STOPWORDS
+import plotly.graph_objs as go
+import plotly.io as pio
+import pycountry
+
+
+class EDA:
+ def __init__(self, dfs):
+ self.df = dfs["df"]
+ self.credits_df = dfs["credits_df"]
+ self.keywords_df = dfs["keywords_df"]
+ self.links_df = dfs["links_df"]
+ self.ratings_df = dfs["ratings_df"]
+ self.merged_df = dfs["merged_df"]
+ self.img_path = "D:/Uni/Term 6/Machine Learning/HomeWork/6/report/images/"
+ os.makedirs(self.img_path, exist_ok=True)
+
+ def plot_rating_distribution(self):
+ plt.figure(figsize=(10, 6))
+ sns.histplot(self.merged_df['rating'], bins=10, kde=False)
+ plt.title('Distribution of Movie Ratings')
+ plt.xlabel('Rating')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "rating_distribution.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_release_year_distribution(self):
+ df = self.merged_df.copy()
+ df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
+ df['release_year'] = df['release_date'].dt.year
+ plt.figure(figsize=(12, 6))
+ sns.histplot(df['release_year'].dropna(), bins=50, kde=False)
+ plt.title('Distribution of Movie Release Years')
+ plt.xlabel('Release Year')
+ plt.ylabel('Number of Movies')
+ plt.savefig(os.path.join(self.img_path, "release_year_distribution.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_budget_vs_revenue(self):
+ plt.figure(figsize=(10, 6))
+ sns.scatterplot(data=self.merged_df, x='budget', y='revenue')
+ plt.title('Relationship between Movie Budget and Revenue')
+ plt.xlabel('Budget')
+ plt.ylabel('Revenue')
+ plt.savefig(os.path.join(self.img_path, "budget_vs_revenue.png"), bbox_inches='tight')
+ plt.close()
+
+ # Convert 'budget' and 'revenue' to numeric, coercing errors to NaN
+ self.merged_df['budget'] = pd.to_numeric(self.merged_df['budget'], errors='coerce')
+ self.merged_df['revenue'] = pd.to_numeric(self.merged_df['revenue'], errors='coerce')
+
+ # Fill NaN values in 'budget' and 'revenue' with 0, as 0 budget/revenue is a meaningful value
+ self.merged_df['budget'] = self.merged_df['budget'].fillna(0)
+ self.merged_df['revenue'] = self.merged_df['revenue'].fillna(0)
+
+ # Filter out movies with zero budget AND zero revenue
+ filtered_df = self.merged_df[(self.merged_df['budget'] > 0) | (self.merged_df['revenue'] > 0)].copy()
+ plt.figure(figsize=(10, 6))
+ sns.scatterplot(data=filtered_df, x='budget', y='revenue')
+ plt.title('Relationship between Movie Budget and Revenue (Filtered)')
+ plt.xlabel('Budget')
+ plt.ylabel('Revenue')
+ plt.savefig(os.path.join(self.img_path, "budget_vs_revenue_filtered.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_genre_counts(self):
+ genre_counts = {}
+ for genres_list in self.df['genres'].dropna():
+ if isinstance(genres_list, str):
+ genres = [genre.strip() for genre in genres_list.split(',')]
+ for genre in genres:
+ if genre:
+ genre_counts[genre] = genre_counts.get(genre, 0) + 1
+ top_n = 15
+ top_genres = pd.Series(genre_counts).sort_values(ascending=False).head(top_n)
+ plt.figure(figsize=(12, 8))
+ sns.barplot(x=top_genres.index, y=top_genres.values, palette='viridis')
+ plt.title('Top Movie Genres by Frequency')
+ plt.xlabel('Genre')
+ plt.ylabel('Frequency')
+ plt.xticks(rotation=45, ha='right')
+ plt.tight_layout()
+ plt.savefig(os.path.join(self.img_path, "top_genres.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_popularity_distribution(self):
+ plt.figure(figsize=(10, 6))
+ sns.histplot(self.merged_df['popularity'], bins=50, kde=False)
+ plt.title('Distribution of Movie Popularity')
+ plt.xlabel('Popularity')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "popularity_distribution.png"), bbox_inches='tight')
+ plt.close()
+
+ filtered_popularity_df = self.merged_df[self.merged_df['popularity'] < 100].copy()
+ plt.figure(figsize=(10, 6))
+ sns.histplot(filtered_popularity_df['popularity'], bins=50, kde=False)
+ plt.title('Distribution of Movie Popularity (Popularity < 100)')
+ plt.xlabel('Popularity')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "popularity_distribution_lt100.png"), bbox_inches='tight')
+ plt.close()
+
+ filtered_popularity_df_low = self.merged_df[self.merged_df['popularity'] < 10].copy()
+ plt.figure(figsize=(10, 6))
+ sns.histplot(filtered_popularity_df_low['popularity'], bins=50, kde=False)
+ plt.title('Distribution of Movie Popularity (Popularity < 10)')
+ plt.xlabel('Popularity')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "popularity_distribution_lt10.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_runtime_distribution(self):
+ plt.figure(figsize=(10, 6))
+ sns.histplot(self.merged_df['runtime'].dropna(), bins=50, kde=False)
+ plt.title('Distribution of Movie Runtimes')
+ plt.xlabel('Runtime (minutes)')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "runtime_distribution.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_production_company_counts(self):
+ company_counts = {}
+ for companies_list in self.merged_df['production_companies'].dropna():
+ if isinstance(companies_list, str):
+ companies = [company.strip() for company in companies_list.split(',')]
+ for company in companies:
+ if company and company != 'Unknown':
+ company_counts[company] = company_counts.get(company, 0) + 1
+ top_n_companies = 15
+ top_companies = pd.Series(company_counts).sort_values(ascending=False).head(top_n_companies)
+ plt.figure(figsize=(14, 8))
+ sns.barplot(x=top_companies.index, y=top_companies.values, palette='viridis')
+ plt.title(f'Top {top_n_companies} Production Companies')
+ plt.xlabel('Production Company')
+ plt.ylabel('Frequency')
+ plt.xticks(rotation=45, ha='right')
+ plt.tight_layout()
+ plt.savefig(os.path.join(self.img_path, "top_production_companies.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_production_country_counts(self):
+ country_counts = {}
+ for countries_list in self.merged_df['production_countries'].dropna():
+ if isinstance(countries_list, str):
+ countries = [country.strip() for country in countries_list.split(',')]
+ for country in countries:
+ if country and country != 'Unknown':
+ country_counts[country] = country_counts.get(country, 0) + 1
+ top_n_countries = 15
+ top_countries = pd.Series(country_counts).sort_values(ascending=False).head(top_n_countries)
+ plt.figure(figsize=(14, 8))
+ sns.barplot(x=top_countries.index, y=top_countries.values, palette='magma')
+ plt.title(f'Top {top_n_countries} Production Countries')
+ plt.xlabel('Production Country')
+ plt.ylabel('Frequency')
+ plt.xticks(rotation=45, ha='right')
+ plt.tight_layout()
+ plt.savefig(os.path.join(self.img_path, "top_production_countries.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_language_counts(self):
+ language_counts = {}
+ for languages_list in self.merged_df['spoken_languages'].dropna():
+ if isinstance(languages_list, str):
+ languages = [lang.strip() for lang in languages_list.split(',')]
+ for lang in languages:
+ if lang and lang != 'Unknown':
+ language_counts[lang] = language_counts.get(lang, 0) + 1
+ language_counts_series = pd.Series(language_counts).sort_values(ascending=False)
+ top_languages = language_counts_series.head(15)
+ plt.figure(figsize=(12, 8))
+ sns.barplot(x=top_languages.index, y=top_languages.values, palette='viridis')
+ plt.title('Top 15 Spoken Languages')
+ plt.xlabel('Language')
+ plt.ylabel('Frequency')
+ plt.xticks(rotation=45, ha='right')
+ plt.tight_layout()
+ plt.savefig(os.path.join(self.img_path, "top_languages.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_vote_count_distribution(self):
+ plt.figure(figsize=(10, 6))
+ sns.histplot(self.merged_df['vote_count'], bins=50, kde=False)
+ plt.title('Distribution of Movie Vote Counts')
+ plt.xlabel('Vote Count')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "vote_count_distribution.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_vote_average_distribution(self):
+ plt.figure(figsize=(10, 6))
+ sns.histplot(self.merged_df['vote_average'], bins=20, kde=False)
+ plt.title('Distribution of Movie Vote Averages')
+ plt.xlabel('Vote Average')
+ plt.ylabel('Frequency')
+ plt.savefig(os.path.join(self.img_path, "vote_average_distribution.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_vote_count_vs_average(self):
+ plt.figure(figsize=(10, 6))
+ sns.scatterplot(data=self.merged_df, x='vote_count', y='vote_average')
+ plt.title('Relationship between Vote Count and Vote Average')
+ plt.xlabel('Vote Count')
+ plt.ylabel('Vote Average')
+ plt.savefig(os.path.join(self.img_path, "vote_count_vs_average.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_wordclouds(self):
+ copy = self.df.copy()
+ copy['title'] = copy['title'].astype('str')
+ copy['overview'] = copy['overview'].astype('str')
+ title_corpus = ' '.join(copy['title'])
+ overview_corpus = ' '.join(copy['overview'])
+
+ title_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', height=2000, width=4000).generate(title_corpus)
+ plt.figure(figsize=(16,8))
+ plt.imshow(title_wordcloud)
+ plt.axis('off')
+ plt.tight_layout()
+ plt.savefig(os.path.join(self.img_path, "wordcloud_title.png"), bbox_inches='tight')
+ plt.close()
+
+ overview_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', height=2000, width=4000).generate(overview_corpus)
+ plt.figure(figsize=(16,8))
+ plt.imshow(overview_wordcloud)
+ plt.axis('off')
+ plt.tight_layout()
+ plt.savefig(os.path.join(self.img_path, "wordcloud_overview.png"), bbox_inches='tight')
+ plt.close()
+
+ def plot_world_production_map(self):
+
+ copy = self.df.copy()
+ country_counts = copy['production_countries'].value_counts().reset_index()
+ country_counts.columns = ['country', 'num_movies']
+ country_counts = country_counts[country_counts['country'] != "United States of America"]
+
+ def get_iso3(country_name):
+ try:
+ return pycountry.countries.lookup(country_name).alpha_3
+ except:
+ return None
+
+ country_counts['iso_alpha'] = country_counts['country'].apply(get_iso3)
+ country_counts = country_counts.dropna(subset=['iso_alpha'])
+
+ data = [go.Choropleth(
+ locations = country_counts['iso_alpha'],
+ z = country_counts['num_movies'],
+ text = country_counts['country'],
+ colorscale = [[0,'rgb(255,255,255)'], [1,'rgb(255,0,0)']],
+ autocolorscale = False,
+ reversescale = False,
+ marker = dict(line = dict(color='rgb(180,180,180)', width=0.5)),
+ colorbar = dict(title='Production Countries')
+ )]
+
+ layout = dict(
+ title = 'Production Countries for the MovieLens Movies (Apart from US)',
+ geo = dict(
+ showframe = False,
+ showcoastlines = False,
+ projection = dict(type = 'mercator')
+ )
+ )
+
+ fig = go.Figure(data=data, layout=layout)
+ # Save as static image (requires kaleido)
+ try:
+ # Use plotly.io.write_image for better compatibility
+ pio.write_image(fig, os.path.join(self.img_path, "world_production_map.png"))
+ except Exception:
+ # As a fallback, save as HTML if static image export fails
+ try:
+ fig.write_html(os.path.join(self.img_path, "world_production_map.html"))
+ except Exception:
+ pass
+
+ def plot_decade_pie(self):
+ import plotly.express as px
+ copy = self.df.copy()
+ copy['release_date'] = pd.to_datetime(copy['release_date'], errors='coerce')
+ copy['decade'] = (copy['release_date'].dt.year // 10) * 10
+ decade_counts = copy['decade'].value_counts().sort_index().reset_index()
+ decade_counts.columns = ['decade', 'num_movies']
+ decade_counts['decade'] = decade_counts['decade'].astype(int).astype(str) + "s"
+ fig = px.pie(
+ decade_counts,
+ names='decade',
+ values='num_movies',
+ title="Movies Distribution by Decade (Release Date)",
+ color_discrete_sequence=px.colors.qualitative.Set3
+ )
+ # Save as static image (requires kaleido)
+ try:
+ # Use plotly.io.write_image for better compatibility
+ pio.write_image(fig, os.path.join(self.img_path, "movies_by_decade_pie.png"))
+ except Exception:
+ # As a fallback, save as HTML if static image export fails
+ try:
+ fig.write_html(os.path.join(self.img_path, "movies_by_decade_pie.html"))
+ except Exception:
+ pass
+
+
+
+ def run_all(self):
+ self.plot_rating_distribution()
+ self.plot_release_year_distribution()
+ self.plot_budget_vs_revenue()
+ self.plot_genre_counts()
+ self.plot_popularity_distribution()
+ self.plot_runtime_distribution()
+ self.plot_production_company_counts()
+ self.plot_production_country_counts()
+ self.plot_language_counts()
+ self.plot_vote_count_distribution()
+ self.plot_vote_average_distribution()
+ self.plot_vote_count_vs_average()
+ self.plot_wordclouds()
+ self.plot_world_production_map()
+ self.plot_decade_pie()
diff --git a/src/evaluation.py b/src/evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2078f80b4fe0ee54b8e823bd344169709361ddc
--- /dev/null
+++ b/src/evaluation.py
@@ -0,0 +1,121 @@
+import pandas as pd
+import numpy as np
+from collections import defaultdict
+from sklearn.metrics.pairwise import cosine_similarity
+
def leave_one_out_by_timestamp(ratings_df):
    """Split ratings into train/test by holding out each user's latest rating.

    Rows are ordered chronologically per user; every user's most recent
    rating goes to the test set and the rest to the train set.  A user with
    a single rating therefore appears in the test set only.
    """
    ordered = ratings_df.sort_values(['userId', 'timestamp'])
    train_idx = []
    test_idx = []
    for _, group in ordered.groupby('userId'):
        # The last row per user (latest timestamp) is the held-out rating.
        test_idx.append(group.index[-1])
        train_idx.extend(group.index[:-1])
    return ordered.loc[train_idx], ordered.loc[test_idx]
+
def precision_at_k(ranked_lists, k=10):
    """Mean Precision@k over users.

    An item is relevant when its true rating is >= 4.  ``ranked_lists`` maps
    user id -> list of (item_id, est_score, true_rating) tuples sorted by
    estimated score; the denominator is always k.
    """
    per_user = [
        sum(1 for _, _, true_r in items[:k] if true_r >= 4) / k
        for items in ranked_lists.values()
    ]
    return np.mean(per_user)
+
def recall_at_k(ranked_lists, test_truth, k=10):
    """Mean Recall@k over users with at least one relevant held-out item.

    Relevance means a true rating >= 4.  ``test_truth`` may be a DataFrame
    with userId/movieId/rating columns or any iterable of
    (userId, movieId, rating, ...) rows.
    """
    truth = defaultdict(set)
    if isinstance(test_truth, pd.DataFrame):
        rows = test_truth[['userId', 'movieId', 'rating']].itertuples(index=False)
    else:
        # Each row may carry extra fields; only the first three matter.
        rows = (row[:3] for row in test_truth)
    for uid, iid, rating in rows:
        if rating >= 4:
            truth[uid].add(iid)

    recalls = []
    for uid, items in ranked_lists.items():
        relevant = truth.get(uid, set())
        if not relevant:
            continue  # users with no relevant truth items are skipped
        hits = {iid for iid, _, _ in items[:k]} & relevant
        recalls.append(len(hits) / len(relevant))
    return np.mean(recalls)
+
def ndcg_at_k(ranked_lists, k=10):
    """Mean NDCG@k with binary gains (true rating >= 4 counts as relevant).

    The ideal DCG re-sorts the same top-k relevance labels, so a user scores
    1.0 exactly when every relevant item precedes every irrelevant one.
    Users whose top-k list has no relevant item (IDCG == 0) are skipped.
    """
    def _dcg(labels):
        return sum((2 ** rel - 1) / np.log2(pos + 2) for pos, rel in enumerate(labels))

    ndcgs = []
    for items in ranked_lists.values():
        gains = [1 if true_r >= 4 else 0 for _, _, true_r in items[:k]]
        ideal = _dcg(sorted(gains, reverse=True))
        if ideal > 0:
            ndcgs.append(_dcg(gains) / ideal)
    return np.mean(ndcgs)
+
def catalog_coverage(ranked_lists, all_items):
    """Fraction of the catalogue recommended to at least one user."""
    seen = set()
    for items in ranked_lists.values():
        seen.update(iid for iid, _, _ in items)
    return len(seen) / len(all_items)
+
def novelty(ranked_lists, item_popularity):
    """Mean self-information (-log2 popularity share) of recommended items.

    Rarer items score higher.  Items missing from ``item_popularity`` fall
    back to a count of 1, and a small epsilon guards the logarithm.
    """
    total = sum(item_popularity.values())
    scores = [
        -np.log2(item_popularity.get(iid, 1) / total + 1e-9)
        for items in ranked_lists.values()
        for iid, _, _ in items
    ]
    return np.mean(scores)
+
def intra_list_diversity(ranked_lists, item_features):
    """Mean (1 - pairwise cosine similarity) within each user's list.

    Lists with fewer than two items that have known feature vectors are
    skipped, since pairwise similarity is undefined for them.
    """
    per_list = []
    for items in ranked_lists.values():
        vectors = [item_features[iid] for iid, _, _ in items if iid in item_features]
        if len(vectors) > 1:
            sim_matrix = cosine_similarity(vectors)
            # Upper triangle (excluding the diagonal) holds each pair once.
            pairwise = sim_matrix[np.triu_indices_from(sim_matrix, k=1)]
            per_list.append(1 - np.mean(pairwise))
    return np.mean(per_list)
+
def predictions_to_ranked_lists(predictions, k=20):
    """Group (uid, iid, true_r, est, details) predictions per user, keeping
    the top-k items by estimated rating as (iid, est, true_r) tuples."""
    by_user = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        by_user[uid].append((iid, est, true_r))
    return {
        uid: sorted(items, key=lambda t: t[1], reverse=True)[:k]
        for uid, items in by_user.items()
    }
+
def evaluate_all(predictions, testset, all_items, item_popularity, item_features, k_list=None):
    """Compute ranking, coverage, novelty and diversity metrics in one pass.

    Parameters
    ----------
    predictions : iterable of (uid, iid, true_r, est, details) tuples.
    testset : DataFrame or iterable of (uid, iid, rating, ...) ground truth.
    all_items : collection of every item id in the catalogue.
    item_popularity : dict mapping item id -> interaction count.
    item_features : dict mapping item id -> feature vector.
    k_list : iterable of cut-offs for the @k metrics; defaults to (10, 20).

    Returns
    -------
    dict mapping metric name (e.g. ``'Precision@10'``) -> score.
    """
    # Use a None sentinel instead of a mutable default argument ([10, 20]
    # shared across calls) — callers that passed k_list explicitly see no change.
    if k_list is None:
        k_list = (10, 20)
    # Rank once at the largest cut-off; smaller k's slice from the same lists.
    ranked_lists = predictions_to_ranked_lists(predictions, k=max(k_list))
    results = {}
    for k in k_list:
        results[f'Precision@{k}'] = precision_at_k(ranked_lists, k)
        results[f'Recall@{k}'] = recall_at_k(ranked_lists, testset, k)
        results[f'NDCG@{k}'] = ndcg_at_k(ranked_lists, k)
    results['Coverage'] = catalog_coverage(ranked_lists, all_items)
    results['Novelty'] = novelty(ranked_lists, item_popularity)
    results['Diversity'] = intra_list_diversity(ranked_lists, item_features)
    return results
+
def summarize_results(results_dict):
    """Tabulate {model_name: {metric: score}} as a DataFrame with one row
    per model and one column per metric."""
    table = pd.DataFrame(results_dict)
    return table.T
+
def bootstrap_metric(metric_func, predictions, testset, all_items, item_popularity, item_features, n_bootstrap=100, k=10):
    """Bootstrap a 95% confidence interval for a ranking metric.

    Users are resampled with replacement; each resample keeps the predictions
    of the sampled users and re-scores ``metric_func(ranked_lists, k)``.

    NOTE(review): a user drawn multiple times is still counted once, because
    predictions are filtered by membership rather than duplicated — the
    resulting interval is approximate.  ``testset``, ``all_items``,
    ``item_popularity`` and ``item_features`` are accepted for signature
    symmetry with ``evaluate_all`` but are not used by metrics of the
    (ranked_lists, k) form.

    Returns
    -------
    ndarray with the [2.5, 97.5] percentiles of the bootstrap scores.
    """
    uids = list({p[0] for p in predictions})
    scores = []
    for _ in range(n_bootstrap):
        sampled = np.random.choice(uids, size=len(uids), replace=True)
        # Membership lookup in a set is O(1); the original tested against the
        # ndarray itself, a linear scan per prediction.
        sampled_set = set(sampled.tolist())
        sampled_preds = [p for p in predictions if p[0] in sampled_set]
        ranked_lists = predictions_to_ranked_lists(sampled_preds, k)
        scores.append(metric_func(ranked_lists, k))
    return np.percentile(scores, [2.5, 97.5])
+
diff --git a/src/feature_engineering.py b/src/feature_engineering.py
new file mode 100644
index 0000000000000000000000000000000000000000..83ae4d9dbf8c78d7a73c1239f1391e12bc66d914
--- /dev/null
+++ b/src/feature_engineering.py
@@ -0,0 +1,224 @@
+import pandas as pd
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+import os
+from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
+from sklearn.decomposition import TruncatedSVD
+
class FeatureEngineering:
    """Feature-engineering pipeline for the merged MovieLens movie frame.

    Starting from the preprocessed merged movie/ratings frame, the pipeline
    drops identifier columns, removes outliers, derives numeric, categorical
    and text features (one-hot genres, log transforms, counts, target/mean
    encodings, TF-IDF of overviews) and finally compresses the wide feature
    space per movie with truncated SVD.  ``run_all()`` executes every step in
    order and returns the resulting frames.  Steps mutate ``self.merged_df``
    in place and depend on earlier steps having run, so call order matters.
    """

    def __init__(self, dfs, interim_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"):
        # dfs: dict of frames from the preprocessing stage; only the merged
        # movie frame and the raw ratings frame are used here.
        self.merged_df = dfs["merged_df"]
        self.ratings_df = dfs["ratings_df"]
        # NOTE(review): hard-coded absolute Windows default path — pass
        # interim_path explicitly on other machines.
        self.interim_path = interim_path
        os.makedirs(self.interim_path, exist_ok=True)

    def ordering(self):
        """Drop external-id/duplicate columns and fix a canonical column order."""
        self.merged_df = self.merged_df.drop(columns=['id', 'tmdbId', 'imdbId', 'imdb_id', 'original_title', 'video'])
        desired_column_order = [
            'movieId',
            'title',
            'release_date',
            'runtime',
            'status',
            'adult',
            'budget',
            'revenue',
            'popularity',
            'vote_average',
            'vote_count',
            'overview',
            'keywords',
            'genres',
            'cast',
            'crew',
            'production_companies',
            'production_countries',
            'original_language',
            'userId',
            'rating',
        ]

        self.merged_df = self.merged_df.reindex(columns=desired_column_order)

    def outliers(self):
        """Coerce budget/revenue to numeric and trim invalid rows and extremes."""
        self.merged_df['budget'] = pd.to_numeric(self.merged_df['budget'], errors='coerce').fillna(0)
        self.merged_df['revenue'] = pd.to_numeric(self.merged_df['revenue'], errors='coerce').fillna(0)
        self.merged_df = self.merged_df[self.merged_df['runtime'] > 0]
        self.merged_df = self.merged_df[self.merged_df['budget'] >= 0]
        self.merged_df = self.merged_df[self.merged_df['revenue'] >= 0]

        # Drop the top 0.5% of each monetary column to curb extreme outliers.
        for col in ['budget', 'revenue']:
            upper = self.merged_df[col].quantile(0.995)
            self.merged_df = self.merged_df[self.merged_df[col] <= upper]

    def add_budget_to_revenue_ratio(self):
        """Add budget/revenue ratio; 0 when revenue is missing or zero."""
        self.merged_df['budget'] = pd.to_numeric(self.merged_df['budget'], errors='coerce').fillna(0)
        self.merged_df['revenue'] = pd.to_numeric(self.merged_df['revenue'], errors='coerce').fillna(0)
        self.merged_df['budget_to_revenue_ratio'] = self.merged_df.apply(
            lambda row: row['budget'] / row['revenue'] if row['revenue'] > 0 else 0, axis=1
        )

    def add_top_genre_onehot(self, top_n=5):
        """One-hot encode the top_n most frequent genres as genre_* columns."""
        genre_dummies = self.merged_df['genres'].str.get_dummies(sep=', ')
        top_genres = genre_dummies.sum().sort_values(ascending=False).head(top_n).index
        for genre in top_genres:
            self.merged_df[f"genre_{genre}"] = genre_dummies[genre]

    def add_log_features(self):
        """Add log1p-transformed versions of the heavily skewed numeric columns."""
        for col in ['budget', 'revenue', 'popularity', 'vote_count']:
            self.merged_df[f'log_{col}'] = np.log1p(self.merged_df[col])

    def add_interaction_features(self):
        """Add pairwise products between budget and popularity/vote_count."""
        self.merged_df['budget_x_popularity'] = self.merged_df['budget'] * self.merged_df['popularity']
        self.merged_df['budget_x_vote_count'] = self.merged_df['budget'] * self.merged_df['vote_count']

    def add_count_features(self):
        """Count comma-separated entries in the list-like text columns."""
        self.merged_df['num_genres'] = self.merged_df['genres'].fillna('').apply(lambda x: len([g for g in x.split(',') if g.strip()]))
        self.merged_df['num_keywords'] = self.merged_df['keywords'].fillna('').apply(lambda x: len([k for k in x.split(',') if k.strip()]))
        self.merged_df['num_cast'] = self.merged_df['cast'].fillna('').apply(lambda x: len([c for c in x.split(',') if c.strip()]))
        self.merged_df['num_crew'] = self.merged_df['crew'].fillna('').apply(lambda x: len([c for c in x.split(',') if c.strip()]))

    def add_text_length_features(self):
        """Add character lengths of overview and title."""
        self.merged_df['overview_length'] = self.merged_df['overview'].fillna('').apply(len)
        self.merged_df['title_length'] = self.merged_df['title'].fillna('').apply(len)

    def add_genre_mean_encoding(self):
        """Mean-encode vote_average per genre for (up to) the first 10 genres.

        NOTE(review): genre names with regex metacharacters would break the
        \\b-pattern contains() — assumes plain alphanumeric genre names.
        """
        genre_ratings = {}
        for genre in self.merged_df['genres'].str.split(',').explode().str.strip().unique():
            if genre and genre != 'Unknown':
                mask = self.merged_df['genres'].str.contains(rf'\b{genre}\b', regex=True)
                genre_ratings[genre] = self.merged_df.loc[mask, 'vote_average'].mean()
        for genre in list(genre_ratings.keys())[:10]:
            self.merged_df[f'genre_{genre}_mean_vote'] = self.merged_df['genres'].apply(
                lambda x: genre_ratings[genre] if genre in x else np.nan
            )

    def add_release_date_features(self):
        """Replace release_date with a numeric release_year column."""
        self.merged_df['release_date'] = pd.to_datetime(self.merged_df['release_date'], errors='coerce')
        self.merged_df['release_year'] = self.merged_df['release_date'].dt.year
        self.merged_df.drop(columns=['release_date'], inplace=True)


    def add_adult_flag(self):
        """Map the string 'adult' column to a binary is_adult flag."""
        if 'adult' in self.merged_df.columns:
            self.merged_df['is_adult'] = self.merged_df['adult'].map({'True': 1, 'False': 0})
            self.merged_df.drop(columns=['adult'], inplace=True)

    def add_multi_hot_keywords(self, top_n=20):
        """Multi-hot encode the top_n most frequent keywords as kw_* columns."""
        keywords_split = self.merged_df['keywords'].fillna('').apply(lambda x: [k.strip() for k in x.split(',') if k.strip()])
        mlb = MultiLabelBinarizer()
        top_keywords = pd.Series([k for sublist in keywords_split for k in sublist]).value_counts().head(top_n).index
        keywords_filtered = keywords_split.apply(lambda x: [k for k in x if k in top_keywords])
        keyword_dummies = pd.DataFrame(mlb.fit_transform(keywords_filtered), columns=[f'kw_{k}' for k in mlb.classes_], index=self.merged_df.index)
        self.merged_df = pd.concat([self.merged_df, keyword_dummies], axis=1)

    def add_cast_crew_features(self, top_n_cast=5, top_n_crew=5):
        """Multi-hot encode the most frequent cast and crew members."""
        cast_split = self.merged_df['cast'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])
        crew_split = self.merged_df['crew'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])
        mlb_cast = MultiLabelBinarizer()
        mlb_crew = MultiLabelBinarizer()
        top_cast = pd.Series([c for sublist in cast_split for c in sublist]).value_counts().head(top_n_cast).index
        top_crew = pd.Series([c for sublist in crew_split for c in sublist]).value_counts().head(top_n_crew).index
        cast_filtered = cast_split.apply(lambda x: [c for c in x if c in top_cast])
        crew_filtered = crew_split.apply(lambda x: [c for c in x if c in top_crew])
        cast_dummies = pd.DataFrame(mlb_cast.fit_transform(cast_filtered), columns=[f'cast_{c}' for c in mlb_cast.classes_], index=self.merged_df.index)
        crew_dummies = pd.DataFrame(mlb_crew.fit_transform(crew_filtered), columns=[f'crew_{c}' for c in mlb_crew.classes_], index=self.merged_df.index)
        self.merged_df = pd.concat([self.merged_df, cast_dummies, crew_dummies], axis=1)

    def add_company_country_features(self, top_n_company=5, top_n_country=5):
        """Multi-hot encode the most frequent production companies/countries."""
        company_split = self.merged_df['production_companies'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])
        country_split = self.merged_df['production_countries'].fillna('').apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])
        mlb_company = MultiLabelBinarizer()
        mlb_country = MultiLabelBinarizer()
        top_company = pd.Series([c for sublist in company_split for c in sublist]).value_counts().head(top_n_company).index
        top_country = pd.Series([c for sublist in country_split for c in sublist]).value_counts().head(top_n_country).index
        company_filtered = company_split.apply(lambda x: [c for c in x if c in top_company])
        country_filtered = country_split.apply(lambda x: [c for c in x if c in top_country])
        company_dummies = pd.DataFrame(mlb_company.fit_transform(company_filtered), columns=[f'company_{c}' for c in mlb_company.classes_], index=self.merged_df.index)
        country_dummies = pd.DataFrame(mlb_country.fit_transform(country_filtered), columns=[f'country_{c}' for c in mlb_country.classes_], index=self.merged_df.index)
        self.merged_df = pd.concat([self.merged_df, company_dummies, country_dummies], axis=1)

    def add_target_encoding(self, col, target='vote_average', top_n=10):
        """Mean-encode `target` for the top_n values of a comma-separated column.

        For each frequent value v, adds a column equal to the mean target of
        rows containing v (0 for non-matching rows, via mask * mean).
        """
        values = pd.Series([v for sublist in self.merged_df[col].fillna('').apply(lambda x: [i.strip() for i in x.split(',') if i.strip()]) for v in sublist])
        top_values = values.value_counts().head(top_n).index
        for v in top_values:
            mask = self.merged_df[col].str.contains(rf'\b{v}\b', regex=True)
            mean_val = self.merged_df.loc[mask, target].mean()
            self.merged_df[f'{col}_{v}_mean_{target}'] = mask.astype(int) * mean_val

    def coding(self):
        """Apply target encoding to genres and production companies."""
        self.add_target_encoding(col='genres')
        self.add_target_encoding(col='production_companies')

    def Tfidf(self):
        """Vectorize overviews with TF-IDF (2100 features, English stop words)."""
        tfidf_overview_vectorizer = TfidfVectorizer(max_features=2100, stop_words='english')
        tfidf_overview_matrix = tfidf_overview_vectorizer.fit_transform(self.merged_df['overview'].fillna(''))
        self.tfidf_overview_df = pd.DataFrame(tfidf_overview_matrix.toarray(), columns=[f'overview_tfidf_{col}' for col in tfidf_overview_vectorizer.get_feature_names_out()], index=self.merged_df.index)

    def merging_Tfidf(self):
        # Combine the original dataframe with the TF-IDF features
        self.merged_df_with_tfidf = pd.concat([self.merged_df, self.tfidf_overview_df], axis=1)

    def presvd(self):
        """Median-impute every numeric feature column ahead of SVD."""
        columns_for_svd = self.merged_df_with_tfidf.select_dtypes(include=np.number).columns.tolist()
        columns_for_svd = [col for col in columns_for_svd if col not in ['rating', 'movieId', 'userId', 'timestamp', 'release_year']] # Exclude non-feature columns and year

        for col in columns_for_svd:
            if self.merged_df_with_tfidf[col].isnull().any():
                median_val = self.merged_df_with_tfidf[col].median()
                self.merged_df_with_tfidf[col] = self.merged_df_with_tfidf[col].fillna(median_val)
        # Special-cased column whose median can itself be NaN; fall back to 0.
        if 'production_companies_Warner Bros._mean_vote_average' in self.merged_df_with_tfidf.columns:
            self.merged_df_with_tfidf['production_companies_Warner Bros._mean_vote_average'] = self.merged_df_with_tfidf['production_companies_Warner Bros._mean_vote_average'].fillna(0)


    def svd(self):
        """Reduce per-movie numeric features to 150 SVD components.

        Deduplicates to one row per movie, imputes remaining NaNs, fits
        TruncatedSVD, then replaces the raw feature columns with svd_1..svd_150
        in ``self.unique_movies_reduced``.
        """
        unique_movies_df = self.merged_df_with_tfidf.groupby('movieId').first().reset_index()
        columns_for_svd_unique = unique_movies_df.select_dtypes(include=np.number).columns.tolist()
        columns_for_svd_unique = [col for col in columns_for_svd_unique if col not in ['rating', 'movieId', 'userId', 'timestamp', 'release_year', 'vote_average', 'vote_count']]

        # Fill NaNs with median for all SVD columns
        for col in columns_for_svd_unique:
            if unique_movies_df[col].isnull().any():
                median_val = unique_movies_df[col].median()
                unique_movies_df[col] = unique_movies_df[col].fillna(median_val)
        # Extra: fill any remaining NaNs with 0 (safety for SVD)
        unique_movies_df[columns_for_svd_unique] = unique_movies_df[columns_for_svd_unique].fillna(0)

        if 'production_companies_Warner Bros._mean_vote_average' in unique_movies_df.columns:
            unique_movies_df['production_companies_Warner Bros._mean_vote_average'] = unique_movies_df['production_companies_Warner Bros._mean_vote_average'].fillna(0)


        n_components = 150
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        svd_matrix_unique = svd.fit_transform(unique_movies_df[columns_for_svd_unique])
        svd_df_unique = pd.DataFrame(svd_matrix_unique, columns=[f'svd_{i+1}' for i in range(n_components)], index=unique_movies_df.index)
        columns_to_drop_after_svd_unique = [col for col in columns_for_svd_unique if col not in ['vote_average', 'vote_count']]
        self.unique_movies_reduced = unique_movies_df.drop(columns=columns_to_drop_after_svd_unique).copy()
        self.unique_movies_reduced = pd.concat([self.unique_movies_reduced, svd_df_unique], axis=1)

    def run_all(self):
        """Run the full pipeline in its required order and return the frames."""
        self.ordering()
        self.outliers()
        self.add_budget_to_revenue_ratio()
        self.add_top_genre_onehot()
        self.add_log_features()
        self.add_interaction_features()
        self.add_count_features()
        self.add_text_length_features()
        self.add_genre_mean_encoding()
        self.add_release_date_features()
        self.add_adult_flag()
        self.add_multi_hot_keywords()
        self.add_cast_crew_features()
        self.add_company_country_features()
        self.coding()
        self.Tfidf()
        self.merging_Tfidf()
        self.presvd()
        self.svd()

        return {
            "merged_df": self.merged_df,
            "merged_df_with_tfidf": self.merged_df_with_tfidf,
            "unique_movies_reduced": self.unique_movies_reduced
        }
diff --git a/src/modeling.py b/src/modeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..44e0a5a6827b39aeb256e95f015d33470078929e
--- /dev/null
+++ b/src/modeling.py
@@ -0,0 +1,194 @@
+import numpy as np
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.metrics import mean_squared_error
+from surprise import Dataset, Reader, KNNBasic, SVD, accuracy
+from surprise.model_selection import train_test_split, GridSearchCV
+import joblib
+
class RecommenderModels:
    """Trains and evaluates the recommender family used by the project.

    Covers a weighted-rating popularity baseline, an SVD-feature
    content-based model, two Surprise collaborative models (user KNN and
    matrix-factorization SVD), and a linear hybrid of content and
    collaborative scores blended by ``alpha``.  Models and supporting frames
    can be persisted to/restored from ``self.model_dir`` via joblib.
    """

    def __init__(self, merged_df_with_tfidf, unique_movies_reduced, ratings_df):
        # Input frames come from the feature-engineering stage.
        self.merged_df_with_tfidf = merged_df_with_tfidf
        self.unique_movies_reduced = unique_movies_reduced
        self.ratings_df = ratings_df
        # Fitted artifacts, populated by the fit_* methods below.
        self.popular_movies_unique = None
        self.user_profiles = None
        self.knn_user_based = None
        self.svd_mf = None
        self.svd_mf_tuned = None
        self.best_alpha = None
        self.model_dir = "models"
        import os
        os.makedirs(self.model_dir, exist_ok=True)

    # ---------- Popularity Baseline ----------
    def fit_popularity(self):
        """Rank movies by the IMDB weighted rating over the top-decile voted.

        weighted = v/(v+m)*R + m/(v+m)*C, with C the global mean vote and m
        the 90th-percentile vote count; only movies with vote_count >= m
        qualify.  One row per movieId is kept.
        """
        C = self.unique_movies_reduced['vote_average'].mean()
        m = self.unique_movies_reduced['vote_count'].quantile(0.90)
        qualified = self.unique_movies_reduced[self.unique_movies_reduced['vote_count'] >= m].copy()
        def weighted_rating(x):
            v, R = x['vote_count'], x['vote_average']
            return (v / (v + m) * R) + (m / (v + m) * C)
        qualified['weighted_rating'] = qualified.apply(weighted_rating, axis=1)
        popular = qualified.sort_values('weighted_rating', ascending=False)
        self.popular_movies_unique = popular.groupby('movieId').first().reset_index()

    # ---------- Content-Based ----------
    def fit_content_based(self):
        """Build one rating-weighted mean SVD-feature vector per user."""
        movie_id_to_index = pd.Series(self.unique_movies_reduced.index, index=self.unique_movies_reduced['movieId']).to_dict()
        svd_features = self.unique_movies_reduced.filter(like='svd_')
        self.user_profiles = {}
        for user_id in self.unique_movies_reduced['userId'].unique():
            user_ratings = self.unique_movies_reduced[self.unique_movies_reduced['userId'] == user_id][['movieId', 'rating']]
            profile = np.zeros(svd_features.shape[1])
            total_weight = 0
            for _, row in user_ratings.iterrows():
                idx = movie_id_to_index.get(int(row['movieId']))
                if idx is not None:
                    # Weight each movie's features by the user's rating.
                    profile += svd_features.loc[idx].values * row['rating']
                    total_weight += row['rating']
            if total_weight > 0:
                profile /= total_weight
            self.user_profiles[user_id] = profile

    def get_content_based_recommendations(self, user_id, top_n=10):
        """Top-n unseen movies by cosine similarity to the user's profile.

        Falls back to the popularity list for unknown/cold users (if fitted),
        otherwise returns an empty DataFrame.
        """
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
            if self.popular_movies_unique is not None:
                return self.popular_movies_unique[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(top_n)
            return pd.DataFrame()
        user_profile = self.user_profiles[user_id]
        svd_features = self.unique_movies_reduced.filter(like='svd_')
        sim_scores = cosine_similarity(user_profile.reshape(1, -1), svd_features)[0]
        rated_ids = self.merged_df_with_tfidf[self.merged_df_with_tfidf['userId'] == user_id]['movieId'].tolist()
        # Keep only positions of movies the user has not rated yet.
        indices = [i for i, row in self.unique_movies_reduced.iterrows() if row['movieId'] not in rated_ids]
        top_indices = np.argsort(sim_scores[indices])[::-1][:top_n]
        recs = self.unique_movies_reduced.iloc[[indices[i] for i in top_indices]][['title', 'vote_average', 'vote_count']]
        return recs.reset_index(drop=True)

    def get_content_based_score(self, user_id, movie_id):
        """Cosine similarity between a user profile and one movie's SVD features.

        Returns 0.0 for unknown users, empty profiles, or unknown movies.
        """
        if self.user_profiles is None:
            raise ValueError("Call fit_content_based() first.")
        if user_id not in self.user_profiles or np.all(self.user_profiles[user_id] == 0):
            return 0.0
        user_profile = self.user_profiles[user_id]
        idx = self.unique_movies_reduced[self.unique_movies_reduced['movieId'] == movie_id].index
        if idx.empty:
            return 0.0
        movie_features = self.unique_movies_reduced.loc[idx].filter(like='svd_').values
        return cosine_similarity(user_profile.reshape(1, -1), movie_features.reshape(1, -1))[0][0]

    # ---------- Collaborative Filtering ----------
    def fit_cf(self):
        """Fit user-based KNN and SVD on an 80/20 Surprise train/test split."""
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(self.ratings_df[['userId', 'movieId', 'rating']], reader)
        self.data = data
        self.trainset, self.testset = train_test_split(data, test_size=0.2, random_state=42)
        self.knn_user_based = KNNBasic(sim_options={'user_based': True, 'similarity': 'cosine'}, k=40)
        self.knn_user_based.fit(self.trainset)
        self.svd_mf = SVD(random_state=42)
        self.svd_mf.fit(self.trainset)

    def evaluate_cf(self):
        """Return (knn_rmse, svd_rmse) on the held-out Surprise test set."""
        preds_knn = self.knn_user_based.test(self.testset)
        preds_svd = self.svd_mf.test(self.testset)
        rmse_knn = accuracy.rmse(preds_knn)
        rmse_svd = accuracy.rmse(preds_svd)
        return rmse_knn, rmse_svd

    # ---------- Hybrid Model ----------
    def hybrid_prediction(self, user_id, movie_id, alpha):
        """Blend CF and content scores: alpha*CF_mean + (1-alpha)*content.

        NOTE(review): ids are cast to str before Surprise predict(), while
        fit_cf() loads them from the DataFrame — confirm the raw-id types
        match, otherwise predict() silently falls back to its default estimate.
        """
        cb_score = self.get_content_based_score(user_id, movie_id)
        try:
            cf1_pred = self.knn_user_based.predict(str(user_id), str(movie_id)).est
        except Exception:
            cf1_pred = 0
        try:
            cf2_pred = self.svd_mf.predict(str(user_id), str(movie_id)).est
        except Exception:
            cf2_pred = 0
        cf_score = (cf1_pred + cf2_pred) / 2.0
        return alpha * cf_score + (1 - alpha) * cb_score

    def tune_hybrid_alpha(self, alphas=None):
        """Grid-search alpha over the test set by RMSE.

        Default grid is [0.0, 0.5, 1.0].  Returns (rmse_per_alpha,
        best_alpha) and stores best_alpha on the instance.
        """
        if alphas is None:
            alphas = np.arange(0, 1.01, 0.5)
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        # Recreate user profiles from trainset
        train_ratings_df = pd.DataFrame(self.trainset.all_ratings(), columns=['uid', 'iid', 'rating'])
        train_ratings_df['userId'] = train_ratings_df['uid'].apply(lambda x: self.trainset.to_raw_uid(x))
        train_ratings_df['movieId'] = train_ratings_df['iid'].apply(lambda x: self.trainset.to_raw_iid(x))
        train_ratings_df = train_ratings_df[['userId', 'movieId', 'rating']]
        self.fit_content_based() # Ensure user_profiles is up to date
        rmse_scores = {}
        for alpha in alphas:
            preds, actuals = [], []
            for _, row in testset_df.iterrows():
                pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), alpha)
                preds.append(pred)
                actuals.append(row['rating'])
            rmse = np.sqrt(mean_squared_error(actuals, preds))
            rmse_scores[alpha] = rmse
        self.best_alpha = min(rmse_scores, key=rmse_scores.get)
        return rmse_scores, self.best_alpha

    def fit_svd_gridsearch(self, param_grid=None):
        """3-fold grid search over SVD hyperparameters; refit best on all data.

        Returns (best_rmse, best_params) and stores the refit model in
        ``self.svd_mf_tuned``.
        """
        if param_grid is None:
            param_grid = {
                'n_factors': [50, 100, 150],
                'lr_all': [0.002, 0.005, 0.01],
                'reg_all': [0.02, 0.05, 0.1]
            }
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(self.data)
        self.svd_mf_tuned = SVD(**gs.best_params['rmse'])
        self.svd_mf_tuned.fit(self.data.build_full_trainset())
        return gs.best_score['rmse'], gs.best_params['rmse']

    def evaluate_hybrid(self):
        """RMSE of the hybrid model on the test set using self.best_alpha."""
        testset_df = pd.DataFrame(self.testset, columns=['userId', 'movieId', 'rating'])
        preds, actuals = [], []
        for _, row in testset_df.iterrows():
            pred = self.hybrid_prediction(int(row['userId']), int(row['movieId']), self.best_alpha)
            preds.append(pred)
            actuals.append(row['rating'])
        rmse = np.sqrt(mean_squared_error(actuals, preds))
        return rmse

    def save_models(self, prefix="recommender"):
        """Persist fitted models and supporting frames under self.model_dir."""
        # Save collaborative models
        joblib.dump(self.knn_user_based, f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        joblib.dump(self.svd_mf, f"{self.model_dir}/{prefix}_svd_mf.pkl")
        if self.svd_mf_tuned is not None:
            joblib.dump(self.svd_mf_tuned, f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        # Save user profiles and other numpy/pandas objects
        joblib.dump(self.user_profiles, f"{self.model_dir}/{prefix}_user_profiles.pkl")
        joblib.dump(self.popular_movies_unique, f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        joblib.dump(self.unique_movies_reduced, f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        joblib.dump(self.merged_df_with_tfidf, f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data saved to {self.model_dir}/")

    def load_models(self, prefix="recommender"):
        """Restore everything save_models() wrote; tuned SVD is optional."""
        # Load collaborative models
        self.knn_user_based = joblib.load(f"{self.model_dir}/{prefix}_knn_user_based.pkl")
        self.svd_mf = joblib.load(f"{self.model_dir}/{prefix}_svd_mf.pkl")
        try:
            self.svd_mf_tuned = joblib.load(f"{self.model_dir}/{prefix}_svd_mf_tuned.pkl")
        except Exception:
            self.svd_mf_tuned = None
        self.user_profiles = joblib.load(f"{self.model_dir}/{prefix}_user_profiles.pkl")
        self.popular_movies_unique = joblib.load(f"{self.model_dir}/{prefix}_popular_movies_unique.pkl")
        self.unique_movies_reduced = joblib.load(f"{self.model_dir}/{prefix}_unique_movies_reduced.pkl")
        self.merged_df_with_tfidf = joblib.load(f"{self.model_dir}/{prefix}_merged_df_with_tfidf.pkl")
        print(f"Models and data loaded from {self.model_dir}/")
+
+# Example usage:
+# models = RecommenderModels(merged_df_with_tfidf, unique_movies_reduced, ratings_df)
+# models.fit_popularity()
+# models.fit_content_based()
+# models.fit_cf()
+# print(models.evaluate_cf())
+# rmse_scores, best_alpha = models.tune_hybrid_alpha()
+# print("Best alpha:", best_alpha)
+# print("Hybrid RMSE:", models.evaluate_hybrid())
\ No newline at end of file
diff --git a/src/preprocessing.py b/src/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe7ee47eb21a7783520213e3df8455faf73c8414
--- /dev/null
+++ b/src/preprocessing.py
@@ -0,0 +1,191 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import warnings
+import missingno as msno
+import ast
+warnings.filterwarnings('ignore')
+
class Preprocessing:
    """Load, clean, and merge the MovieLens/TMDB raw CSV files.

    Pipeline (see ``run_all``): load_data -> df_missing_value ->
    clean_data -> merge_data -> generate_interim_va_proceed_csv.
    Cleaned frames are written to ``data/interim/`` and the merged frame
    to ``data/processed/``; diagnostic plots go to ``report/images/``.
    """

    def __init__(self, base_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/"):
        """Build all input/output paths from *base_path*.

        Parameters
        ----------
        base_path : str
            Project root containing ``data/`` and ``report/``.  The
            default preserves the original hard-coded machine-specific
            location, so existing callers are unaffected; pass another
            base to run the pipeline elsewhere.
        """
        self.main_path = base_path + "data/raw/"
        self.movies_metadata_path = self.main_path + "movies_metadata.csv"
        self.credits_path = self.main_path + "credits.csv"
        self.keywords_path = self.main_path + "keywords.csv"
        self.links_path = self.main_path + "links_small.csv"
        self.ratings_path = self.main_path + "ratings_small.csv"
        self.img_path = base_path + "report/images/"
        self.interim_path = base_path + "data/interim/"
        # Attribute name kept as 'proceed_path' for backward compatibility;
        # it points at data/processed/.
        self.proceed_path = base_path + "data/processed/"

    def load_data(self):
        """Read the five raw CSV files into instance DataFrames."""
        self.df = pd.read_csv(self.movies_metadata_path)
        self.credits_df = pd.read_csv(self.credits_path)
        self.keywords_df = pd.read_csv(self.keywords_path)
        self.links_df = pd.read_csv(self.links_path)
        self.ratings_df = pd.read_csv(self.ratings_path)

    def df_missing_value(self):
        """Visualize and handle missing values in the metadata frame.

        Saves a missingno matrix to ``report/images/df_missing.png``,
        drops the columns with very high missing percentages, fills the
        remaining gaps (placeholder text / median / 'Unknown'), and
        removes rows whose 'adult' flag is malformed.  (An unused
        per-column "handling strategy" report in the original was dead
        code and has been removed.)
        """
        # Persist the missing-value matrix for the report.
        ax = msno.matrix(self.df)
        fig = ax.get_figure()
        fig.savefig(self.img_path + "df_missing.png", dpi=300, bbox_inches='tight')
        plt.close(fig)

        # 1. Drop the columns with high missing percentages.
        cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline']
        self.df = self.df.drop(columns=cols_to_drop)

        # 2. Fill missing values in 'overview' with a placeholder.
        self.df['overview'] = self.df['overview'].fillna('No overview available')

        # 'popularity' arrives as text in the raw CSV; coerce bad values
        # to NaN so they are imputed with the median below.
        self.df['popularity'] = pd.to_numeric(self.df['popularity'], errors='coerce')

        # 3. Median-impute the low-missing numerical columns.
        numerical_cols_to_impute = ['runtime', 'vote_average', 'vote_count', 'revenue', 'popularity']
        for col in numerical_cols_to_impute:
            if col in self.df.columns:
                self.df[col] = self.df[col].fillna(self.df[col].median())

        # 4. Every other column that still has gaps (categorical/text)
        #    gets an explicit 'Unknown'.
        remaining_missing_cols = self.df.columns[self.df.isnull().any()].tolist()
        cols_to_fill_unknown = [col for col in remaining_missing_cols
                                if col not in numerical_cols_to_impute and col != 'adult']
        for col in cols_to_fill_unknown:
            self.df[col] = self.df[col].fillna('Unknown')

        # 5. Keep only well-formed 'adult' flags — the raw file contains
        #    shifted rows with garbage here (e.g. a rating string).
        self.df = self.df[self.df['adult'].isin(['True', 'False'])]

    def extract_names_from_list(self, json_list_string, key='name'):
        """Extract *key* values from a stringified list of dicts.

        Returns a comma-separated string of the values, ``np.nan`` for an
        empty list (so callers can ``fillna`` a placeholder afterwards),
        and ``''`` for anything that is not a parseable list literal.
        """
        if isinstance(json_list_string, str) and json_list_string.startswith('[') and json_list_string.endswith(']'):
            try:
                data_list = ast.literal_eval(json_list_string)
                if isinstance(data_list, list):
                    if not data_list:
                        return np.nan
                    return ', '.join(item.get(key, '') for item in data_list
                                     if isinstance(item, dict) and key in item)
            except (ValueError, SyntaxError):
                # Malformed literal — fall through to the '' sentinel.
                pass
        return ''

    def extract_names_and_handle_empty(self, json_list_string):
        """Extract 'name' values from a stringified list of dicts.

        Kept for backward compatibility; previously a near-duplicate of
        extract_names_from_list, it now simply delegates to it.
        """
        return self.extract_names_from_list(json_list_string, key='name')

    def clean_data(self):
        """Normalize dtypes, flatten JSON-ish columns, and deduplicate."""
        # Movie ids: coerce malformed ids to NaN, drop them, cast to int.
        self.df['id'] = pd.to_numeric(self.df['id'], errors='coerce')
        self.df.dropna(subset=['id'], inplace=True)
        self.df['id'] = self.df['id'].astype(int)

        # Links: drop rows without a TMDB id, then cast.  (Single pass —
        # the original performed the same dropna twice.)
        self.links_df.dropna(subset=['tmdbId'], inplace=True)
        self.links_df['tmdbId'] = self.links_df['tmdbId'].astype(int)

        # Flatten the stringified-JSON metadata columns into
        # comma-separated name strings; empty lists become NaN and are
        # then labelled 'Unknown'.
        json_columns = ['genres', 'production_companies', 'production_countries', 'spoken_languages']
        for col in json_columns:
            if col in self.df.columns:
                self.df[col] = self.df[col].apply(self.extract_names_and_handle_empty).fillna('Unknown')

        # Same treatment for cast/crew in credits and for keywords.
        self.credits_df['cast'] = self.credits_df['cast'].apply(self.extract_names_from_list, key='name').fillna('Unknown')
        self.credits_df['crew'] = self.credits_df['crew'].apply(self.extract_names_from_list, key='name').fillna('Unknown')
        self.keywords_df['keywords'] = self.keywords_df['keywords'].apply(self.extract_names_from_list, key='name').fillna('Unknown')

        # Remove duplicates on each frame's key columns.
        self.df.drop_duplicates(subset=['id'], inplace=True)
        self.credits_df.drop_duplicates(subset=['id'], inplace=True)
        self.keywords_df.drop_duplicates(subset=['id'], inplace=True)
        self.links_df.drop_duplicates(subset=['tmdbId'], inplace=True)
        self.ratings_df.drop_duplicates(subset=['movieId', 'userId'], inplace=True)

    def merge_data(self):
        """Inner-join metadata, credits, keywords, and links into merged_df."""
        self.merged_df = pd.merge(self.df, self.credits_df, on='id', how='inner')
        self.merged_df = pd.merge(self.merged_df, self.keywords_df, on='id', how='inner')
        # links_df keys on tmdbId, which matches the metadata 'id'.
        self.merged_df = pd.merge(self.merged_df, self.links_df, left_on='id', right_on='tmdbId', how='inner')
        # Do NOT merge with ratings_df here! Only merge for the modeling step.

    def generate_interim_va_proceed_csv(self):
        """Write cleaned frames to interim/ and the merged frame to processed/.

        (Method name kept verbatim for backward compatibility.)
        """
        self.df.to_csv(self.interim_path + "movies_metadata_clean.csv", index=False)
        self.credits_df.to_csv(self.interim_path + "credits_clean.csv", index=False)
        self.keywords_df.to_csv(self.interim_path + "keywords_clean.csv", index=False)
        self.links_df.to_csv(self.interim_path + "links_clean.csv", index=False)
        self.ratings_df.to_csv(self.interim_path + "ratings_clean.csv", index=False)
        self.merged_df.to_csv(self.proceed_path + "merged_clean.csv", index=False)

    def run_all(self):
        """Run the full pipeline and return every produced DataFrame."""
        self.load_data()
        self.df_missing_value()
        self.clean_data()
        self.merge_data()
        self.generate_interim_va_proceed_csv()
        return {
            "df": self.df,
            "credits_df": self.credits_df,
            "keywords_df": self.keywords_df,
            "links_df": self.links_df,
            "ratings_df": self.ratings_df,
            "merged_df": self.merged_df,
        }
\ No newline at end of file
| |