Bardi-ya committed on
Commit
c296592
·
verified ·
1 Parent(s): 49a962c

Upload 51 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. README.md +75 -13
  3. app.py +63 -0
  4. app/practical.py +115 -0
  5. document.pdf +0 -0
  6. models/recommender_knn_user_based.pkl +3 -0
  7. models/recommender_merged_df_with_tfidf.pkl +3 -0
  8. models/recommender_popular_movies_unique.pkl +3 -0
  9. models/recommender_svd_mf.pkl +3 -0
  10. models/recommender_unique_movies_reduced.pkl +3 -0
  11. models/recommender_user_profiles.pkl +3 -0
  12. notebooks/practical.ipynb +0 -0
  13. report/images/budget_vs_revenue.png +0 -0
  14. report/images/budget_vs_revenue_filtered.png +0 -0
  15. report/images/df_missing.png +3 -0
  16. report/images/movies_by_decade_pie.png +0 -0
  17. report/images/popularity_distribution.png +0 -0
  18. report/images/popularity_distribution_lt10.png +0 -0
  19. report/images/popularity_distribution_lt100.png +0 -0
  20. report/images/rating_distribution.png +0 -0
  21. report/images/release_year_distribution.png +0 -0
  22. report/images/runtime_distribution.png +0 -0
  23. report/images/top_genres.png +0 -0
  24. report/images/top_languages.png +0 -0
  25. report/images/top_production_companies.png +0 -0
  26. report/images/top_production_countries.png +0 -0
  27. report/images/vote_average_distribution.png +0 -0
  28. report/images/vote_count_distribution.png +0 -0
  29. report/images/vote_count_vs_average.png +0 -0
  30. report/images/wordcloud_overview.png +3 -0
  31. report/images/wordcloud_title.png +3 -0
  32. report/images/world_production_map.png +0 -0
  33. requirements.txt +12 -0
  34. src/__pycache__/collaborative.cpython-310.pyc +0 -0
  35. src/__pycache__/collaborative.cpython-313.pyc +0 -0
  36. src/__pycache__/content_based.cpython-310.pyc +0 -0
  37. src/__pycache__/content_based.cpython-313.pyc +0 -0
  38. src/__pycache__/eda.cpython-310.pyc +0 -0
  39. src/__pycache__/eda.cpython-313.pyc +0 -0
  40. src/__pycache__/evaluation.cpython-310.pyc +0 -0
  41. src/__pycache__/feature_engineering.cpython-310.pyc +0 -0
  42. src/__pycache__/feature_engineering.cpython-313.pyc +0 -0
  43. src/__pycache__/hybrid.cpython-310.pyc +0 -0
  44. src/__pycache__/modeling.cpython-310.pyc +0 -0
  45. src/__pycache__/modeling.cpython-313.pyc +0 -0
  46. src/__pycache__/preprocessing.cpython-310.pyc +0 -0
  47. src/__pycache__/preprocessing.cpython-313.pyc +0 -0
  48. src/eda.py +327 -0
  49. src/evaluation.py +121 -0
  50. src/feature_engineering.py +224 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ report/images/df_missing.png filter=lfs diff=lfs merge=lfs -text
37
+ report/images/wordcloud_overview.png filter=lfs diff=lfs merge=lfs -text
38
+ report/images/wordcloud_title.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,75 @@
1
- ---
2
- title: Final ML Project
3
- emoji: 🏆
4
- colorFrom: purple
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.44.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MovieLens Movie Data Analysis
2
+
3
+ This project provides a reproducible pipeline for preprocessing and exploratory data analysis (EDA) on the MovieLens movie dataset.
4
+
5
+ ## Project Structure
6
+
7
+ ```
8
+ .
9
+ ├── app/
10
+ │ └── practical.py # Main entry point for running the pipeline
11
+ ├── src/
12
+ │ ├── preprocessing.py # Data loading, cleaning, merging
13
+ │ └── eda.py # EDA and visualization (plots saved to /report/images)
14
+ ├── notebooks/
15
+ │ └── practical.ipynb # Step-by-step notebook for exploration and prototyping
16
+ ├── report/
17
+ │ └── images/ # Output directory for all generated plots and images
18
+ ├── data/
19
+ │ ├── raw/ # Raw input data (CSV files)
20
+ │ ├── interim/ # Cleaned/intermediate CSVs
21
+ │ └── processed/ # (Optional) Final processed data
22
+ ├── requirements.txt # Python dependencies
23
+ └── README.md # This file
24
+ ```
25
+
26
+ ## How to Run
27
+
28
+ 1. **Install dependencies**
29
+ Make sure you have Python 3.8+ and run:
30
+ ```
31
+ pip install -r requirements.txt
32
+ ```
33
+
34
+ 2. **Prepare data**
35
+ Place the raw MovieLens CSV files in `data/raw/` as:
36
+ - `movies_metadata.csv`
37
+ - `credits.csv`
38
+ - `keywords.csv`
39
+ - `links.csv`
40
+ - `ratings.csv`
41
+
42
+ 3. **Run the pipeline**
43
+ ```
44
+ python app/practical.py
45
+ ```
46
+ This will:
47
+ - Clean and merge the data
48
+ - Save interim cleaned CSVs to `data/interim/`
49
+ - Generate all EDA plots and wordclouds, saving them to `report/images/`
50
+ - Save interactive Plotly plots as PNG (requires [kaleido](https://github.com/plotly/Kaleido)) or HTML fallback
51
+
52
+ ## Features
53
+
54
+ - **Modular Preprocessing**: All data cleaning, merging, and type handling in `src/preprocessing.py`
55
+ - **Automated EDA**: All plots and wordclouds generated and saved by `src/eda.py`
56
+ - **Reproducibility**: One-command run for the entire workflow
57
+ - **Notebook**: `notebooks/practical.ipynb` for step-by-step exploration
58
+
59
+ ## Requirements
60
+
61
+ - pandas
62
+ - numpy
63
+ - matplotlib
64
+ - seaborn
65
+ - missingno
66
+ - wordcloud
67
+ - plotly
68
+ - pycountry
69
+ - kaleido (for static plotly image export)
70
+
71
+ ## Notes
72
+
73
+ - If static Plotly image export fails, HTML versions of the plots are saved as a fallback.
74
+ - All output images are saved in `report/images/`.
75
+ - Adjust paths in `src/eda.py` and `src/preprocessing.py` if your
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pickle
3
+ import pandas as pd
4
+ import os
5
+
6
# Paths are anchored to this file's directory so the app starts correctly
# regardless of the process's current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "models")
MOVIE_DATA_PATH = os.path.join(BASE_DIR, "data", "movies.csv")  # adjust to your actual metadata file

# Load models (choose what you want to demo)
# NOTE(security): unpickling executes arbitrary code embedded in the file, so
# only load model files that were produced by this project.
with open(os.path.join(MODEL_DIR, "recommender_svd_mf.pkl"), "rb") as f:
    svd_model = pickle.load(f)

# Load movie metadata
movies_df = pd.read_csv(MOVIE_DATA_PATH)  # should include [movieId, title, poster_url, actors]
16
+
17
def _raw_id(value):
    """Render an identifier as text, writing integral floats as ``"1"`` not ``"1.0"``."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def recommend(user_id, top_k=5, *, model=None, movies=None):
    """Generate top-k recommendations using the SVD model.

    Args:
        user_id: Identifier of the user. UI number fields deliver floats, so
            an integral value such as ``1.0`` is sent to the model as ``"1"``.
            ``None`` (an empty field) yields no recommendations.
        top_k: Number of movies to return; coerced to a non-negative int
            because slider widgets may deliver a float.
        model: Object exposing ``predict(uid, iid)`` whose result has an
            ``est`` attribute. Defaults to the module-level ``svd_model``.
        movies: DataFrame with at least ``movieId`` and ``title`` columns
            (``poster_url`` and ``actors`` are optional). Defaults to the
            module-level ``movies_df``.

    Returns:
        A list of ``(title, poster_url, explanation)`` tuples ordered from the
        highest to the lowest predicted score.
    """
    if user_id is None:
        return []
    if model is None:
        model = svd_model
    if movies is None:
        movies = movies_df

    # NOTE(review): presumably the model was trained on ids rendered as
    # integer strings; verify against the training code.
    uid = _raw_id(user_id)
    top_k = max(int(top_k), 0)

    # Predict scores for all movies for this user
    predictions = []
    for mid in movies["movieId"].unique():
        try:
            est = model.predict(uid, str(mid)).est
        except Exception:
            # Deliberate best-effort: a movie the model cannot score is
            # skipped rather than failing the whole request.
            continue
        predictions.append((mid, est))

    # Sort and pick top_k
    top_movies = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_k]

    # Build output
    results = []
    for mid, _score in top_movies:
        row = movies[movies["movieId"] == mid].iloc[0]
        explanation = f"Because you liked movies with {row.get('actors', 'similar style')}."
        results.append((row["title"], row.get("poster_url", None), explanation))

    return results
40
+
41
def format_output(results):
    """Split recommendation tuples into the three lists shown by the UI.

    Args:
        results: Iterable of ``(title, poster_url, explanation)`` tuples.

    Returns:
        A ``(titles, posters, explanations)`` tuple of lists. ``posters``
        contains only the entries that actually have a poster, so it can be
        shorter than the other two lists.
    """
    titles = [title for title, _, _ in results]
    # A missing poster shows up either as None (no such column) or as NaN
    # (empty CSV cell); neither can be rendered as an image.
    posters = [poster for _, poster, _ in results if not pd.isna(poster)]
    explanations = [explanation for _, _, explanation in results]
    return titles, posters, explanations
46
+
47
# Gradio UI: a user id and a result count go in; titles, posters and
# explanations come out.
demo = gr.Interface(
    fn=lambda user_id, k: format_output(recommend(user_id, k)),
    inputs=[
        # precision=0 makes the field deliver an integer instead of a float.
        gr.Number(label="User ID", precision=0),
        gr.Slider(1, 10, value=5, step=1, label="Top-K"),
    ],
    outputs=[
        gr.Textbox(label="Recommended Movies"),
        # Layout is passed to the constructor: the chained .style(...) call
        # was removed from Gradio components in 4.0.
        gr.Gallery(label="Posters", columns=3, height="auto"),
        gr.Textbox(label="Explanations"),
    ],
    title="Movie Recommender System",
    description="Enter your User ID to get top-K movie recommendations with posters and explanations.",
)

if __name__ == "__main__":
    demo.launch()
app/practical.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ # Add the parent directory to sys.path so 'src' can be imported
5
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
+
7
+ from src.preprocessing import Preprocessing
8
+ from src.eda import EDA
9
+ from src.feature_engineering import FeatureEngineering
10
+ from src.modeling import RecommenderModels
11
+ from src.evaluation import leave_one_out_by_timestamp, evaluate_all, summarize_results
12
+
13
def _predict_rows(test_df, score):
    """Score every held-out rating with ``score(user_id, movie_id)``.

    Returns a list of ``(user_id, movie_id, true_rating, predicted_rating, {})``
    tuples, one per row of ``test_df``.
    """
    # Iterating column-wise keeps each column's own dtype. iterrows() upcasts
    # rows that mix int and float columns to float, which turns id 1 into 1.0
    # and str(id) into "1.0".
    return [
        (user_id, movie_id, true_rating, score(user_id, movie_id), {})
        for user_id, movie_id, true_rating in zip(
            test_df['userId'], test_df['movieId'], test_df['rating']
        )
    ]


def main():
    """Run preprocessing, feature engineering, modeling and evaluation end to end.

    Prints progress banners, RMSE figures, a sample recommendation list and a
    summary table comparing the content-based, collaborative and hybrid models.
    """
    print("========== Step 1: Preprocessing ==========")
    preprocessor = Preprocessing()
    dfs = preprocessor.run_all()

    # EDA is disabled by default; uncomment to regenerate the report images.
    # print("========== Step 2: Exploratory Data Analysis (EDA) ==========")
    # eda = EDA(dfs)
    # eda.run_all()

    print("========== Step 3: Feature Engineering ==========")
    fe = FeatureEngineering(dfs)
    fe_outputs = fe.run_all()
    merged_df_with_tfidf = fe_outputs["merged_df_with_tfidf"]
    unique_movies_reduced = fe_outputs["unique_movies_reduced"]
    ratings_df = dfs["ratings_df"]

    print("========== Step 4: Modeling & Recommendation ==========")
    models = RecommenderModels(
        merged_df_with_tfidf=merged_df_with_tfidf,
        unique_movies_reduced=unique_movies_reduced,
        ratings_df=ratings_df
    )
    models.fit_popularity()
    models.fit_content_based()
    models.fit_cf()
    print("CF RMSEs (kNN, SVD):", models.evaluate_cf())
    _rmse_scores, best_alpha = models.tune_hybrid_alpha()
    print("Best alpha:", best_alpha)
    print("Hybrid RMSE:", models.evaluate_hybrid())
    models.save_models()
    # Example: get recommendations for user 1
    print("Top 10 Content-Based Recommendations for user 1:")
    print(models.get_content_based_recommendations(user_id=1, top_n=10))

    print("========== Step 5: Evaluation ==========")
    # Time-aware split
    # NOTE(review): the models above were constructed from the full
    # ratings_df, so the held-out rows appear to have been available during
    # fitting and the training half of this split is unused - confirm whether
    # the models should be fit on it instead.
    _train_ratings, test_ratings = leave_one_out_by_timestamp(ratings_df)
    all_items = set(merged_df_with_tfidf['movieId'].astype(str).unique())
    item_popularity = {
        str(movie_id): count
        for movie_id, count in merged_df_with_tfidf['movieId'].value_counts().items()
    }
    svd_cols = [col for col in unique_movies_reduced.columns if col.startswith("svd_")]
    # Built column-wise so movieId keeps its dtype and the keys are rendered
    # the same way as in all_items.
    item_features = dict(zip(
        unique_movies_reduced['movieId'].astype(str),
        unique_movies_reduced[svd_cols].to_numpy(),
    ))

    def score_collaborative(user_id, movie_id):
        # Use SVD as the collaborative model (or knn_user_based if you prefer)
        try:
            return models.svd_mf.predict(str(user_id), str(movie_id)).est
        except Exception:
            # Best-effort fallback kept from the original pipeline.
            return 0

    def score_hybrid(user_id, movie_id):
        return models.hybrid_prediction(user_id, movie_id, best_alpha)

    # Generate predictions for each model
    predictions_cb = _predict_rows(test_ratings, models.get_content_based_score)
    predictions_cf = _predict_rows(test_ratings, score_collaborative)
    predictions_hybrid = _predict_rows(test_ratings, score_hybrid)

    # Evaluate
    results_cb = evaluate_all(predictions_cb, test_ratings.values, all_items, item_popularity, item_features)
    results_cf = evaluate_all(predictions_cf, test_ratings.values, all_items, item_popularity, item_features)
    results_hybrid = evaluate_all(predictions_hybrid, test_ratings.values, all_items, item_popularity, item_features)

    # Print summary table
    summary = summarize_results({
        "Content-Based": results_cb,
        "Collaborative": results_cf,
        "Hybrid": results_hybrid
    })
    print(summary)


if __name__ == "__main__":
    main()
document.pdf ADDED
Binary file (68.8 kB). View file
 
models/recommender_knn_user_based.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c2691486ff062618b6cc06aa00397ce7abc72d84a0f2f015e24d1c720ef9a6b
3
+ size 5949691
models/recommender_merged_df_with_tfidf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c0ea5f78039d884abaf095cf23c3320976d8db9865e522136ae27a375c89662
3
+ size 166955859
models/recommender_popular_movies_unique.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:095a203a1742406085858fc637e83a6f94e1860ac686c572ec75a2cae80511f6
3
+ size 2922773
models/recommender_svd_mf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29962c5fd250f73d6d2003b88a13bc2b0ee452c93fa44ee2f69fcb410a2f8770
3
+ size 9661411
models/recommender_unique_movies_reduced.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddc6fc18ff578c47c86ff53abf985f7af9a74146b5fe265666ad99844b530c85
3
+ size 21384963
models/recommender_user_profiles.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd803357ddf7dac92ecd6351beb6b980b56d7a514c540a78fa6261965170c68
3
+ size 1490
notebooks/practical.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
report/images/budget_vs_revenue.png ADDED
report/images/budget_vs_revenue_filtered.png ADDED
report/images/df_missing.png ADDED

Git LFS Details

  • SHA256: 4a62aab31beaa4b2578bb615811000c865e963e48f875fb32be1a907faad33b6
  • Pointer size: 131 Bytes
  • Size of remote file: 474 kB
report/images/movies_by_decade_pie.png ADDED
report/images/popularity_distribution.png ADDED
report/images/popularity_distribution_lt10.png ADDED
report/images/popularity_distribution_lt100.png ADDED
report/images/rating_distribution.png ADDED
report/images/release_year_distribution.png ADDED
report/images/runtime_distribution.png ADDED
report/images/top_genres.png ADDED
report/images/top_languages.png ADDED
report/images/top_production_companies.png ADDED
report/images/top_production_countries.png ADDED
report/images/vote_average_distribution.png ADDED
report/images/vote_count_distribution.png ADDED
report/images/vote_count_vs_average.png ADDED
report/images/wordcloud_overview.png ADDED

Git LFS Details

  • SHA256: b9ee4349be9564ed7c5161b5ab442159fa67031589eb87b84742a5f71a8be378
  • Pointer size: 131 Bytes
  • Size of remote file: 617 kB
report/images/wordcloud_title.png ADDED

Git LFS Details

  • SHA256: 96d11bd23abcb7b66230c0fb596c74e6a5a65d03398b939d7ee9c352d7435a4a
  • Pointer size: 131 Bytes
  • Size of remote file: 626 kB
report/images/world_production_map.png ADDED
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ matplotlib
4
+ seaborn
5
+ missingno
6
+ wordcloud
7
+ plotly
8
+ pycountry
9
+ kaleido
10
+ scikit-learn
11
+ scikit-surprise
12
+ gradio
src/__pycache__/collaborative.cpython-310.pyc ADDED
Binary file (2.92 kB). View file
 
src/__pycache__/collaborative.cpython-313.pyc ADDED
Binary file (4.77 kB). View file
 
src/__pycache__/content_based.cpython-310.pyc ADDED
Binary file (4.62 kB). View file
 
src/__pycache__/content_based.cpython-313.pyc ADDED
Binary file (8.07 kB). View file
 
src/__pycache__/eda.cpython-310.pyc ADDED
Binary file (11.9 kB). View file
 
src/__pycache__/eda.cpython-313.pyc ADDED
Binary file (25 kB). View file
 
src/__pycache__/evaluation.cpython-310.pyc ADDED
Binary file (5.54 kB). View file
 
src/__pycache__/feature_engineering.cpython-310.pyc ADDED
Binary file (14.7 kB). View file
 
src/__pycache__/feature_engineering.cpython-313.pyc ADDED
Binary file (25.6 kB). View file
 
src/__pycache__/hybrid.cpython-310.pyc ADDED
Binary file (2.44 kB). View file
 
src/__pycache__/modeling.cpython-310.pyc ADDED
Binary file (8.28 kB). View file
 
src/__pycache__/modeling.cpython-313.pyc ADDED
Binary file (12.5 kB). View file
 
src/__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (6.93 kB). View file
 
src/__pycache__/preprocessing.cpython-313.pyc ADDED
Binary file (13.2 kB). View file
 
src/eda.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import os
4
+ import pandas as pd
5
+ from wordcloud import WordCloud, STOPWORDS
6
+ import plotly.graph_objs as go
7
+ import plotly.io as pio
8
+ import pycountry
9
+
10
+
11
class EDA:
    """Render the exploratory plots for the movie dataset and save them as images.

    Every ``plot_*`` method draws one figure (or a small family of related
    figures) and writes it into ``self.img_path``; ``run_all`` calls them all.
    """

    def __init__(self, dfs, img_path=None):
        """Store the data frames and make sure the output directory exists.

        Args:
            dfs: Mapping with the keys ``df``, ``credits_df``, ``keywords_df``,
                ``links_df``, ``ratings_df`` and ``merged_df``.
            img_path: Directory that receives the generated images. Defaults
                to ``report/images`` under the project root, resolved from
                this file's location instead of a machine-specific absolute
                path.
        """
        self.df = dfs["df"]
        self.credits_df = dfs["credits_df"]
        self.keywords_df = dfs["keywords_df"]
        self.links_df = dfs["links_df"]
        self.ratings_df = dfs["ratings_df"]
        self.merged_df = dfs["merged_df"]
        if img_path is None:
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            img_path = os.path.join(project_root, "report", "images")
        self.img_path = img_path
        os.makedirs(self.img_path, exist_ok=True)

    # ------------------------------------------------------------------ helpers

    def _save_current_figure(self, filename):
        """Write the active matplotlib figure to ``img_path`` and close it."""
        plt.savefig(os.path.join(self.img_path, filename), bbox_inches='tight')
        plt.close()

    def _plot_histogram(self, values, bins, title, xlabel, filename,
                        figsize=(10, 6), ylabel='Frequency'):
        """Draw and save a histogram of ``values``."""
        plt.figure(figsize=figsize)
        sns.histplot(values, bins=bins, kde=False)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        self._save_current_figure(filename)

    def _plot_scatter(self, data, x, y, title, xlabel, ylabel, filename):
        """Draw and save a scatter plot of column ``y`` against column ``x``."""
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=data, x=x, y=y)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        self._save_current_figure(filename)

    @staticmethod
    def _count_tokens(column, exclude=()):
        """Count the comma-separated tokens found in the string cells of ``column``."""
        counts = {}
        for cell in column.dropna():
            if not isinstance(cell, str):
                continue
            for token in cell.split(','):
                token = token.strip()
                if token and token not in exclude:
                    counts[token] = counts.get(token, 0) + 1
        return counts

    def _plot_top_counts(self, counts, top_n, title, xlabel, filename, figsize, palette):
        """Draw and save a bar chart of the ``top_n`` most frequent keys of ``counts``."""
        top = pd.Series(counts).sort_values(ascending=False).head(top_n)
        plt.figure(figsize=figsize)
        sns.barplot(x=top.index, y=top.values, palette=palette)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Frequency')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        self._save_current_figure(filename)

    def _save_plotly_figure(self, fig, stem):
        """Save ``fig`` as ``<stem>.png``, falling back to ``<stem>.html``.

        Static export requires kaleido. Export stays best-effort, but failures
        are reported as warnings instead of being silently discarded.
        """
        import warnings

        try:
            # Use plotly.io.write_image for better compatibility
            pio.write_image(fig, os.path.join(self.img_path, f"{stem}.png"))
            return
        except Exception as png_error:
            warnings.warn(
                f"Static export of {stem!r} failed ({png_error}); saving HTML instead."
            )
        try:
            fig.write_html(os.path.join(self.img_path, f"{stem}.html"))
        except Exception as html_error:
            warnings.warn(f"HTML export of {stem!r} failed as well ({html_error}).")

    # -------------------------------------------------------------------- plots

    def plot_rating_distribution(self):
        """Histogram of the ``rating`` column of the merged frame."""
        self._plot_histogram(
            self.merged_df['rating'], 10,
            'Distribution of Movie Ratings', 'Rating',
            "rating_distribution.png",
        )

    def plot_release_year_distribution(self):
        """Histogram of release years parsed from ``release_date``."""
        # Unparseable dates become NaT and are dropped before plotting.
        release_year = pd.to_datetime(self.merged_df['release_date'], errors='coerce').dt.year
        self._plot_histogram(
            release_year.dropna(), 50,
            'Distribution of Movie Release Years', 'Release Year',
            "release_year_distribution.png",
            figsize=(12, 6), ylabel='Number of Movies',
        )

    def plot_budget_vs_revenue(self):
        """Scatter plots of budget against revenue, unfiltered and filtered.

        Note: this converts the ``budget`` and ``revenue`` columns of
        ``self.merged_df`` to numeric in place and fills their missing values
        with 0.
        """
        # Coerce before the first plot so both figures are drawn from numeric
        # data; values that cannot be parsed become NaN.
        for column in ('budget', 'revenue'):
            self.merged_df[column] = pd.to_numeric(self.merged_df[column], errors='coerce')
        self._plot_scatter(
            self.merged_df, 'budget', 'revenue',
            'Relationship between Movie Budget and Revenue',
            'Budget', 'Revenue', "budget_vs_revenue.png",
        )

        # Fill NaN values in 'budget' and 'revenue' with 0, as 0 budget/revenue is a meaningful value
        for column in ('budget', 'revenue'):
            self.merged_df[column] = self.merged_df[column].fillna(0)

        # Filter out movies with zero budget AND zero revenue
        has_money = (self.merged_df['budget'] > 0) | (self.merged_df['revenue'] > 0)
        self._plot_scatter(
            self.merged_df[has_money], 'budget', 'revenue',
            'Relationship between Movie Budget and Revenue (Filtered)',
            'Budget', 'Revenue', "budget_vs_revenue_filtered.png",
        )

    def plot_genre_counts(self):
        """Bar chart of the 15 most frequent genres in ``self.df``."""
        self._plot_top_counts(
            self._count_tokens(self.df['genres']), 15,
            'Top Movie Genres by Frequency', 'Genre',
            "top_genres.png", figsize=(12, 8), palette='viridis',
        )

    def plot_popularity_distribution(self):
        """Histograms of popularity: all values, below 100 and below 10."""
        popularity = self.merged_df['popularity']
        self._plot_histogram(
            popularity, 50,
            'Distribution of Movie Popularity', 'Popularity',
            "popularity_distribution.png",
        )
        self._plot_histogram(
            popularity[popularity < 100], 50,
            'Distribution of Movie Popularity (Popularity < 100)', 'Popularity',
            "popularity_distribution_lt100.png",
        )
        self._plot_histogram(
            popularity[popularity < 10], 50,
            'Distribution of Movie Popularity (Popularity < 10)', 'Popularity',
            "popularity_distribution_lt10.png",
        )

    def plot_runtime_distribution(self):
        """Histogram of the non-missing ``runtime`` values."""
        self._plot_histogram(
            self.merged_df['runtime'].dropna(), 50,
            'Distribution of Movie Runtimes', 'Runtime (minutes)',
            "runtime_distribution.png",
        )

    def plot_production_company_counts(self):
        """Bar chart of the 15 most frequent production companies."""
        top_n_companies = 15
        self._plot_top_counts(
            self._count_tokens(self.merged_df['production_companies'], exclude=('Unknown',)),
            top_n_companies,
            f'Top {top_n_companies} Production Companies', 'Production Company',
            "top_production_companies.png", figsize=(14, 8), palette='viridis',
        )

    def plot_production_country_counts(self):
        """Bar chart of the 15 most frequent production countries."""
        top_n_countries = 15
        self._plot_top_counts(
            self._count_tokens(self.merged_df['production_countries'], exclude=('Unknown',)),
            top_n_countries,
            f'Top {top_n_countries} Production Countries', 'Production Country',
            "top_production_countries.png", figsize=(14, 8), palette='magma',
        )

    def plot_language_counts(self):
        """Bar chart of the 15 most frequent spoken languages."""
        self._plot_top_counts(
            self._count_tokens(self.merged_df['spoken_languages'], exclude=('Unknown',)),
            15,
            'Top 15 Spoken Languages', 'Language',
            "top_languages.png", figsize=(12, 8), palette='viridis',
        )

    def plot_vote_count_distribution(self):
        """Histogram of the ``vote_count`` column."""
        self._plot_histogram(
            self.merged_df['vote_count'], 50,
            'Distribution of Movie Vote Counts', 'Vote Count',
            "vote_count_distribution.png",
        )

    def plot_vote_average_distribution(self):
        """Histogram of the ``vote_average`` column."""
        self._plot_histogram(
            self.merged_df['vote_average'], 20,
            'Distribution of Movie Vote Averages', 'Vote Average',
            "vote_average_distribution.png",
        )

    def plot_vote_count_vs_average(self):
        """Scatter plot of vote average against vote count."""
        self._plot_scatter(
            self.merged_df, 'vote_count', 'vote_average',
            'Relationship between Vote Count and Vote Average',
            'Vote Count', 'Vote Average', "vote_count_vs_average.png",
        )

    def plot_wordclouds(self):
        """Word clouds built from all titles and from all overviews in ``self.df``."""
        sources = (
            ('title', "wordcloud_title.png"),
            ('overview', "wordcloud_overview.png"),
        )
        for column, filename in sources:
            # astype('str') also turns missing values into text so join() works.
            corpus = ' '.join(self.df[column].astype('str'))
            cloud = WordCloud(
                stopwords=STOPWORDS, background_color='white', height=2000, width=4000
            ).generate(corpus)
            plt.figure(figsize=(16, 8))
            plt.imshow(cloud)
            plt.axis('off')
            plt.tight_layout()
            self._save_current_figure(filename)

    def plot_world_production_map(self):
        """Choropleth of the number of movies per production country, US excluded."""
        # NOTE(review): counts are taken over whole cells; a cell naming
        # several countries is presumably one comma-joined string that fails
        # the ISO lookup below and is dropped - confirm this is intended.
        country_counts = self.df['production_countries'].value_counts().reset_index()
        country_counts.columns = ['country', 'num_movies']
        country_counts = country_counts[country_counts['country'] != "United States of America"]

        def get_iso3(country_name):
            # pycountry signals an unknown name with LookupError.
            try:
                return pycountry.countries.lookup(country_name).alpha_3
            except LookupError:
                return None

        country_counts['iso_alpha'] = country_counts['country'].apply(get_iso3)
        country_counts = country_counts.dropna(subset=['iso_alpha'])

        data = [go.Choropleth(
            locations=country_counts['iso_alpha'],
            z=country_counts['num_movies'],
            text=country_counts['country'],
            colorscale=[[0, 'rgb(255,255,255)'], [1, 'rgb(255,0,0)']],
            autocolorscale=False,
            reversescale=False,
            marker=dict(line=dict(color='rgb(180,180,180)', width=0.5)),
            colorbar=dict(title='Production Countries'),
        )]

        layout = dict(
            title='Production Countries for the MovieLens Movies (Apart from US)',
            geo=dict(
                showframe=False,
                showcoastlines=False,
                projection=dict(type='mercator'),
            ),
        )

        fig = go.Figure(data=data, layout=layout)
        self._save_plotly_figure(fig, "world_production_map")

    def plot_decade_pie(self):
        """Pie chart of the number of movies released per decade."""
        import plotly.express as px

        release_year = pd.to_datetime(self.df['release_date'], errors='coerce').dt.year
        decade = (release_year // 10) * 10
        # value_counts() ignores missing decades, so the int cast below is safe.
        decade_counts = decade.value_counts().sort_index().reset_index()
        decade_counts.columns = ['decade', 'num_movies']
        decade_counts['decade'] = decade_counts['decade'].astype(int).astype(str) + "s"
        fig = px.pie(
            decade_counts,
            names='decade',
            values='num_movies',
            title="Movies Distribution by Decade (Release Date)",
            color_discrete_sequence=px.colors.qualitative.Set3,
        )
        self._save_plotly_figure(fig, "movies_by_decade_pie")

    def run_all(self):
        """Generate every plot, in a fixed order."""
        self.plot_rating_distribution()
        self.plot_release_year_distribution()
        self.plot_budget_vs_revenue()
        self.plot_genre_counts()
        self.plot_popularity_distribution()
        self.plot_runtime_distribution()
        self.plot_production_company_counts()
        self.plot_production_country_counts()
        self.plot_language_counts()
        self.plot_vote_count_distribution()
        self.plot_vote_average_distribution()
        self.plot_vote_count_vs_average()
        self.plot_wordclouds()
        self.plot_world_production_map()
        self.plot_decade_pie()
src/evaluation.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from collections import defaultdict
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+
6
def leave_one_out_by_timestamp(ratings_df):
    """Split ratings into train/test by holding out each user's latest rating.

    Rows are ordered by ``userId`` then ``timestamp``. For every user the
    last row goes to the test split and all earlier rows go to the train
    split. A user with a single rating therefore contributes that rating
    to the test split and nothing to the train split.

    Returns:
        ``(train, test)`` DataFrames selected from the sorted input by index label.
    """
    ordered = ratings_df.sort_values(['userId', 'timestamp'])
    train_labels = []
    test_labels = []
    for _, user_rows in ordered.groupby('userId'):
        # Every group has at least one row; "earlier" is empty for single-rating users.
        *earlier, latest = user_rows.index
        test_labels.append(latest)
        train_labels.extend(earlier)
    return ordered.loc[train_labels], ordered.loc[test_labels]
18
+
19
def precision_at_k(ranked_lists, k=10):
    """Mean Precision@k over all users.

    Args:
        ranked_lists: mapping ``uid -> [(iid, est, true_rating), ...]`` already
            sorted by estimated score.
        k: cut-off; the denominator is always ``k``, even when a user has
            fewer than ``k`` ranked items.

    Returns:
        The average fraction of the top-k items whose true rating is >= 4,
        or ``0.0`` when ``ranked_lists`` is empty (instead of NaN plus a
        NumPy RuntimeWarning).
    """
    precisions = [
        sum(1 for _, _, true_r in items[:k] if true_r >= 4) / k
        for items in ranked_lists.values()
    ]
    if not precisions:
        return 0.0
    return np.mean(precisions)
25
+
26
def recall_at_k(ranked_lists, test_truth, k=10):
    """Mean Recall@k over users that have at least one relevant test item.

    Args:
        ranked_lists: mapping ``uid -> [(iid, est, true_rating), ...]``.
        test_truth: either a DataFrame with ``userId``, ``movieId`` and
            ``rating`` columns, or an iterable of rows whose first three
            entries are ``(uid, iid, rating)``.
        k: cut-off applied to each ranked list.

    Returns:
        The average share of each user's relevant items (rating >= 4) found
        in their top-k list. Users without relevant items are skipped; if no
        user qualifies the result is ``0.0`` rather than NaN.
    """
    truth = defaultdict(set)
    # Accept both DataFrame and ndarray for test_truth
    if isinstance(test_truth, pd.DataFrame):
        # Filter once and walk two columns instead of iterrows(), which is
        # slow and upcasts ids to float when the rating column is float.
        liked = test_truth.loc[test_truth['rating'] >= 4]
        for uid, iid in zip(liked['userId'], liked['movieId']):
            truth[uid].add(iid)
    else:
        for row in test_truth:
            # row can be (uid, iid, r, ...) or (uid, iid, r)
            uid, iid, r = row[:3]
            if r >= 4:
                truth[uid].add(iid)

    recalls = []
    for uid, items in ranked_lists.items():
        relevant = truth.get(uid)
        if relevant:
            recommended = {iid for iid, _, _ in items[:k]}
            recalls.append(len(recommended & relevant) / len(relevant))
    if not recalls:
        return 0.0
    return np.mean(recalls)
47
+
48
def ndcg_at_k(ranked_lists, k=10):
    """Mean NDCG@k with binary relevance (true rating >= 4).

    Args:
        ranked_lists: mapping ``uid -> [(iid, est, true_rating), ...]`` sorted
            by estimated score.
        k: cut-off for both the actual and the ideal ranking.

    Returns:
        The average of DCG@k / IDCG@k over users that have at least one
        relevant item anywhere in their list, or ``0.0`` if there is no such
        user.

    The ideal ranking is built from the user's *whole* list, not just its
    top-k slice. Building it from the slice alone would ignore relevant items
    ranked below position k, so such a user would be scored 1.0 or dropped
    instead of being penalised.
    """
    ndcgs = []
    for items in ranked_lists.values():
        all_rels = [1 if true_r >= 4 else 0 for _, _, true_r in items]
        top_rels = all_rels[:k]
        ideal_rels = sorted(all_rels, reverse=True)[:k]

        dcg = sum((2 ** rel - 1) / np.log2(pos + 2) for pos, rel in enumerate(top_rels))
        idcg = sum((2 ** rel - 1) / np.log2(pos + 2) for pos, rel in enumerate(ideal_rels))
        if idcg > 0:
            ndcgs.append(dcg / idcg)
    if not ndcgs:
        return 0.0
    return np.mean(ndcgs)
62
+
63
def catalog_coverage(ranked_lists, all_items):
    """Return the number of distinct recommended items divided by ``len(all_items)``."""
    seen_items = set()
    for recs in ranked_lists.values():
        seen_items.update(item_id for item_id, _est, _true in recs)
    return len(seen_items) / len(all_items)
66
+
67
def novelty(ranked_lists, item_popularity):
    """Mean self-information (``-log2 p``) of every recommended item.

    ``p`` is the item's popularity count divided by the sum of all counts in
    ``item_popularity``; items missing from the mapping are treated as having
    a count of 1. A small epsilon keeps the logarithm finite.
    """
    popularity_total = sum(item_popularity.values())
    information = [
        -np.log2(item_popularity.get(item_id, 1) / popularity_total + 1e-9)
        for recs in ranked_lists.values()
        for item_id, _est, _true in recs
    ]
    return np.mean(information)
75
+
76
def intra_list_diversity(ranked_lists, item_features):
    """Mean intra-list diversity: 1 minus the average pairwise cosine similarity.

    For each ranked list, feature vectors are looked up in ``item_features``
    (items without an entry are ignored). Lists with fewer than two known
    items are skipped. The result is the mean over the remaining lists.
    """
    per_list_scores = []
    for recs in ranked_lists.values():
        vectors = []
        for item_id, _est, _true in recs:
            if item_id in item_features:
                vectors.append(item_features[item_id])
        if len(vectors) <= 1:
            continue
        similarity = cosine_similarity(vectors)
        # Strict upper triangle = each unordered pair exactly once, no self-pairs.
        rows, cols = np.triu_indices_from(similarity, k=1)
        per_list_scores.append(1 - np.mean(similarity[rows, cols]))
    return np.mean(per_list_scores)
86
+
87
def predictions_to_ranked_lists(predictions, k=20):
    """Group predictions per user and keep each user's top-k by estimated score.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_rating, est, details)``.
        k: maximum number of items kept per user.

    Returns:
        A dict ``uid -> [(iid, est, true_rating), ...]`` sorted by ``est``
        in descending order (ties keep their input order).
    """
    grouped = {}
    for uid, iid, true_r, est, _ in predictions:
        grouped.setdefault(uid, []).append((iid, est, true_r))

    ranked = {}
    for uid, candidates in grouped.items():
        candidates.sort(key=lambda entry: entry[1], reverse=True)
        ranked[uid] = candidates[:k]
    return ranked
95
+
96
def evaluate_all(predictions, testset, all_items, item_popularity, item_features, k_list=[10, 20]):
    """Compute all ranking and beyond-accuracy metrics for one set of predictions.

    Ranked lists are built once with the largest cut-off in ``k_list``.
    Precision, Recall and NDCG are reported per cut-off; Coverage, Novelty
    and Diversity are computed once on those full ranked lists.

    Returns:
        A dict mapping metric name (e.g. ``'Precision@10'``) to its value.
    """
    deepest_cutoff = max(k_list)
    ranked_lists = predictions_to_ranked_lists(predictions, k=deepest_cutoff)

    results = {}
    for cutoff in k_list:
        results.update({
            f'Precision@{cutoff}': precision_at_k(ranked_lists, cutoff),
            f'Recall@{cutoff}': recall_at_k(ranked_lists, testset, cutoff),
            f'NDCG@{cutoff}': ndcg_at_k(ranked_lists, cutoff),
        })

    results.update({
        'Coverage': catalog_coverage(ranked_lists, all_items),
        'Novelty': novelty(ranked_lists, item_popularity),
        'Diversity': intra_list_diversity(ranked_lists, item_features),
    })
    return results
107
+
108
def summarize_results(results_dict):
    """Return a DataFrame with one row per key of ``results_dict`` and one column per metric."""
    metrics_by_column = pd.DataFrame(results_dict)
    return metrics_by_column.transpose()
110
+
111
def bootstrap_metric(metric_func, predictions, testset, all_items, item_popularity, item_features, n_bootstrap=100, k=10):
    """Estimate a 95% bootstrap confidence interval for a ranking metric.

    Users are resampled with replacement ``n_bootstrap`` times and
    ``metric_func(ranked_lists, k)`` is evaluated on each resample.

    Args:
        metric_func: callable taking ``(ranked_lists, k)``.
        predictions: iterable of ``(uid, iid, true_rating, est, details)``.
        testset, all_items, item_popularity, item_features: accepted for
            signature compatibility; not used by this function.
        n_bootstrap: number of bootstrap resamples.
        k: cut-off passed to the ranking step and to ``metric_func``.

    Returns:
        Array ``[2.5th percentile, 97.5th percentile]`` of the resampled scores.

    Raises:
        ValueError: if ``predictions`` contains no users.

    Note:
        A user drawn several times appears several times in the resample, so
        the resampled lists are keyed by draw position rather than by user id.
        Filtering predictions by membership in the sampled ids would collapse
        duplicates and only ever evaluate a subset of the users.
    """
    # Rank once; each user's list does not depend on which users are drawn.
    per_user_lists = list(predictions_to_ranked_lists(predictions, k).values())
    n_users = len(per_user_lists)
    if n_users == 0:
        raise ValueError("bootstrap_metric requires at least one prediction")

    scores = []
    for _ in range(n_bootstrap):
        # Draw positions rather than ids so ids of any type survive unchanged.
        draws = np.random.randint(0, n_users, size=n_users)
        resampled = {pos: per_user_lists[idx] for pos, idx in enumerate(draws)}
        scores.append(metric_func(resampled, k))
    return np.percentile(scores, [2.5, 97.5])
121
+
src/feature_engineering.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ import os
5
+ from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
6
+ from sklearn.decomposition import TruncatedSVD
7
+
8
class FeatureEngineering:
    """Feature-engineering pipeline over the merged movie/ratings frame.

    Each ``add_*`` step adds columns to ``self.merged_df``. ``Tfidf`` and
    ``merging_Tfidf`` attach TF-IDF overview features, and ``svd`` reduces the
    numeric features of the de-duplicated movies with TruncatedSVD.
    ``run_all`` executes every step in order and returns the resulting frames.

    List-like text columns (genres, keywords, cast, crew, production
    companies/countries) are treated as comma-separated strings.
    """

    def __init__(self, dfs, interim_path="D:/Uni/Term 6/Machine Learning/HomeWork/6/data/interim/"):
        """Store the input frames and make sure ``interim_path`` exists.

        Args:
            dfs: mapping that provides ``"merged_df"`` and ``"ratings_df"``.
            interim_path: directory created if missing.
                NOTE(review): the default is an absolute drive-letter path;
                callers on other machines should pass this explicitly.
        """
        self.merged_df = dfs["merged_df"]
        self.ratings_df = dfs["ratings_df"]
        self.interim_path = interim_path
        os.makedirs(self.interim_path, exist_ok=True)

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _split_tokens(series):
        """Return a Series of lists: each cell split on commas, stripped, empties dropped."""
        return series.fillna('').apply(
            lambda text: [part.strip() for part in text.split(',') if part.strip()]
        )

    @staticmethod
    def _token_mask(series, token):
        """Boolean mask of rows whose comma-separated cell contains ``token`` exactly.

        The token is regex-escaped and anchored on list delimiters instead of
        ``\\b``. A word boundary never matches after trailing punctuation, so
        names such as "Warner Bros." would otherwise never be found. Missing
        cells count as "no match".
        """
        import re

        pattern = rf'(?:^|,)\s*{re.escape(token)}\s*(?:,|$)'
        return series.fillna('').astype(str).str.contains(pattern, regex=True)

    def _top_n_multi_hot(self, column, prefix, top_n):
        """Multi-hot encode the ``top_n`` most frequent tokens of ``column``.

        Returns a DataFrame aligned with ``self.merged_df`` whose columns are
        named ``f'{prefix}_{token}'``.
        """
        tokens = self._split_tokens(self.merged_df[column])
        frequent = set(
            pd.Series([tok for row in tokens for tok in row]).value_counts().head(top_n).index
        )
        filtered = tokens.apply(lambda row: [tok for tok in row if tok in frequent])
        binarizer = MultiLabelBinarizer()
        return pd.DataFrame(
            binarizer.fit_transform(filtered),
            columns=[f'{prefix}_{label}' for label in binarizer.classes_],
            index=self.merged_df.index,
        )

    # -------------------------------------------------------------------- steps

    def ordering(self):
        """Drop redundant id/title columns and put the remaining columns in a fixed order.

        Columns not listed in the desired order are removed by the reindex;
        listed columns that are missing are created as all-NaN.
        """
        # errors='ignore': a missing id column should not abort the pipeline.
        self.merged_df = self.merged_df.drop(
            columns=['id', 'tmdbId', 'imdbId', 'imdb_id', 'original_title', 'video'],
            errors='ignore',
        )
        desired_column_order = [
            'movieId',
            'title',
            'release_date',
            'runtime',
            'status',
            'adult',
            'budget',
            'revenue',
            'popularity',
            'vote_average',
            'vote_count',
            'overview',
            'genres',
            'keywords',
            'cast',
            'crew',
            'production_companies',
            'production_countries',
            'original_language',
            'userId',
            'rating',
        ]

        self.merged_df = self.merged_df.reindex(columns=desired_column_order)

    def outliers(self):
        """Coerce budget/revenue to numbers and remove implausible or extreme rows.

        Rows need ``runtime > 0`` and non-negative budget and revenue. Then,
        for budget and revenue in turn, rows above that column's 99.5th
        percentile are removed.
        """
        self.merged_df['budget'] = pd.to_numeric(self.merged_df['budget'], errors='coerce').fillna(0)
        self.merged_df['revenue'] = pd.to_numeric(self.merged_df['revenue'], errors='coerce').fillna(0)
        self.merged_df = self.merged_df[self.merged_df['runtime'] > 0]
        self.merged_df = self.merged_df[self.merged_df['budget'] >= 0]
        self.merged_df = self.merged_df[self.merged_df['revenue'] >= 0]

        for col in ['budget', 'revenue']:
            upper = self.merged_df[col].quantile(0.995)
            self.merged_df = self.merged_df[self.merged_df[col] <= upper]

    def add_budget_to_revenue_ratio(self):
        """Add ``budget_to_revenue_ratio`` = budget / revenue (0 where revenue is not positive)."""
        budget = pd.to_numeric(self.merged_df['budget'], errors='coerce').fillna(0)
        revenue = pd.to_numeric(self.merged_df['revenue'], errors='coerce').fillna(0)
        self.merged_df['budget'] = budget
        self.merged_df['revenue'] = revenue
        # Vectorised instead of a row-wise apply: non-positive revenue becomes
        # NaN in the denominator and the resulting NaN ratio is set to 0.
        self.merged_df['budget_to_revenue_ratio'] = (
            budget / revenue.where(revenue > 0)
        ).fillna(0)

    def add_top_genre_onehot(self, top_n=5):
        """Add a ``genre_<name>`` indicator column for the ``top_n`` most frequent genres."""
        genre_dummies = self.merged_df['genres'].str.get_dummies(sep=', ')
        top_genres = genre_dummies.sum().sort_values(ascending=False).head(top_n).index
        for genre in top_genres:
            self.merged_df[f"genre_{genre}"] = genre_dummies[genre]

    def add_log_features(self):
        """Add ``log_<col>`` = log1p(col) for budget, revenue, popularity and vote_count."""
        for col in ['budget', 'revenue', 'popularity', 'vote_count']:
            self.merged_df[f'log_{col}'] = np.log1p(self.merged_df[col])

    def add_interaction_features(self):
        """Add products of budget with popularity and with vote_count."""
        self.merged_df['budget_x_popularity'] = self.merged_df['budget'] * self.merged_df['popularity']
        self.merged_df['budget_x_vote_count'] = self.merged_df['budget'] * self.merged_df['vote_count']

    def add_count_features(self):
        """Add the number of comma-separated entries in genres, keywords, cast and crew."""
        for source, target in (
            ('genres', 'num_genres'),
            ('keywords', 'num_keywords'),
            ('cast', 'num_cast'),
            ('crew', 'num_crew'),
        ):
            self.merged_df[target] = self._split_tokens(self.merged_df[source]).apply(len)

    def add_text_length_features(self):
        """Add character lengths of the overview and the title (missing text counts as 0)."""
        self.merged_df['overview_length'] = self.merged_df['overview'].fillna('').apply(len)
        self.merged_df['title_length'] = self.merged_df['title'].fillna('').apply(len)

    def add_genre_mean_encoding(self):
        """Add ``genre_<name>_mean_vote`` for the first ten genres encountered.

        The column holds the mean ``vote_average`` of all rows carrying that
        genre, for the rows that carry it, and NaN elsewhere. Rows with
        missing genres are ignored, and 'Unknown' is not encoded.
        """
        genres = self.merged_df['genres']
        names = genres.dropna().astype(str).str.split(',').explode().str.strip().unique()
        selected = [name for name in names if name and name != 'Unknown'][:10]
        for genre in selected:
            # One exact-token mask drives both the mean and the assignment so
            # the two can never disagree.
            mask = self._token_mask(genres, genre)
            mean_vote = self.merged_df.loc[mask, 'vote_average'].mean()
            self.merged_df[f'genre_{genre}_mean_vote'] = np.where(mask, mean_vote, np.nan)

    def add_release_date_features(self):
        """Replace ``release_date`` with a numeric ``release_year`` (NaN if unparseable)."""
        release = pd.to_datetime(self.merged_df['release_date'], errors='coerce')
        self.merged_df['release_year'] = release.dt.year
        self.merged_df = self.merged_df.drop(columns=['release_date'])

    def add_adult_flag(self):
        """Replace ``adult`` with ``is_adult`` (1/0; NaN for unrecognised values).

        Accepts real booleans as well as the strings 'True'/'False'. Mapping
        only the strings would turn a boolean column into all-NaN.
        """
        if 'adult' in self.merged_df.columns:
            as_text = self.merged_df['adult'].astype(str).str.strip().str.lower()
            self.merged_df['is_adult'] = as_text.map({'true': 1, 'false': 0})
            self.merged_df = self.merged_df.drop(columns=['adult'])

    def add_multi_hot_keywords(self, top_n=20):
        """Add ``kw_<keyword>`` indicator columns for the ``top_n`` most frequent keywords."""
        keyword_dummies = self._top_n_multi_hot('keywords', 'kw', top_n)
        self.merged_df = pd.concat([self.merged_df, keyword_dummies], axis=1)

    def add_cast_crew_features(self, top_n_cast=5, top_n_crew=5):
        """Add ``cast_<name>`` / ``crew_<name>`` indicators for the most frequent people."""
        cast_dummies = self._top_n_multi_hot('cast', 'cast', top_n_cast)
        crew_dummies = self._top_n_multi_hot('crew', 'crew', top_n_crew)
        self.merged_df = pd.concat([self.merged_df, cast_dummies, crew_dummies], axis=1)

    def add_company_country_features(self, top_n_company=5, top_n_country=5):
        """Add ``company_<name>`` / ``country_<name>`` indicators for the most frequent values."""
        company_dummies = self._top_n_multi_hot('production_companies', 'company', top_n_company)
        country_dummies = self._top_n_multi_hot('production_countries', 'country', top_n_country)
        self.merged_df = pd.concat([self.merged_df, company_dummies, country_dummies], axis=1)

    def add_target_encoding(self, col, target='vote_average', top_n=10):
        """Target-encode the ``top_n`` most frequent values of a comma-separated column.

        For each value ``v`` a column ``f'{col}_{v}_mean_{target}'`` is added:
        the mean of ``target`` over rows containing ``v`` for those rows, and
        0 for all other rows.
        """
        tokens = self._split_tokens(self.merged_df[col])
        top_values = pd.Series([v for row in tokens for v in row]).value_counts().head(top_n).index
        for v in top_values:
            mask = self._token_mask(self.merged_df[col], v)
            mean_val = self.merged_df.loc[mask, target].mean()
            self.merged_df[f'{col}_{v}_mean_{target}'] = mask.astype(int) * mean_val

    def coding(self):
        """Apply target encoding to genres and production companies."""
        self.add_target_encoding(col='genres')
        self.add_target_encoding(col='production_companies')

    def Tfidf(self):
        """Fit a TF-IDF model (max 2100 terms, English stop words) on the overviews.

        The dense result is stored in ``self.tfidf_overview_df`` with columns
        named ``overview_tfidf_<term>``.
        """
        tfidf_overview_vectorizer = TfidfVectorizer(max_features=2100, stop_words='english')
        tfidf_overview_matrix = tfidf_overview_vectorizer.fit_transform(self.merged_df['overview'].fillna(''))
        self.tfidf_overview_df = pd.DataFrame(
            tfidf_overview_matrix.toarray(),
            columns=[f'overview_tfidf_{col}' for col in tfidf_overview_vectorizer.get_feature_names_out()],
            index=self.merged_df.index,
        )

    def merging_Tfidf(self):
        """Combine the original dataframe with the TF-IDF features."""
        self.merged_df_with_tfidf = pd.concat([self.merged_df, self.tfidf_overview_df], axis=1)

    def presvd(self):
        """Impute missing values in the numeric feature columns before SVD.

        Each column with NaNs is filled with its median. A column that is
        entirely NaN has no median and is filled with 0 instead.
        """
        # Exclude non-feature columns and year
        excluded = {'rating', 'movieId', 'userId', 'timestamp', 'release_year'}
        numeric_cols = self.merged_df_with_tfidf.select_dtypes(include=np.number).columns
        for col in numeric_cols:
            if col in excluded:
                continue
            if self.merged_df_with_tfidf[col].isnull().any():
                median_val = self.merged_df_with_tfidf[col].median()
                fill_value = 0 if pd.isna(median_val) else median_val
                self.merged_df_with_tfidf[col] = self.merged_df_with_tfidf[col].fillna(fill_value)

    def svd(self, n_components=150):
        """Reduce the numeric features of the unique movies with TruncatedSVD.

        One row per ``movieId`` is taken (first non-null value per column),
        NaNs are imputed (median, or 0 for all-NaN columns), and the feature
        columns are replaced by ``svd_1 .. svd_n``. The result is stored in
        ``self.unique_movies_reduced``.

        Args:
            n_components: requested output dimensionality; reduced to the
                number of feature columns when fewer are available.
        """
        unique_movies_df = self.merged_df_with_tfidf.groupby('movieId').first().reset_index()
        excluded = {'rating', 'movieId', 'userId', 'timestamp', 'release_year', 'vote_average', 'vote_count'}
        feature_cols = [
            col for col in unique_movies_df.select_dtypes(include=np.number).columns
            if col not in excluded
        ]

        for col in feature_cols:
            if unique_movies_df[col].isnull().any():
                median_val = unique_movies_df[col].median()
                fill_value = 0 if pd.isna(median_val) else median_val
                unique_movies_df[col] = unique_movies_df[col].fillna(fill_value)

        # The default (randomized) solver needs n_components <= n_features.
        n_components = min(n_components, len(feature_cols))
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
        reduced = reducer.fit_transform(unique_movies_df[feature_cols])
        svd_df_unique = pd.DataFrame(
            reduced,
            columns=[f'svd_{i + 1}' for i in range(n_components)],
            index=unique_movies_df.index,
        )
        self.unique_movies_reduced = pd.concat(
            [unique_movies_df.drop(columns=feature_cols), svd_df_unique], axis=1
        )

    def run_all(self):
        """Run every step in order and return the resulting frames.

        Returns:
            dict with ``merged_df``, ``merged_df_with_tfidf`` and
            ``unique_movies_reduced``.
        """
        self.ordering()
        self.outliers()
        self.add_budget_to_revenue_ratio()
        self.add_top_genre_onehot()
        self.add_log_features()
        self.add_interaction_features()
        self.add_count_features()
        self.add_text_length_features()
        self.add_genre_mean_encoding()
        self.add_release_date_features()
        self.add_adult_flag()
        self.add_multi_hot_keywords()
        self.add_cast_crew_features()
        self.add_company_country_features()
        self.coding()
        self.Tfidf()
        self.merging_Tfidf()
        self.presvd()
        self.svd()

        return {
            "merged_df": self.merged_df,
            "merged_df_with_tfidf": self.merged_df_with_tfidf,
            "unique_movies_reduced": self.unique_movies_reduced
        }