File size: 4,601 Bytes
c296592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import sys
import os

# Add the parent directory to sys.path so 'src' can be imported
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.preprocessing import Preprocessing
from src.eda import EDA
from src.feature_engineering import FeatureEngineering
from src.modeling import RecommenderModels
from src.evaluation import leave_one_out_by_timestamp, evaluate_all, summarize_results

def main():
    """Run the recommender pipeline end to end and print an evaluation summary.

    Steps, in order: preprocessing, feature engineering, model fitting /
    tuning / saving, and finally a leave-one-out evaluation of the
    content-based, collaborative (SVD) and hybrid predictors. The EDA step
    is present but commented out.

    Returns:
        None. All results are reported via ``print``.
    """
    print("========== Step 1: Preprocessing ==========")
    preprocessor = Preprocessing()
    dfs = preprocessor.run_all()

    # print("========== Step 2: Exploratory Data Analysis (EDA) ==========")
    # eda = EDA(dfs)
    # eda.run_all()

    print("========== Step 3: Feature Engineering ==========")
    fe = FeatureEngineering(dfs)
    fe_outputs = fe.run_all()
    merged_df_with_tfidf = fe_outputs["merged_df_with_tfidf"]
    unique_movies_reduced = fe_outputs["unique_movies_reduced"]
    ratings_df = dfs["ratings_df"]

    print("========== Step 4: Modeling & Recommendation ==========")
    models = RecommenderModels(
        merged_df_with_tfidf=merged_df_with_tfidf,
        unique_movies_reduced=unique_movies_reduced,
        ratings_df=ratings_df,
    )
    models.fit_popularity()
    models.fit_content_based()
    models.fit_cf()
    print("CF RMSEs (kNN, SVD):", models.evaluate_cf())
    # Only the tuned alpha is needed below; the per-alpha scores are discarded.
    _, best_alpha = models.tune_hybrid_alpha()
    print("Best alpha:", best_alpha)
    print("Hybrid RMSE:", models.evaluate_hybrid())
    models.save_models()
    # Example: get recommendations for user 1
    print("Top 10 Content-Based Recommendations for user 1:")
    print(models.get_content_based_recommendations(user_id=1, top_n=10))

    print("========== Step 5: Evaluation ==========")
    # Time-aware split.
    # NOTE(review): `models` was constructed with the full ratings_df above,
    # and the training half of this split is not used anywhere, so the
    # held-out rows may already have been seen during fitting -- confirm.
    _, test_ratings = leave_one_out_by_timestamp(ratings_df)

    movie_ids = merged_df_with_tfidf['movieId']
    all_items = set(movie_ids.astype(str).unique())
    item_popularity = {
        str(movie_id): count
        for movie_id, count in movie_ids.value_counts().to_dict().items()
    }

    # Build the feature lookup column-wise rather than with iterrows():
    # iterrows() hands back each row as a single-dtype Series, so an integer
    # movieId sitting next to float svd_* columns would be stringified as
    # "1.0" and never match the "1" keys used in all_items/item_popularity.
    svd_cols = [col for col in unique_movies_reduced.columns if col.startswith("svd_")]
    item_features = dict(zip(
        unique_movies_reduced['movieId'].astype(str),
        unique_movies_reduced[svd_cols].to_numpy(),
    ))

    def collect_predictions(test_df, score):
        """Score every row of test_df with score(user_id, movie_id).

        Returns a list of (user_id, movie_id, true_rating, predicted_rating,
        {}) tuples, one per row, in row order. itertuples() is used because
        it keeps each column's own dtype, so IDs are not turned into floats.
        """
        return [
            (row.userId, row.movieId, row.rating, score(row.userId, row.movieId), {})
            for row in test_df.itertuples(index=False)
        ]

    # (user_id, movie_id) pairs for which the collaborative model raised.
    cf_fallbacks = []

    def svd_score(user_id, movie_id):
        """Return the SVD estimate for the pair, or 0 if the model raises."""
        # Use SVD as the collaborative model (or knn_user_based if you prefer)
        try:
            return models.svd_mf.predict(str(user_id), str(movie_id)).est
        except Exception:
            # Best-effort by design: keep evaluating, but remember the miss
            # so it can be reported instead of silently skewing the metrics.
            cf_fallbacks.append((user_id, movie_id))
            return 0

    def hybrid_score(user_id, movie_id):
        """Return the hybrid estimate for the pair using the tuned alpha."""
        return models.hybrid_prediction(user_id, movie_id, best_alpha)

    # Generate predictions for each model (insertion order drives the
    # evaluation order and the row order of the summary).
    predictions = {
        "Content-Based": collect_predictions(test_ratings, models.get_content_based_score),
        "Collaborative": collect_predictions(test_ratings, svd_score),
        "Hybrid": collect_predictions(test_ratings, hybrid_score),
    }
    if cf_fallbacks:
        print(f"Collaborative predictions that fell back to 0: {len(cf_fallbacks)}")

    # Evaluate
    results = {
        name: evaluate_all(preds, test_ratings.values, all_items, item_popularity, item_features)
        for name, preds in predictions.items()
    }

    # Print summary table
    summary = summarize_results(results)
    print(summary)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()