"""Run the recommender pipeline end to end.

Steps: preprocessing, feature engineering, model fitting/tuning, and a
leave-one-out evaluation of the content-based, collaborative and hybrid
models. The EDA step is currently disabled (commented out in ``main``).
"""
import os
import sys

# Add the parent directory to sys.path so 'src' can be imported
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from src.preprocessing import Preprocessing
from src.eda import EDA
from src.feature_engineering import FeatureEngineering
from src.modeling import RecommenderModels
from src.evaluation import leave_one_out_by_timestamp, evaluate_all, summarize_results


def _predict_ratings(test_df, score_fn):
    """Score every (userId, movieId) pair of ``test_df`` with ``score_fn``.

    Args:
        test_df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        score_fn: Callable ``(user_id, movie_id) -> predicted rating``.

    Returns:
        A list of ``(user_id, movie_id, true_rating, pred_rating, {})`` tuples,
        one per row, in row order.
    """
    # Iterate column-wise rather than with DataFrame.iterrows(): iterrows()
    # builds one Series per row and upcasts integer ids to float when the row
    # also holds float ratings, so str(user_id) would become "1.0", not "1".
    return [
        (user_id, movie_id, true_rating, score_fn(user_id, movie_id), {})
        for user_id, movie_id, true_rating in zip(
            test_df['userId'], test_df['movieId'], test_df['rating']
        )
    ]


def main():
    """Execute the full pipeline and print the evaluation summary table."""
    print("========== Step 1: Preprocessing ==========")
    preprocessor = Preprocessing()
    dfs = preprocessor.run_all()

    # print("========== Step 2: Exploratory Data Analysis (EDA) ==========")
    # eda = EDA(dfs)
    # eda.run_all()

    print("========== Step 3: Feature Engineering ==========")
    fe = FeatureEngineering(dfs)
    fe_outputs = fe.run_all()
    merged_df_with_tfidf = fe_outputs["merged_df_with_tfidf"]
    unique_movies_reduced = fe_outputs["unique_movies_reduced"]
    ratings_df = dfs["ratings_df"]

    print("========== Step 4: Modeling & Recommendation ==========")
    models = RecommenderModels(
        merged_df_with_tfidf=merged_df_with_tfidf,
        unique_movies_reduced=unique_movies_reduced,
        ratings_df=ratings_df,
    )
    models.fit_popularity()
    models.fit_content_based()
    models.fit_cf()
    print("CF RMSEs (kNN, SVD):", models.evaluate_cf())
    # Only the tuned alpha is used below; the per-alpha scores are discarded.
    _rmse_scores, best_alpha = models.tune_hybrid_alpha()
    print("Best alpha:", best_alpha)
    print("Hybrid RMSE:", models.evaluate_hybrid())
    models.save_models()

    # Example: get recommendations for user 1
    print("Top 10 Content-Based Recommendations for user 1:")
    print(models.get_content_based_recommendations(user_id=1, top_n=10))

    print("========== Step 5: Evaluation ==========")
    # Time-aware split
    train_ratings, test_ratings = leave_one_out_by_timestamp(ratings_df)

    # All item-keyed lookups below use the string form of movieId.
    all_items = set(merged_df_with_tfidf['movieId'].astype(str).unique())
    item_popularity = {
        str(movie_id): count
        for movie_id, count in merged_df_with_tfidf['movieId'].value_counts().to_dict().items()
    }
    svd_cols = [col for col in unique_movies_reduced.columns if col.startswith("svd_")]
    # Build keys from the movieId column itself (same .astype(str) as
    # all_items) instead of from iterrows() rows, whose ids may be upcast to
    # float next to the float svd_* columns and then stringify as "1.0".
    item_features = dict(zip(
        unique_movies_reduced['movieId'].astype(str),
        unique_movies_reduced[svd_cols].to_numpy(),
    ))

    # Generate predictions for each model
    cf_failures = []

    def score_collaborative(user_id, movie_id):
        # Use SVD as the collaborative model (or knn_user_based if you prefer)
        try:
            return models.svd_mf.predict(str(user_id), str(movie_id)).est
        except Exception as exc:
            # Best-effort: keep the original fallback score of 0, but record
            # the failure so it is reported instead of silently hidden.
            cf_failures.append(exc)
            return 0

    predictions_cb = _predict_ratings(test_ratings, models.get_content_based_score)
    predictions_cf = _predict_ratings(test_ratings, score_collaborative)
    predictions_hybrid = _predict_ratings(
        test_ratings,
        lambda user_id, movie_id: models.hybrid_prediction(user_id, movie_id, best_alpha),
    )

    if cf_failures:
        print(
            f"Warning: {len(cf_failures)} collaborative predictions failed and "
            f"were scored 0 (first error: {cf_failures[0]!r})",
            file=sys.stderr,
        )

    # Evaluate
    # NOTE(review): test_ratings.values is a single-dtype array, so integer ids
    # in it may appear as floats — confirm evaluate_all does not stringify them.
    test_array = test_ratings.values
    results_cb = evaluate_all(predictions_cb, test_array, all_items, item_popularity, item_features)
    results_cf = evaluate_all(predictions_cf, test_array, all_items, item_popularity, item_features)
    results_hybrid = evaluate_all(predictions_hybrid, test_array, all_items, item_popularity, item_features)

    # Print summary table
    summary = summarize_results({
        "Content-Based": results_cb,
        "Collaborative": results_cf,
        "Hybrid": results_hybrid,
    })
    print(summary)


if __name__ == "__main__":
    main()