"""
Model training module for book popularity prediction.

This module handles model training, evaluation, and saving.
"""

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

from data_preprocessing import BookDataPreprocessor

# File name of the serialized model inside the models directory.
_MODEL_FILENAME = 'book_popularity_model.pkl'

# Ratings-per-review ratio used to estimate a missing reviews_count.
# The original code describes it as the "average ratio from dataset".
_DEFAULT_RATINGS_PER_REVIEW = 8.33

# Values accepted by BookPopularityModel.train_model(model_type=...).
_SUPPORTED_MODEL_TYPES = ('random_forest', 'linear')


class BookPopularityModel:
    """Train, evaluate, persist and query a book popularity regressor.

    Data loading, cleaning, feature engineering, scaling and author encoding
    are delegated to ``BookDataPreprocessor``; this class owns only the
    scikit-learn estimator and the orchestration around it.
    """

    def __init__(self):
        # Fitted estimator; stays None until train_model() or load_model().
        self.model = None
        self.preprocessor = BookDataPreprocessor()
        # Filled by evaluate_model() when the estimator exposes
        # ``feature_importances_`` (e.g. the random forest).
        self.feature_importance = None

    @staticmethod
    def _model_path(models_dir):
        """Return the path of the serialized model inside *models_dir*."""
        return os.path.join(models_dir, _MODEL_FILENAME)

    def train_model(self, model_type='random_forest'):
        """Train the book popularity prediction model.

        Args:
            model_type: ``'random_forest'`` or ``'linear'``.

        Returns:
            True when training, evaluation and saving completed; False when
            the preprocessor could not load any data.

        Raises:
            ValueError: If *model_type* is not supported. Checked up front so
                an invalid argument fails before any data is processed.
        """
        if model_type not in _SUPPORTED_MODEL_TYPES:
            raise ValueError("Model type must be 'random_forest' or 'linear'")

        # Load and preprocess data
        print("Loading and preprocessing data...")
        df = self.preprocessor.load_data()
        if df is None:
            return False

        df_clean = self.preprocessor.clean_data(df)
        df_engineered = self.preprocessor.engineer_features(df_clean)
        # The third return value (the final frame) is not needed here.
        X, y, _ = self.preprocessor.prepare_features(df_engineered, fit=True)

        # Split data
        X_train, X_test, y_train, y_test = self.preprocessor.split_data(X, y)
        print(f"Training data shape: {X_train.shape}")
        print(f"Test data shape: {X_test.shape}")

        # Initialize model
        if model_type == 'random_forest':
            self.model = RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1,
            )
        else:
            self.model = LinearRegression()

        # Train model
        print(f"Training {model_type} model...")
        self.model.fit(X_train, y_train)

        # Evaluate model
        self.evaluate_model(X_train, X_test, y_train, y_test)

        # Save model and preprocessor
        self.save_model()
        self.preprocessor.save_preprocessor()
        return True

    def evaluate_model(self, X_train, X_test, y_train, y_test):
        """Evaluate the trained model and print a report.

        Prints MSE, R² and MAE for both splits, 5-fold cross-validated R² on
        the training split and, when the estimator provides them, the
        per-feature importances (also stored in ``self.feature_importance``).

        Returns:
            A dict with the computed metrics (``train_mse``, ``test_mse``,
            ``train_r2``, ``test_r2``, ``train_mae``, ``test_mae``,
            ``cv_scores``) so callers can use them programmatically.
        """
        # Predictions
        y_train_pred = self.model.predict(X_train)
        y_test_pred = self.model.predict(X_test)

        # Calculate metrics
        metrics = {
            'train_mse': mean_squared_error(y_train, y_train_pred),
            'test_mse': mean_squared_error(y_test, y_test_pred),
            'train_r2': r2_score(y_train, y_train_pred),
            'test_r2': r2_score(y_test, y_test_pred),
            'train_mae': mean_absolute_error(y_train, y_train_pred),
            'test_mae': mean_absolute_error(y_test, y_test_pred),
        }

        print("\n" + "=" * 50)
        print("MODEL EVALUATION RESULTS")
        print("=" * 50)
        print(f"Train MSE: {metrics['train_mse']:.4f}")
        print(f"Test MSE: {metrics['test_mse']:.4f}")
        print(f"Train R²: {metrics['train_r2']:.4f}")
        print(f"Test R²: {metrics['test_r2']:.4f}")
        print(f"Train MAE: {metrics['train_mae']:.4f}")
        print(f"Test MAE: {metrics['test_mae']:.4f}")

        # Cross-validation score
        cv_scores = cross_val_score(
            self.model, X_train, y_train, cv=5, scoring='r2'
        )
        metrics['cv_scores'] = cv_scores
        print(f"\nCross-validation R² scores: {cv_scores}")
        print(
            f"Mean CV R²: {cv_scores.mean():.4f} "
            f"(+/- {cv_scores.std() * 2:.4f})"
        )

        # Feature importance (if available)
        if hasattr(self.model, 'feature_importances_'):
            self.feature_importance = self.model.feature_importances_
            feature_names = self.preprocessor.get_feature_names()
            print("\nFeature Importance:")
            for name, importance in zip(feature_names, self.feature_importance):
                print(f"{name}: {importance:.4f}")

        print("=" * 50)
        return metrics

    def save_model(self, models_dir='models'):
        """Save the trained model to *models_dir*, creating it if needed."""
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(models_dir, exist_ok=True)
        model_path = self._model_path(models_dir)
        joblib.dump(self.model, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, models_dir='models'):
        """Load a previously trained model and its preprocessor.

        Returns:
            True on success, False when a required file does not exist.
        """
        try:
            self.model = joblib.load(self._model_path(models_dir))
            # Also load preprocessor
            self.preprocessor.load_preprocessor(models_dir)
        except FileNotFoundError as e:
            print(f"Model not found: {e}")
            return False

        print("Model and preprocessor loaded successfully!")
        return True

    def predict_single_book(self, author, ratings_count, reviews_count=None):
        """Predict popularity for a single book.

        Args:
            author: Author name. Authors unknown to the fitted encoder fall
                back to encoded value 0.
            ratings_count: Number of ratings; must be non-negative.
            reviews_count: Number of reviews; must be non-negative. When
                None it is estimated as ``ratings_count / 8.33``.

        Returns:
            The prediction rounded to two decimals as a plain float, or None
            when no model has been trained or loaded.

        Raises:
            ValueError: If a count is negative.
        """
        if self.model is None:
            print("Model not trained or loaded!")
            return None

        # Handle missing reviews_count
        if reviews_count is None:
            reviews_count = ratings_count / _DEFAULT_RATINGS_PER_REVIEW

        # Negative counts are meaningless and would feed NaN/garbage into
        # log1p and the model, so reject them with a clear message.
        if ratings_count < 0 or reviews_count < 0:
            raise ValueError(
                "ratings_count and reviews_count must be non-negative"
            )

        # Calculate additional features (+1 guards against division by zero)
        rating_to_review_ratio = ratings_count / (reviews_count + 1)
        log_ratings_count = np.log1p(ratings_count)
        log_reviews_count = np.log1p(reviews_count)

        # Author book count is not stored from training, so placeholders are
        # used: 5 for a known top author, 1 for anyone else.
        # TODO: persist the real per-author counts in the preprocessor.
        author_book_count = 5 if author in self.preprocessor.top_authors else 1

        # Encode author
        encoder = self.preprocessor.author_encoder
        if author in encoder.classes_:
            author_encoded = encoder.transform([author])[0]
        else:
            # Unknown author: fall back to the first encoded class.
            author_encoded = 0

        # Create feature vector.
        # NOTE(review): the column order presumably has to match what
        # prepare_features() produced when the scaler was fitted — confirm
        # against data_preprocessing.
        features = np.array([[
            ratings_count,
            reviews_count,
            rating_to_review_ratio,
            log_ratings_count,
            log_reviews_count,
            author_book_count,
            author_encoded,
        ]])

        # Scale features
        features_scaled = self.preprocessor.scaler.transform(features)

        # Make prediction; convert the NumPy scalar to a built-in float.
        prediction = self.model.predict(features_scaled)[0]
        return round(float(prediction), 2)

    def get_model_info(self):
        """Get information about the trained model.

        Returns:
            None when no model is available; otherwise a dict with
            ``model_type``, ``top_authors`` and ``feature_names``, plus
            ``feature_importance`` (name -> importance) when the estimator
            exposes ``feature_importances_``.
        """
        if self.model is None:
            return None

        feature_names = self.preprocessor.get_feature_names()
        info = {
            'model_type': type(self.model).__name__,
            'top_authors': self.preprocessor.top_authors,
            'feature_names': feature_names,
        }

        if hasattr(self.model, 'feature_importances_'):
            info['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_)
            )

        return info


def main():
    """Train and evaluate the book popularity prediction model."""
    print("Starting book popularity model training...")

    # Initialize model trainer
    model_trainer = BookPopularityModel()

    # Train model
    success = model_trainer.train_model(model_type='random_forest')
    if not success:
        print("Model training failed!")
        return

    print("\nModel training completed successfully!")

    # Test single prediction
    print("\nTesting single prediction...")
    test_author = "Stephen King"  # Most frequent author in dataset
    test_ratings = 100000
    test_reviews = 12000

    prediction = model_trainer.predict_single_book(
        test_author, test_ratings, test_reviews
    )
    print(
        f"Prediction for '{test_author}' book with {test_ratings:,} ratings "
        f"and {test_reviews:,} reviews:"
    )
    print(f"Predicted average rating: {prediction}")

    # Get model info
    model_info = model_trainer.get_model_info()
    print(f"\nModel type: {model_info['model_type']}")
    print(f"Number of top authors: {len(model_info['top_authors'])}")

    if 'feature_importance' in model_info:
        print("\nTop 3 most important features:")
        sorted_features = sorted(
            model_info['feature_importance'].items(),
            key=lambda item: item[1],
            reverse=True,
        )
        for feature, importance in sorted_features[:3]:
            print(f"  {feature}: {importance:.4f}")


if __name__ == "__main__":
    main()