"""
Model training module for book popularity prediction.
This module handles model training, evaluation, and saving.
"""

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

from data_preprocessing import BookDataPreprocessor


class BookPopularityModel:
    """Train, evaluate, persist and query a book-popularity regressor.

    The class pairs a scikit-learn estimator with the project's
    ``BookDataPreprocessor``; the preprocessor supplies the data, the
    feature encoders/scaler and the feature names used for reporting.
    """

    # File name used by both save_model() and load_model() so the two
    # can never drift apart.
    _MODEL_FILENAME = 'book_popularity_model.pkl'

    # Fallback divisor used to estimate reviews_count from ratings_count.
    # NOTE(review): presumably the dataset-wide ratings-per-review
    # average -- confirm against the training data.
    _DEFAULT_RATINGS_PER_REVIEW = 8.33

    # Placeholder author_book_count values used at prediction time, since
    # the real per-author count is not available for a single book.
    _TOP_AUTHOR_BOOK_COUNT = 5
    _OTHER_AUTHOR_BOOK_COUNT = 1

    def __init__(self):
        # Fitted estimator; None until train_model() or load_model() succeeds.
        self.model = None
        # Project-local helper that loads data and owns encoders/scaler.
        self.preprocessor = BookDataPreprocessor()
        # Populated by evaluate_model() for estimators that expose
        # ``feature_importances_``.
        self.feature_importance = None

    @staticmethod
    def _build_estimator(model_type):
        """Return an unfitted estimator for *model_type* or raise ValueError."""
        if model_type == 'random_forest':
            return RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1,
            )
        if model_type == 'linear':
            return LinearRegression()
        raise ValueError("Model type must be 'random_forest' or 'linear'")

    def _model_path(self, models_dir):
        """Return the path of the pickled model inside *models_dir*."""
        return os.path.join(models_dir, self._MODEL_FILENAME)

    def train_model(self, model_type='random_forest'):
        """Train the book popularity prediction model.

        Runs the full pipeline: load, clean, engineer and prepare the data
        through the preprocessor, split it, fit the estimator, print the
        evaluation report and persist both model and preprocessor.

        Args:
            model_type: ``'random_forest'`` (default) or ``'linear'``.

        Returns:
            True when training completed, False when the preprocessor's
            ``load_data()`` returned None.

        Raises:
            ValueError: if *model_type* is not one of the supported values.
        """
        print("Loading and preprocessing data...")
        df = self.preprocessor.load_data()
        if df is None:
            return False

        df_clean = self.preprocessor.clean_data(df)
        df_engineered = self.preprocessor.engineer_features(df_clean)
        # The third return value (the final dataframe) is not needed here.
        X, y, _ = self.preprocessor.prepare_features(df_engineered, fit=True)

        X_train, X_test, y_train, y_test = self.preprocessor.split_data(X, y)

        print(f"Training data shape: {X_train.shape}")
        print(f"Test data shape: {X_test.shape}")

        self.model = self._build_estimator(model_type)

        print(f"Training {model_type} model...")
        self.model.fit(X_train, y_train)

        self.evaluate_model(X_train, X_test, y_train, y_test)

        self.save_model()
        self.preprocessor.save_preprocessor()

        return True

    def evaluate_model(self, X_train, X_test, y_train, y_test):
        """Print train/test metrics, cross-validation scores and importances.

        Reports MSE, R² and MAE on both splits, 5-fold cross-validated R²
        on the training split and, for estimators exposing
        ``feature_importances_``, the importance of every feature (also
        stored on ``self.feature_importance``).
        """
        y_train_pred = self.model.predict(X_train)
        y_test_pred = self.model.predict(X_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)

        separator = "=" * 50
        print("\n" + separator)
        print("MODEL EVALUATION RESULTS")
        print(separator)
        print(f"Train MSE: {train_mse:.4f}")
        print(f"Test MSE: {test_mse:.4f}")
        print(f"Train R²: {train_r2:.4f}")
        print(f"Test R²: {test_r2:.4f}")
        print(f"Train MAE: {train_mae:.4f}")
        print(f"Test MAE: {test_mae:.4f}")

        # cross_val_score fits clones of the estimator, so the already
        # fitted self.model is not refitted by this call.
        cv_scores = cross_val_score(
            self.model, X_train, y_train, cv=5, scoring='r2'
        )
        print(f"\nCross-validation R² scores: {cv_scores}")
        print(
            f"Mean CV R²: {cv_scores.mean():.4f} "
            f"(+/- {cv_scores.std() * 2:.4f})"
        )

        if hasattr(self.model, 'feature_importances_'):
            self.feature_importance = self.model.feature_importances_
            feature_names = self.preprocessor.get_feature_names()

            print("\nFeature Importance:")
            for name, importance in zip(feature_names, self.feature_importance):
                print(f"{name}: {importance:.4f}")

        print(separator)

    def save_model(self, models_dir='models'):
        """Save the trained model into *models_dir* (created if missing).

        Nothing is written when no model has been trained or loaded, so a
        useless pickle of ``None`` can never shadow a real model file.
        """
        if self.model is None:
            print("Model not trained or loaded!")
            return

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(models_dir, exist_ok=True)

        model_path = self._model_path(models_dir)
        joblib.dump(self.model, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, models_dir='models'):
        """Load a previously trained model and its preprocessor.

        Returns:
            True when both the model and the preprocessor state were
            loaded, False when a required file was not found. On failure
            ``self.model`` keeps its previous value.
        """
        model_path = self._model_path(models_dir)
        try:
            # SECURITY: joblib.load unpickles the file and can execute
            # arbitrary code -- only load model files from trusted sources.
            loaded_model = joblib.load(model_path)
            self.preprocessor.load_preprocessor(models_dir)
        except FileNotFoundError as e:
            print(f"Model not found: {e}")
            return False

        # Assign only after the preprocessor loaded too, so the object is
        # never left with a model but no matching preprocessor state.
        self.model = loaded_model
        print("Model and preprocessor loaded successfully!")
        return True

    def predict_single_book(self, author, ratings_count, reviews_count=None):
        """Predict popularity for a single book.

        Args:
            author: Author name; looked up in the preprocessor's
                ``top_authors`` and ``author_encoder``.
            ratings_count: Number of ratings the book has.
            reviews_count: Number of text reviews; when None it is
                estimated as ``ratings_count / 8.33``.

        Returns:
            The prediction rounded to two decimals as a float, or None
            when no model has been trained or loaded.
        """
        if self.model is None:
            print("Model not trained or loaded!")
            return None

        if reviews_count is None:
            reviews_count = ratings_count / self._DEFAULT_RATINGS_PER_REVIEW

        # +1 in the denominator keeps the ratio finite for zero reviews.
        rating_to_review_ratio = ratings_count / (reviews_count + 1)
        log_ratings_count = np.log1p(ratings_count)
        log_reviews_count = np.log1p(reviews_count)

        if author in self.preprocessor.top_authors:
            author_book_count = self._TOP_AUTHOR_BOOK_COUNT
        else:
            author_book_count = self._OTHER_AUTHOR_BOOK_COUNT

        encoder = self.preprocessor.author_encoder
        if author in encoder.classes_:
            author_encoded = encoder.transform([author])[0]
        else:
            # Authors unseen by the encoder fall back to code 0.
            author_encoded = 0

        # NOTE(review): the column order presumably has to match the one
        # the preprocessor used when fitting the scaler -- confirm against
        # BookDataPreprocessor.prepare_features.
        features = np.array([[
            ratings_count, reviews_count, rating_to_review_ratio,
            log_ratings_count, log_reviews_count, author_book_count,
            author_encoded,
        ]])

        features_scaled = self.preprocessor.scaler.transform(features)
        prediction = self.model.predict(features_scaled)[0]

        # Convert to a built-in float so callers get a plain Python number.
        return round(float(prediction), 2)

    def get_model_info(self):
        """Get information about the trained model.

        Returns:
            None when no model is available; otherwise a dict with
            ``'model_type'``, ``'top_authors'`` and ``'feature_names'``,
            plus ``'feature_importance'`` (name -> importance) when the
            estimator exposes ``feature_importances_``.
        """
        if self.model is None:
            return None

        feature_names = self.preprocessor.get_feature_names()
        info = {
            'model_type': type(self.model).__name__,
            'top_authors': self.preprocessor.top_authors,
            'feature_names': feature_names,
        }

        if hasattr(self.model, 'feature_importances_'):
            info['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_)
            )

        return info


def main():
    """Train and evaluate the book popularity prediction model.

    Trains a random-forest model, then prints a sample single-book
    prediction, the model type, the number of top authors and (when the
    estimator exposes them) the three most important features. Prints a
    failure message when training returns False.
    """
    print("Starting book popularity model training...")

    model_trainer = BookPopularityModel()

    success = model_trainer.train_model(model_type='random_forest')

    # Guard clause: nothing else to report if training did not complete.
    if not success:
        print("Model training failed!")
        return

    print("\nModel training completed successfully!")

    # Smoke-test the freshly trained model with one hand-picked example.
    print("\nTesting single prediction...")
    test_author = "Stephen King"
    test_ratings = 100000
    test_reviews = 12000

    prediction = model_trainer.predict_single_book(
        test_author, test_ratings, test_reviews
    )

    print(f"Prediction for '{test_author}' book with {test_ratings:,} ratings and {test_reviews:,} reviews:")
    print(f"Predicted average rating: {prediction}")

    model_info = model_trainer.get_model_info()
    print(f"\nModel type: {model_info['model_type']}")
    print(f"Number of top authors: {len(model_info['top_authors'])}")

    if 'feature_importance' in model_info:
        print("\nTop 3 most important features:")
        # Sort (name, importance) pairs by importance, highest first.
        sorted_features = sorted(
            model_info['feature_importance'].items(),
            key=lambda x: x[1],
            reverse=True,
        )
        for feature, importance in sorted_features[:3]:
            print(f"  {feature}: {importance:.4f}")


# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()