"""
Model training module for book popularity prediction.
This module handles model training, evaluation, and saving.
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
from data_preprocessing import BookDataPreprocessor

class BookPopularityModel:
    """Train, evaluate, persist and query a book popularity regressor.

    The class delegates data loading, feature engineering, scaling and
    author encoding to a ``BookDataPreprocessor`` and fits a scikit-learn
    regressor (random forest or linear regression) on the result.
    """

    # File name of the persisted estimator inside ``models_dir``.
    _MODEL_FILENAME = 'book_popularity_model.pkl'

    # Ratings-per-review ratio used to impute a missing ``reviews_count``.
    # The original author's note calls it the "average ratio from dataset";
    # that cannot be verified from this file.
    _DEFAULT_RATINGS_PER_REVIEW = 8.33

    def __init__(self):
        # Fitted estimator, or None until train_model()/load_model() succeeds.
        self.model = None
        # Owns data loading, feature engineering, the scaler and the encoder.
        self.preprocessor = BookDataPreprocessor()
        # Filled by evaluate_model() for estimators exposing
        # ``feature_importances_``; stays None otherwise.
        self.feature_importance = None

    @staticmethod
    def _build_estimator(model_type):
        """Return an unfitted estimator for *model_type*.

        Raises:
            ValueError: if *model_type* is neither ``'random_forest'`` nor
                ``'linear'``.
        """
        if model_type == 'random_forest':
            return RandomForestRegressor(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                n_jobs=-1,
            )
        if model_type == 'linear':
            return LinearRegression()
        raise ValueError("Model type must be 'random_forest' or 'linear'")

    def _model_path(self, models_dir):
        """Return the path of the persisted estimator inside *models_dir*."""
        return os.path.join(models_dir, self._MODEL_FILENAME)

    def train_model(self, model_type='random_forest'):
        """Train, evaluate and persist the book popularity model.

        Args:
            model_type: ``'random_forest'`` (default) or ``'linear'``.

        Returns:
            True when training, evaluation and saving completed; False when
            the preprocessor's ``load_data()`` returned None.

        Raises:
            ValueError: if *model_type* is not supported. This is checked
                before any data is loaded.
        """
        # Validate the argument first so a typo does not cost a full
        # load/preprocess pass before failing.
        estimator = self._build_estimator(model_type)

        print("Loading and preprocessing data...")
        df = self.preprocessor.load_data()
        if df is None:
            return False

        df_clean = self.preprocessor.clean_data(df)
        df_engineered = self.preprocessor.engineer_features(df_clean)
        # fit=True: the preprocessor fits its transformers on this data.
        # The third returned value (final dataframe) is not needed here.
        X, y, _ = self.preprocessor.prepare_features(df_engineered, fit=True)

        X_train, X_test, y_train, y_test = self.preprocessor.split_data(X, y)

        print(f"Training data shape: {X_train.shape}")
        print(f"Test data shape: {X_test.shape}")

        print(f"Training {model_type} model...")
        estimator.fit(X_train, y_train)
        # Publish the estimator only once it is fitted, so a failed fit never
        # leaves an unfitted model in self.model.
        self.model = estimator

        self.evaluate_model(X_train, X_test, y_train, y_test)

        # Persist both halves: the estimator alone cannot serve predictions
        # without the fitted scaler/encoder held by the preprocessor.
        self.save_model()
        self.preprocessor.save_preprocessor()

        return True

    def evaluate_model(self, X_train, X_test, y_train, y_test):
        """Print train/test metrics, cross-validation scores and importances.

        Reports MSE, R² and MAE on both splits, then a 5-fold
        cross-validated R² on the training split. When the estimator exposes
        ``feature_importances_`` they are stored in
        ``self.feature_importance`` and printed next to the preprocessor's
        feature names.

        Args:
            X_train, X_test: feature matrices for the two splits.
            y_train, y_test: matching target values.
        """
        y_train_pred = self.model.predict(X_train)
        y_test_pred = self.model.predict(X_test)

        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)

        # "\n" (not "\\n"): emit a real blank line instead of the two
        # literal characters backslash + n.
        print("\n" + "=" * 50)
        print("MODEL EVALUATION RESULTS")
        print("=" * 50)
        print(f"Train MSE: {train_mse:.4f}")
        print(f"Test MSE:  {test_mse:.4f}")
        print(f"Train R²:  {train_r2:.4f}")
        print(f"Test R²:   {test_r2:.4f}")
        print(f"Train MAE: {train_mae:.4f}")
        print(f"Test MAE:  {test_mae:.4f}")

        cv_scores = cross_val_score(
            self.model, X_train, y_train, cv=5, scoring='r2'
        )
        print(f"\nCross-validation R² scores: {cv_scores}")
        # The +/- figure is two standard deviations of the fold scores.
        print(f"Mean CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        # Only some estimators (e.g. the random forest) expose importances.
        if hasattr(self.model, 'feature_importances_'):
            self.feature_importance = self.model.feature_importances_
            feature_names = self.preprocessor.get_feature_names()

            print("\nFeature Importance:")
            for name, importance in zip(feature_names, self.feature_importance):
                print(f"{name}: {importance:.4f}")

        print("=" * 50)

    def save_model(self, models_dir='models'):
        """Serialize ``self.model`` into *models_dir* with joblib.

        Args:
            models_dir: target directory; created if it does not exist.
        """
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(models_dir, exist_ok=True)

        model_path = self._model_path(models_dir)
        joblib.dump(self.model, model_path)
        print(f"Model saved to {model_path}")

    def load_model(self, models_dir='models'):
        """Load a previously saved model and its preprocessor state.

        Args:
            models_dir: directory that holds the persisted artifacts.

        Returns:
            True on success; False when a required file is missing
            (``FileNotFoundError``), in which case ``self.model`` is left
            untouched.
        """
        try:
            # NOTE: joblib.load unpickles arbitrary objects; only point this
            # at model files from a trusted source.
            loaded_model = joblib.load(self._model_path(models_dir))

            # NOTE(review): the return value of load_preprocessor is ignored;
            # this assumes it raises on failure -- confirm.
            self.preprocessor.load_preprocessor(models_dir)
        except FileNotFoundError as e:
            print(f"Model not found: {e}")
            return False

        # Assign only after both loads succeeded, so a missing preprocessor
        # file cannot leave a model paired with an unloaded preprocessor.
        self.model = loaded_model
        print("Model and preprocessor loaded successfully!")
        return True

    def predict_single_book(self, author, ratings_count, reviews_count=None):
        """Predict the target value for a single book.

        Args:
            author: author name; names unknown to the encoder fall back to
                encoded label 0.
            ratings_count: number of ratings (non-negative).
            reviews_count: number of reviews (non-negative). When None it is
                estimated as ``ratings_count / 8.33``.

        Returns:
            The prediction rounded to two decimals as a built-in ``float``,
            or None (after printing a message) when no model is available.

        Raises:
            ValueError: if a count is negative.
        """
        if self.model is None:
            print("Model not trained or loaded!")
            return None

        if reviews_count is None:
            reviews_count = ratings_count / self._DEFAULT_RATINGS_PER_REVIEW

        # Negative counts would give NaN from log1p (or a division by zero
        # at reviews_count == -1); reject them with a clear error instead.
        if ratings_count < 0 or reviews_count < 0:
            raise ValueError(
                "ratings_count and reviews_count must be non-negative"
            )

        # Derived features; the +1 keeps the ratio finite for zero reviews.
        rating_to_review_ratio = ratings_count / (reviews_count + 1)
        log_ratings_count = np.log1p(ratings_count)
        log_reviews_count = np.log1p(reviews_count)

        # Placeholder book counts: the real per-author counts from training
        # are not stored, so a fixed 5 (top author) / 1 (other) is used.
        if author in self.preprocessor.top_authors:
            author_book_count = 5
        else:
            author_book_count = 1

        encoder = self.preprocessor.author_encoder
        if author in encoder.classes_:
            author_encoded = encoder.transform([author])[0]
        else:
            # NOTE(review): label 0 is whichever class the encoder assigned
            # first, not necessarily the most common author -- confirm the
            # intended fallback.
            author_encoded = 0

        # NOTE(review): the column order is assumed to match the order the
        # scaler and model were fitted on -- confirm against the preprocessor.
        features = np.array([[
            ratings_count, reviews_count, rating_to_review_ratio,
            log_ratings_count, log_reviews_count, author_book_count,
            author_encoded,
        ]])

        features_scaled = self.preprocessor.scaler.transform(features)
        prediction = self.model.predict(features_scaled)[0]

        # float() turns a NumPy scalar into a plain Python float.
        return round(float(prediction), 2)

    def get_model_info(self):
        """Describe the current model.

        Returns:
            None when no model is available; otherwise a dict with
            ``model_type`` (estimator class name), ``top_authors`` and
            ``feature_names``, plus ``feature_importance`` (name ->
            importance) when the estimator exposes ``feature_importances_``.
        """
        if self.model is None:
            return None

        feature_names = self.preprocessor.get_feature_names()
        info = {
            'model_type': type(self.model).__name__,
            'top_authors': self.preprocessor.top_authors,
            'feature_names': feature_names,
        }

        if hasattr(self.model, 'feature_importances_'):
            info['feature_importance'] = dict(
                zip(feature_names, self.model.feature_importances_)
            )

        return info

def main():
    """Train the random-forest model, then print a smoke-test prediction.

    Runs the full training pipeline through ``BookPopularityModel``. On
    success it predicts one sample book, prints the model type, the number
    of top authors and, when available, the three most important features.
    On failure it prints a message and returns.
    """
    print("Starting book popularity model training...")

    model_trainer = BookPopularityModel()
    success = model_trainer.train_model(model_type='random_forest')

    if not success:
        print("Model training failed!")
        return

    # "\n" (not "\\n") so a real blank line is printed rather than the
    # literal characters backslash + n.
    print("\nModel training completed successfully!")

    # Smoke-test a single prediction with hand-picked sample values.
    print("\nTesting single prediction...")
    # The original note calls this the most frequent author in the dataset;
    # that cannot be verified from this file.
    test_author = "Stephen King"
    test_ratings = 100000
    test_reviews = 12000

    prediction = model_trainer.predict_single_book(
        test_author, test_ratings, test_reviews
    )

    print(f"Prediction for '{test_author}' book with {test_ratings:,} ratings and {test_reviews:,} reviews:")
    print(f"Predicted average rating: {prediction}")

    model_info = model_trainer.get_model_info()
    print(f"\nModel type: {model_info['model_type']}")
    print(f"Number of top authors: {len(model_info['top_authors'])}")

    # Importances are only present for estimators that expose them.
    if 'feature_importance' in model_info:
        print("\nTop 3 most important features:")
        sorted_features = sorted(
            model_info['feature_importance'].items(),
            key=lambda item: item[1],
            reverse=True,
        )
        for feature, importance in sorted_features[:3]:
            print(f"  {feature}: {importance:.4f}")

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()