"""
Prediction utilities for the book popularity predictor web app.
This module contains helper functions for making predictions and formatting results.
"""

import joblib
import numpy as np
import pandas as pd
from typing import Optional, Dict, Any

class BookPopularityPredictor:
    """Wrap a persisted rating model together with its preprocessing artifacts.

    The artifacts are read from ``models_dir`` by ``load_model_components``.
    Until that call succeeds, ``predict_book_rating`` returns an error dict
    and ``get_top_authors`` returns an empty list.
    """

    # Ratings-per-review ratio used to estimate a missing reviews count.
    _RATINGS_PER_REVIEW = 8.33

    # Book count used for authors that are missing from the table below.
    _DEFAULT_AUTHOR_BOOK_COUNT = 2

    # Simplified hard-coded mapping of author -> number of books; a real
    # implementation would persist this alongside the other artifacts.
    _AUTHOR_BOOK_COUNTS = {
        'Stephen King': 15,
        'Sarah J. Maas': 12,
        'William Shakespeare': 12,
        'Rick Riordan': 11,
        'Cassandra Clare': 10,
        'J.K. Rowling': 10,
        'C.S. Lewis': 9,
        'J.R.R. Tolkien': 7,
        'Jane Austen': 7,
        'Richelle Mead': 7,
        'Dr. Seuss': 6,
        'Dan Brown': 6,
        'Terry Pratchett': 6,
        'J.R. Ward': 6,
        'Diana Gabaldon': 6,
        'P.C. Cast': 6,
        'Charlaine Harris': 6,
        'Charles Dickens': 6,
        'Neil Gaiman': 6,
        'Marissa Meyer': 5,
    }

    def __init__(self, models_dir='models'):
        """Remember where the artifacts live; nothing is loaded yet.

        Args:
            models_dir: Directory containing the ``.pkl`` artifacts.
        """
        self.models_dir = models_dir
        self.model = None
        self.scaler = None
        self.author_encoder = None
        self.top_authors = []
        self.is_loaded = False

    def load_model_components(self):
        """Load the model, scaler, author encoder and top-authors list.

        Every artifact is read into a local first and the attributes are only
        assigned once all four reads succeeded, so a failure part-way through
        never leaves the instance holding a mix of old and new components.

        Returns:
            True if all components were loaded, False otherwise (the error is
            printed).
        """
        # NOTE: joblib.load unpickles its input; only point models_dir at
        # trusted files.
        try:
            model = joblib.load(f'{self.models_dir}/book_popularity_model.pkl')
            scaler = joblib.load(f'{self.models_dir}/scaler.pkl')
            author_encoder = joblib.load(f'{self.models_dir}/author_encoder.pkl')
            top_authors = joblib.load(f'{self.models_dir}/top_authors.pkl')
        except Exception as e:
            print(f"Error loading model components: {e}")
            return False

        self.model = model
        self.scaler = scaler
        self.author_encoder = author_encoder
        self.top_authors = top_authors
        self.is_loaded = True
        return True

    def predict_book_rating(self,
                          author: str,
                          ratings_count: int,
                          reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """
        Predict the average rating for a book based on its characteristics.

        Args:
            author: Author name (should be from top authors list)
            ratings_count: Number of ratings the book has
            reviews_count: Number of reviews (optional, will be estimated if not provided)

        Returns:
            Dictionary with ``predicted_rating``, ``confidence``,
            ``input_features`` and ``derived_features``. On failure, or when
            the components are not loaded, a dictionary with a single
            ``error`` key is returned instead of raising.
        """
        if not self.is_loaded:
            return {"error": "Model not loaded. Please load model components first."}

        try:
            # Record whether the caller supplied a value *before* replacing
            # it, so the result can report this truthfully.
            reviews_estimated = reviews_count is None
            if reviews_estimated:
                reviews_count = max(1, int(ratings_count / self._RATINGS_PER_REVIEW))

            # Derived features; the +1 keeps the ratio finite for 0 reviews.
            rating_to_review_ratio = ratings_count / (reviews_count + 1)
            log_ratings_count = np.log1p(ratings_count)
            log_reviews_count = np.log1p(reviews_count)

            author_book_count = self._get_author_book_count(author)
            author_encoded = self._encode_author(author)

            # Single-row feature matrix, in the order passed to the scaler.
            features = np.array([[
                ratings_count,
                reviews_count,
                rating_to_review_ratio,
                log_ratings_count,
                log_reviews_count,
                author_book_count,
                author_encoded
            ]])

            features_scaled = self.scaler.transform(features)
            prediction = self.model.predict(features_scaled)[0]

            confidence = self._calculate_confidence(ratings_count, author)

            return {
                "predicted_rating": round(prediction, 2),
                "confidence": confidence,
                "input_features": {
                    "author": author,
                    "ratings_count": ratings_count,
                    "reviews_count": reviews_count,
                    "estimated_reviews": reviews_estimated
                },
                "derived_features": {
                    "rating_to_review_ratio": round(rating_to_review_ratio, 2),
                    "log_ratings_count": round(log_ratings_count, 2),
                    "log_reviews_count": round(log_reviews_count, 2),
                    "author_book_count": author_book_count
                }
            }

        except Exception as e:
            # Boundary for the UI: report the failure instead of raising.
            return {"error": f"Prediction error: {str(e)}"}

    def _encode_author(self, author: str) -> int:
        """Encode an author name with the loaded encoder.

        Authors the encoder does not know are encoded as the first entry of
        ``top_authors``.
        """
        if author in self.author_encoder.classes_:
            return self.author_encoder.transform([author])[0]
        # Unknown author: fall back to the first top author's encoding.
        return self.author_encoder.transform([self.top_authors[0]])[0]

    def _get_author_book_count(self, author: str) -> int:
        """Return the book count for ``author`` from the hard-coded table.

        Authors missing from the table get the default count.
        """
        return self._AUTHOR_BOOK_COUNTS.get(author, self._DEFAULT_AUTHOR_BOOK_COUNT)

    def _calculate_confidence(self, ratings_count: int, author: str) -> str:
        """Return "High", "Medium" or "Low" from a simple additive score.

        The score is the sum of a ratings-count tier (10-40), an author tier
        based on position in ``top_authors`` (5-30) and a constant 20.
        """
        confidence_score = 0

        # Ratings count factor
        if ratings_count >= 100000:
            confidence_score += 40
        elif ratings_count >= 10000:
            confidence_score += 30
        elif ratings_count >= 1000:
            confidence_score += 20
        else:
            confidence_score += 10

        # Author factor: earlier positions in top_authors score higher.
        if author in self.top_authors[:5]:
            confidence_score += 30
        elif author in self.top_authors[:10]:
            confidence_score += 20
        elif author in self.top_authors:
            confidence_score += 15
        else:
            confidence_score += 5

        # Constant model performance factor
        confidence_score += 20

        if confidence_score >= 80:
            return "High"
        elif confidence_score >= 60:
            return "Medium"
        else:
            return "Low"

    def get_top_authors(self) -> list:
        """Return the top-authors list, or an empty list if nothing is loaded."""
        return self.top_authors if self.is_loaded else []

    def get_rating_distribution_info(self) -> Dict[str, float]:
        """Return hard-coded summary statistics of the rating distribution."""
        return {
            "min_rating": 3.04,
            "max_rating": 4.81,
            "mean_rating": 4.10,
            "median_rating": 4.10
        }

    def validate_inputs(self, author: str, ratings_count: int,
                       reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """Check user inputs and collect blocking errors and soft warnings.

        Args:
            author: Author name; required, warned about if not a top author.
            ratings_count: Must be positive; unusually small or large values
                only produce a warning.
            reviews_count: Optional; must be between 0 and ``ratings_count``.

        Returns:
            Dictionary with ``valid`` (True when there are no errors),
            ``errors`` and ``warnings`` (lists of messages).
        """
        errors = []
        warnings = []

        # Validate author
        if not author:
            errors.append("Author name is required")
        elif author not in self.top_authors:
            warnings.append(f"Author '{author}' not in top authors list. Prediction may be less accurate.")

        # Validate ratings count
        if ratings_count <= 0:
            errors.append("Ratings count must be positive")
        elif ratings_count < 100:
            warnings.append("Very low ratings count may result in less reliable prediction")
        elif ratings_count > 10000000:
            warnings.append("Very high ratings count - are you sure this is correct?")

        # Validate reviews count if provided
        if reviews_count is not None:
            if reviews_count < 0:
                errors.append("Reviews count cannot be negative")
            elif reviews_count > ratings_count:
                errors.append("Reviews count cannot exceed ratings count")
            elif reviews_count == 0 and ratings_count > 0:
                warnings.append("No reviews but has ratings - this is unusual")

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings
        }

# Utility functions for the Streamlit app
def format_number(num: int) -> str:
    """Render ``num`` with comma thousands separators (e.g. 1234567 -> "1,234,567")."""
    return format(num, ",")

def get_rating_color(rating: float) -> str:
    """Pick a display color for a rating.

    Thresholds are checked from highest to lowest; anything below the last
    threshold is shown in red.
    """
    tiers = (
        (4.5, "green"),
        (4.0, "blue"),
        (3.5, "orange"),
    )
    for floor, color in tiers:
        if rating >= floor:
            return color
    return "red"

def get_confidence_color(confidence: str) -> str:
    """Translate a confidence label into its display color.

    Labels other than "High", "Medium" and "Low" map to gray.
    """
    palette = dict(High="green", Medium="orange", Low="red")
    try:
        return palette[confidence]
    except KeyError:
        return "gray"

def generate_prediction_explanation(prediction_result: Dict[str, Any]) -> str:
    """Turn a prediction result dict into a Markdown-flavoured explanation.

    A result carrying an ``error`` key is rendered as a one-line error
    message. Otherwise the text summarises the author, ratings count,
    predicted rating and confidence, with an extra caveat appended when the
    confidence is "Low".
    """
    if "error" in prediction_result:
        return "Error: {}".format(prediction_result['error'])

    predicted = prediction_result["predicted_rating"]
    level = prediction_result["confidence"]
    inputs = prediction_result["input_features"]
    writer = inputs["author"]
    n_ratings = inputs["ratings_count"]

    # Whitespace (including trailing spaces) is part of the emitted text.
    summary = (
        "\n"
        "    Based on the machine learning model analysis:\n"
        "    \n"
        f"    📚 **Author**: {writer}\n"
        f"    📊 **Expected Ratings**: {format_number(n_ratings)}\n"
        f"    ⭐ **Predicted Average Rating**: {predicted}/5.0\n"
        f"    🎯 **Confidence Level**: {level}\n"
        "    \n"
        "    This prediction is based on patterns learned from 990+ books in the Goodreads dataset, \n"
        "    considering factors like author popularity, expected engagement levels, and historical rating patterns.\n"
        "    "
    )

    if level != "Low":
        return summary

    low_confidence_note = (
        "\n"
        "        \n"
        "        ⚠️ **Note**: Low confidence predictions may be less reliable. \n"
        "        This could be due to the author not being in our top authors list or unusual rating patterns.\n"
        "        "
    )
    return summary + low_confidence_note