"""
Prediction utilities for the book popularity predictor web app.

This module contains helper functions for making predictions and formatting
results.
"""

from typing import Any, Dict, Optional

import numpy as np
import pandas as pd

try:
    import joblib
except ImportError:
    # The formatting helpers in this module do not need joblib; only
    # BookPopularityPredictor.load_model_components does, and it reports the
    # missing dependency through its normal "return False" failure path.
    joblib = None


class BookPopularityPredictor:
    """Wrap the persisted rating model and its preprocessing artifacts.

    Call :meth:`load_model_components` once before :meth:`predict_book_rating`;
    until it succeeds, ``is_loaded`` is ``False`` and predictions return an
    ``{"error": ...}`` dictionary instead of raising.
    """

    # Hard-coded book counts per author (simplified mapping; a real
    # implementation would persist these alongside the model artifacts).
    _AUTHOR_BOOK_COUNTS = {
        'Stephen King': 15,
        'Sarah J. Maas': 12,
        'William Shakespeare': 12,
        'Rick Riordan': 11,
        'Cassandra Clare': 10,
        'J.K. Rowling': 10,
        'C.S. Lewis': 9,
        'J.R.R. Tolkien': 7,
        'Jane Austen': 7,
        'Richelle Mead': 7,
        'Dr. Seuss': 6,
        'Dan Brown': 6,
        'Terry Pratchett': 6,
        'J.R. Ward': 6,
        'Diana Gabaldon': 6,
        'P.C. Cast': 6,
        'Charlaine Harris': 6,
        'Charles Dickens': 6,
        'Neil Gaiman': 6,
        'Marissa Meyer': 5,
    }

    # Book count assumed for authors missing from the table above.
    _DEFAULT_AUTHOR_BOOK_COUNT = 2

    # Ratings-per-review ratio used to estimate a missing reviews count.
    _RATINGS_PER_REVIEW = 8.33

    def __init__(self, models_dir='models'):
        """Create an unloaded predictor.

        Args:
            models_dir: Directory containing the ``.pkl`` artifacts read by
                :meth:`load_model_components`.
        """
        self.models_dir = models_dir
        self.model = None
        self.scaler = None
        self.author_encoder = None
        self.top_authors = []
        self.is_loaded = False

    def load_model_components(self):
        """Load the model, scaler, author encoder and top-authors list.

        All four artifacts are read into local variables first and only then
        assigned to the instance, so a failure part-way through never leaves
        the predictor holding a mix of old and new components.

        Returns:
            ``True`` if every artifact was loaded, ``False`` otherwise (the
            reason is printed and the previous state is left untouched).
        """
        if joblib is None:
            print("Error loading model components: joblib is not installed")
            return False

        try:
            # NOTE: joblib.load unpickles arbitrary objects; models_dir must
            # only ever point at trusted files.
            model = joblib.load(f'{self.models_dir}/book_popularity_model.pkl')
            scaler = joblib.load(f'{self.models_dir}/scaler.pkl')
            author_encoder = joblib.load(f'{self.models_dir}/author_encoder.pkl')
            top_authors = joblib.load(f'{self.models_dir}/top_authors.pkl')
        except Exception as e:
            print(f"Error loading model components: {e}")
            return False

        self.model = model
        self.scaler = scaler
        self.author_encoder = author_encoder
        # Normalise to a plain list so membership tests and slicing behave the
        # same regardless of the container type that was pickled.
        self.top_authors = list(top_authors)
        self.is_loaded = True
        return True

    def predict_book_rating(self, author: str, ratings_count: int,
                            reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """
        Predict the average rating for a book based on its characteristics.

        Args:
            author: Author name (should be from top authors list)
            ratings_count: Number of ratings the book has
            reviews_count: Number of reviews (optional, will be estimated
                if not provided)

        Returns:
            Dictionary containing the prediction and additional information:
            ``predicted_rating``, ``confidence``, ``input_features`` (whose
            ``estimated_reviews`` entry is ``True`` when ``reviews_count`` was
            estimated rather than supplied) and ``derived_features``.
            On failure, a dictionary with a single ``"error"`` key.
        """
        if not self.is_loaded:
            return {"error": "Model not loaded. Please load model components first."}

        try:
            # Remember whether the caller supplied the value *before* it is
            # overwritten, so the result can report it truthfully.
            reviews_estimated = reviews_count is None
            if reviews_estimated:
                # Estimate based on average ratio from training data
                reviews_count = max(1, int(ratings_count / self._RATINGS_PER_REVIEW))

            # Derived features; the +1 avoids division by zero for 0 reviews.
            rating_to_review_ratio = ratings_count / (reviews_count + 1)
            log_ratings_count = float(np.log1p(ratings_count))
            log_reviews_count = float(np.log1p(reviews_count))

            author_book_count = self._get_author_book_count(author)
            author_encoded = self._encode_author(author)

            # Single-row feature matrix. The column order here is presumably
            # the order the scaler and model were fitted with; verify against
            # the training code before changing it.
            features = np.array([[
                ratings_count,
                reviews_count,
                rating_to_review_ratio,
                log_ratings_count,
                log_reviews_count,
                author_book_count,
                author_encoded,
            ]])

            features_scaled = self.scaler.transform(features)

            # Convert to a built-in float so the result is a plain Python
            # number rather than a NumPy scalar.
            prediction = float(self.model.predict(features_scaled)[0])

            confidence = self._calculate_confidence(ratings_count, author)

            return {
                "predicted_rating": round(prediction, 2),
                "confidence": confidence,
                "input_features": {
                    "author": author,
                    "ratings_count": ratings_count,
                    "reviews_count": reviews_count,
                    "estimated_reviews": reviews_estimated,
                },
                "derived_features": {
                    "rating_to_review_ratio": round(rating_to_review_ratio, 2),
                    "log_ratings_count": round(log_ratings_count, 2),
                    "log_reviews_count": round(log_reviews_count, 2),
                    "author_book_count": author_book_count,
                },
            }
        except Exception as e:
            return {"error": f"Prediction error: {str(e)}"}

    def _encode_author(self, author: str) -> int:
        """Encode author name to numerical value.

        Authors unknown to the encoder are encoded as the first entry of
        ``top_authors`` (the most common author) instead.
        """
        if author in self.author_encoder.classes_:
            known_author = author
        else:
            # Fall back to the most common author as default
            known_author = self.top_authors[0]
        return int(self.author_encoder.transform([known_author])[0])

    def _get_author_book_count(self, author: str) -> int:
        """Get the number of books by author in the training dataset.

        Returns the default count for authors missing from the table.
        """
        return self._AUTHOR_BOOK_COUNTS.get(author, self._DEFAULT_AUTHOR_BOOK_COUNT)

    def _calculate_confidence(self, ratings_count: int, author: str) -> str:
        """Calculate prediction confidence based on input characteristics.

        A score is accumulated from the ratings volume, the author's position
        in ``top_authors`` and a fixed model-performance term, then mapped to
        ``"High"`` (>= 80), ``"Medium"`` (>= 60) or ``"Low"``.
        """
        confidence_score = 0

        # Ratings count factor
        if ratings_count >= 100000:
            confidence_score += 40
        elif ratings_count >= 10000:
            confidence_score += 30
        elif ratings_count >= 1000:
            confidence_score += 20
        else:
            confidence_score += 10

        # Author popularity factor
        if author in self.top_authors[:5]:  # Top 5 authors
            confidence_score += 30
        elif author in self.top_authors[:10]:  # Top 10 authors
            confidence_score += 20
        elif author in self.top_authors:  # Remaining top authors
            confidence_score += 15
        else:
            confidence_score += 5

        # Model performance factor (based on typical R² score)
        confidence_score += 20

        if confidence_score >= 80:
            return "High"
        if confidence_score >= 60:
            return "Medium"
        return "Low"

    def get_top_authors(self) -> list:
        """Get the list of top authors for dropdown selection.

        Returns an empty list until the model components have been loaded.
        """
        return self.top_authors if self.is_loaded else []

    def get_rating_distribution_info(self) -> Dict[str, float]:
        """Get information about the rating distribution from training data."""
        return {
            "min_rating": 3.04,
            "max_rating": 4.81,
            "mean_rating": 4.10,
            "median_rating": 4.10,
        }

    def validate_inputs(self, author: str, ratings_count: int,
                        reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """Validate user inputs.

        Returns:
            Dictionary with ``valid`` (``True`` when there are no errors),
            ``errors`` (blocking problems) and ``warnings`` (non-blocking
            notes), the latter two as lists of messages.
        """
        errors = []
        warnings = []

        # Validate author
        if not author:
            errors.append("Author name is required")
        elif author not in self.top_authors:
            warnings.append(
                f"Author '{author}' not in top authors list. "
                "Prediction may be less accurate."
            )

        # Validate ratings count
        if ratings_count <= 0:
            errors.append("Ratings count must be positive")
        elif ratings_count < 100:
            warnings.append("Very low ratings count may result in less reliable prediction")
        elif ratings_count > 10000000:
            warnings.append("Very high ratings count - are you sure this is correct?")

        # Validate reviews count if provided
        if reviews_count is not None:
            if reviews_count < 0:
                errors.append("Reviews count cannot be negative")
            elif reviews_count > ratings_count:
                errors.append("Reviews count cannot exceed ratings count")
            elif reviews_count == 0 and ratings_count > 0:
                warnings.append("No reviews but has ratings - this is unusual")

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }


# Utility functions for the Streamlit app

def format_number(num: int) -> str:
    """Format large numbers with commas for better readability."""
    return f"{num:,}"


def get_rating_color(rating: float) -> str:
    """Get color for rating display based on value."""
    if rating >= 4.5:
        return "green"
    if rating >= 4.0:
        return "blue"
    if rating >= 3.5:
        return "orange"
    return "red"


def get_confidence_color(confidence: str) -> str:
    """Get color for confidence display (``"gray"`` for unknown labels)."""
    colors = {
        "High": "green",
        "Medium": "orange",
        "Low": "red",
    }
    return colors.get(confidence, "gray")


def generate_prediction_explanation(prediction_result: Dict[str, Any]) -> str:
    """Generate a human-readable explanation of the prediction.

    Args:
        prediction_result: Dictionary returned by
            ``BookPopularityPredictor.predict_book_rating``.

    Returns:
        Markdown text describing the prediction, or ``"Error: ..."`` when the
        result carries an ``"error"`` key. A cautionary note is appended when
        the confidence is ``"Low"``.
    """
    if "error" in prediction_result:
        return f"Error: {prediction_result['error']}"

    rating = prediction_result["predicted_rating"]
    confidence = prediction_result["confidence"]
    author = prediction_result["input_features"]["author"]
    ratings_count = prediction_result["input_features"]["ratings_count"]

    lines = [
        "",
        "Based on the machine learning model analysis:",
        "",
        f"📚 **Author**: {author}",
        f"📊 **Expected Ratings**: {format_number(ratings_count)}",
        f"⭐ **Predicted Average Rating**: {rating}/5.0",
        f"🎯 **Confidence Level**: {confidence}",
        "",
        "This prediction is based on patterns learned from 990+ books in the Goodreads dataset,",
        "considering factors like author popularity, expected engagement levels, "
        "and historical rating patterns.",
        "",
    ]

    if confidence == "Low":
        lines += [
            "",
            "⚠️ **Note**: Low confidence predictions may be less reliable. This could be due to",
            "the author not being in our top authors list or unusual rating patterns.",
            "",
        ]

    return "\n".join(lines)