"""
Prediction utilities for the book popularity predictor web app.
This module contains helper functions for making predictions and formatting results.
"""
| |
|
import os
from typing import Any, Dict, Optional

import joblib
import numpy as np
import pandas as pd
| |
|
class BookPopularityPredictor:
    """Predict a book's average rating from its author and engagement counts.

    The predictor depends on four joblib artifacts stored in ``models_dir``:
    the trained model, the feature scaler, the author encoder and the list of
    top authors. Call ``load_model_components`` before requesting predictions.
    """

    # Ratings-per-review ratio used to estimate a missing review count.
    _RATINGS_PER_REVIEW = 8.33

    # Book count assumed for authors that are not in the table below.
    _DEFAULT_AUTHOR_BOOK_COUNT = 2

    # Number of books per author in the training dataset.
    _AUTHOR_BOOK_COUNTS = {
        'Stephen King': 15,
        'Sarah J. Maas': 12,
        'William Shakespeare': 12,
        'Rick Riordan': 11,
        'Cassandra Clare': 10,
        'J.K. Rowling': 10,
        'C.S. Lewis': 9,
        'J.R.R. Tolkien': 7,
        'Jane Austen': 7,
        'Richelle Mead': 7,
        'Dr. Seuss': 6,
        'Dan Brown': 6,
        'Terry Pratchett': 6,
        'J.R. Ward': 6,
        'Diana Gabaldon': 6,
        'P.C. Cast': 6,
        'Charlaine Harris': 6,
        'Charles Dickens': 6,
        'Neil Gaiman': 6,
        'Marissa Meyer': 5,
    }

    def __init__(self, models_dir='models'):
        """Create an unloaded predictor.

        Args:
            models_dir: Directory that contains the ``.pkl`` artifacts.
        """
        self.models_dir = models_dir
        self.model = None
        self.scaler = None
        self.author_encoder = None
        self.top_authors = []
        self.is_loaded = False

    def load_model_components(self):
        """Load all model components from ``models_dir``.

        Every artifact is read before any attribute is assigned, so a failed
        (re)load never leaves the predictor with a mix of old and new parts.

        Returns:
            True when every artifact was loaded, False otherwise (the error
            is printed).
        """
        artifact_files = {
            'model': 'book_popularity_model.pkl',
            'scaler': 'scaler.pkl',
            'author_encoder': 'author_encoder.pkl',
            'top_authors': 'top_authors.pkl',
        }
        try:
            # NOTE: joblib artifacts are pickles - only load files from a
            # trusted location.
            loaded = {
                attr: joblib.load(os.path.join(self.models_dir, file_name))
                for attr, file_name in artifact_files.items()
            }
        except Exception as e:
            print(f"Error loading model components: {e}")
            return False

        for attr, value in loaded.items():
            setattr(self, attr, value)
        self.is_loaded = True
        return True

    def predict_book_rating(self,
                            author: str,
                            ratings_count: int,
                            reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """
        Predict the average rating for a book based on its characteristics.

        Args:
            author: Author name (should be from top authors list)
            ratings_count: Number of ratings the book has
            reviews_count: Number of reviews (optional, will be estimated if not provided)

        Returns:
            Dictionary containing prediction and additional information, or a
            dictionary with a single ``"error"`` key when the model is not
            loaded, the counts are negative, or prediction fails.
        """
        if not self.is_loaded:
            return {"error": "Model not loaded. Please load model components first."}

        try:
            # Remember whether the caller supplied the review count *before*
            # it is overwritten with the estimate.
            reviews_estimated = reviews_count is None

            if ratings_count < 0 or (not reviews_estimated and reviews_count < 0):
                # Negative counts would otherwise surface as a division by
                # zero or as NaN/inf features.
                return {
                    "error": "Prediction error: ratings_count and "
                             "reviews_count must be non-negative"
                }

            if reviews_estimated:
                reviews_count = max(1, int(ratings_count / self._RATINGS_PER_REVIEW))

            # +1 keeps the ratio finite when there are no reviews.
            rating_to_review_ratio = ratings_count / (reviews_count + 1)
            log_ratings_count = float(np.log1p(ratings_count))
            log_reviews_count = float(np.log1p(reviews_count))

            author_book_count = self._get_author_book_count(author)
            author_encoded = self._encode_author(author)

            # NOTE(review): the feature order presumably mirrors the columns
            # the scaler and model were fitted on - confirm against training.
            features = np.array([[
                ratings_count,
                reviews_count,
                rating_to_review_ratio,
                log_ratings_count,
                log_reviews_count,
                author_book_count,
                author_encoded,
            ]])

            features_scaled = self.scaler.transform(features)

            # Cast to a builtin float so the result stays JSON-serialisable
            # even when the model returns a NumPy scalar.
            prediction = float(self.model.predict(features_scaled)[0])

            confidence = self._calculate_confidence(ratings_count, author)

            return {
                "predicted_rating": round(prediction, 2),
                "confidence": confidence,
                "input_features": {
                    "author": author,
                    "ratings_count": ratings_count,
                    "reviews_count": reviews_count,
                    "estimated_reviews": reviews_estimated,
                },
                "derived_features": {
                    "rating_to_review_ratio": round(rating_to_review_ratio, 2),
                    "log_ratings_count": round(log_ratings_count, 2),
                    "log_reviews_count": round(log_reviews_count, 2),
                    "author_book_count": author_book_count,
                },
            }

        except Exception as e:
            return {"error": f"Prediction error: {str(e)}"}

    def _encode_author(self, author: str) -> int:
        """Encode author name to numerical value.

        Authors unknown to the encoder are encoded as the first top author.

        Raises:
            ValueError: If the author is unknown and no top authors are loaded.
        """
        if author in self.author_encoder.classes_:
            return int(self.author_encoder.transform([author])[0])

        if len(self.top_authors) == 0:
            raise ValueError(
                f"Unknown author '{author}' and no top authors available as fallback"
            )
        return int(self.author_encoder.transform([self.top_authors[0]])[0])

    def _get_author_book_count(self, author: str) -> int:
        """Get the number of books by author in training dataset.

        Authors missing from the table get the default count of 2.
        """
        return self._AUTHOR_BOOK_COUNTS.get(author, self._DEFAULT_AUTHOR_BOOK_COUNT)

    def _calculate_confidence(self, ratings_count: int, author: str) -> str:
        """Calculate prediction confidence based on input characteristics.

        The score is the sum of a ratings-volume component (10-40), an
        author-rank component (5-30) and a constant 20 points.

        Returns:
            "High" for a score of at least 80, "Medium" for at least 60,
            otherwise "Low".
        """
        confidence_score = 0

        # Ratings volume component.
        if ratings_count >= 100000:
            confidence_score += 40
        elif ratings_count >= 10000:
            confidence_score += 30
        elif ratings_count >= 1000:
            confidence_score += 20
        else:
            confidence_score += 10

        # Author component: position in the top-authors list.
        if author in self.top_authors[:5]:
            confidence_score += 30
        elif author in self.top_authors[:10]:
            confidence_score += 20
        elif author in self.top_authors:
            confidence_score += 15
        else:
            confidence_score += 5

        # Constant component added to every prediction.
        confidence_score += 20

        if confidence_score >= 80:
            return "High"
        elif confidence_score >= 60:
            return "Medium"
        else:
            return "Low"

    def get_top_authors(self) -> list:
        """Get the list of top authors for dropdown selection.

        Returns an empty list until the model components are loaded.
        """
        return self.top_authors if self.is_loaded else []

    def get_rating_distribution_info(self) -> Dict[str, float]:
        """Get information about rating distribution from training data."""
        return {
            "min_rating": 3.04,
            "max_rating": 4.81,
            "mean_rating": 4.10,
            "median_rating": 4.10,
        }

    def validate_inputs(self, author: str, ratings_count: int,
                        reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """Validate user inputs.

        Args:
            author: Author name entered by the user.
            ratings_count: Number of ratings.
            reviews_count: Number of reviews, or None when not provided.

        Returns:
            Dictionary with ``"valid"`` (True when there are no errors),
            ``"errors"`` and ``"warnings"`` (lists of messages).
        """
        errors = []
        warnings = []

        # Author checks.
        if not author:
            errors.append("Author name is required")
        elif author not in self.top_authors:
            warnings.append(f"Author '{author}' not in top authors list. Prediction may be less accurate.")

        # Ratings count checks.
        if ratings_count <= 0:
            errors.append("Ratings count must be positive")
        elif ratings_count < 100:
            warnings.append("Very low ratings count may result in less reliable prediction")
        elif ratings_count > 10000000:
            warnings.append("Very high ratings count - are you sure this is correct?")

        # Reviews count checks (only when the caller supplied a value).
        if reviews_count is not None:
            if reviews_count < 0:
                errors.append("Reviews count cannot be negative")
            elif reviews_count > ratings_count:
                errors.append("Reviews count cannot exceed ratings count")
            elif reviews_count == 0 and ratings_count > 0:
                warnings.append("No reviews but has ratings - this is unusual")

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }
| |
|
| | |
def format_number(num: int) -> str:
    """Return ``num`` rendered with comma thousands separators."""
    return format(num, ",")
| |
|
def get_rating_color(rating: float) -> str:
    """Map a rating to the display color of the band it falls into.

    Bands are checked from highest to lowest: 4.5 and above is green,
    4.0 and above is blue, 3.5 and above is orange; anything else is red.
    """
    bands = (
        (4.5, "green"),
        (4.0, "blue"),
        (3.5, "orange"),
    )
    for lower_bound, color in bands:
        if rating >= lower_bound:
            return color
    return "red"
| |
|
def get_confidence_color(confidence: str) -> str:
    """Return the display color for a confidence label.

    "High", "Medium" and "Low" map to green, orange and red; any other
    label falls back to gray.
    """
    palette = dict(High="green", Medium="orange", Low="red")
    try:
        return palette[confidence]
    except KeyError:
        return "gray"
| |
|
def generate_prediction_explanation(prediction_result: Dict[str, Any]) -> str:
    """Generate human-readable explanation of the prediction.

    Args:
        prediction_result: Either an error dictionary (contains ``"error"``)
            or a prediction dictionary with ``"predicted_rating"``,
            ``"confidence"`` and ``"input_features"`` (holding ``"author"``
            and ``"ratings_count"``).

    Returns:
        ``"Error: <message>"`` for an error dictionary, otherwise a
        Markdown-formatted summary; a reliability note is appended when the
        confidence is "Low".
    """
    if "error" in prediction_result:
        return f"Error: {prediction_result['error']}"

    rating = prediction_result["predicted_rating"]
    confidence = prediction_result["confidence"]
    inputs = prediction_result["input_features"]
    author = inputs["author"]
    ratings_count = inputs["ratings_count"]

    # Icons are written as escapes so the source file stays pure ASCII and
    # cannot be corrupted by a wrong file encoding again.
    # NOTE(review): the author and ratings icons were unrecoverable from the
    # mis-encoded text; books / bar chart are a best-effort choice - confirm.
    icon_author = "\U0001F4DA"   # books
    icon_ratings = "\U0001F4CA"  # bar chart
    icon_rating = "\u2B50"       # star
    icon_target = "\U0001F3AF"   # direct hit
    icon_warning = "\u26A0\uFE0F"  # warning sign (emoji presentation)

    explanation = "\n".join([
        "",
        "Based on the machine learning model analysis:",
        "",
        f"{icon_author} **Author**: {author}",
        f"{icon_ratings} **Expected Ratings**: {format_number(ratings_count)}",
        f"{icon_rating} **Predicted Average Rating**: {rating}/5.0",
        f"{icon_target} **Confidence Level**: {confidence}",
        "",
        "This prediction is based on patterns learned from 990+ books in the Goodreads dataset,",
        "considering factors like author popularity, expected engagement levels, and historical rating patterns.",
        "",
    ])

    if confidence == "Low":
        explanation += "\n".join([
            "",
            "",
            f"{icon_warning} **Note**: Low confidence predictions may be less reliable.",
            "This could be due to the author not being in our top authors list or unusual rating patterns.",
            "",
        ])

    return explanation