# goodreads / src / prediction_utils.py
# fguryel's picture
# Deploy ML project
# ce92e54
"""
Prediction utilities for the book popularity predictor web app.
This module contains helper functions for making predictions and formatting results.
"""
import joblib
import numpy as np
import pandas as pd
from typing import Optional, Dict, Any
class BookPopularityPredictor:
    """Load the trained book-popularity model and serve rating predictions.

    The predictor relies on four joblib artifacts stored in ``models_dir``:
    the model, the feature scaler, the author label encoder and the list of
    top authors. Call :meth:`load_model_components` before
    :meth:`predict_book_rating`.
    """

    # Attribute name -> artifact file name inside ``models_dir``.
    _ARTIFACT_FILES = {
        'model': 'book_popularity_model.pkl',
        'scaler': 'scaler.pkl',
        'author_encoder': 'author_encoder.pkl',
        'top_authors': 'top_authors.pkl',
    }

    # Ratings-per-review ratio used to estimate a missing reviews count.
    _RATINGS_PER_REVIEW = 8.33

    # Hard-coded book counts per author. Simplified stand-in: ideally these
    # values would be persisted with the model at training time.
    _AUTHOR_BOOK_COUNTS = {
        'Stephen King': 15,
        'Sarah J. Maas': 12,
        'William Shakespeare': 12,
        'Rick Riordan': 11,
        'Cassandra Clare': 10,
        'J.K. Rowling': 10,
        'C.S. Lewis': 9,
        'J.R.R. Tolkien': 7,
        'Jane Austen': 7,
        'Richelle Mead': 7,
        'Dr. Seuss': 6,
        'Dan Brown': 6,
        'Terry Pratchett': 6,
        'J.R. Ward': 6,
        'Diana Gabaldon': 6,
        'P.C. Cast': 6,
        'Charlaine Harris': 6,
        'Charles Dickens': 6,
        'Neil Gaiman': 6,
        'Marissa Meyer': 5,
    }

    # Book count assumed for authors missing from ``_AUTHOR_BOOK_COUNTS``.
    _UNKNOWN_AUTHOR_BOOK_COUNT = 2

    def __init__(self, models_dir='models'):
        """Create an unloaded predictor.

        Args:
            models_dir: Directory that contains the joblib artifacts.
        """
        self.models_dir = models_dir
        self.model = None
        self.scaler = None
        self.author_encoder = None
        self.top_authors = []
        self.is_loaded = False

    def load_model_components(self):
        """Load all model components from ``models_dir``.

        Every artifact is loaded before any attribute is assigned, so a
        failed (re)load never leaves the predictor half-updated.

        Returns:
            True when every artifact was loaded, False otherwise (the error
            is printed).
        """
        # NOTE: joblib.load unpickles arbitrary objects; only point
        # ``models_dir`` at trusted artifacts.
        try:
            loaded = {
                attr: joblib.load(f'{self.models_dir}/{filename}')
                for attr, filename in self._ARTIFACT_FILES.items()
            }
        except Exception as e:
            print(f"Error loading model components: {e}")
            return False

        self.model = loaded['model']
        self.scaler = loaded['scaler']
        self.author_encoder = loaded['author_encoder']
        # Normalise to a plain list so slicing, ``in`` and truthiness behave
        # the same whatever sequence type was pickled.
        self.top_authors = list(loaded['top_authors'])
        self.is_loaded = True
        return True

    def predict_book_rating(self,
                            author: str,
                            ratings_count: int,
                            reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """Predict the average rating for a book based on its characteristics.

        Args:
            author: Author name (should be from the top authors list).
            ratings_count: Number of ratings the book has.
            reviews_count: Number of reviews; estimated from ``ratings_count``
                when omitted.

        Returns:
            A dictionary with ``predicted_rating``, ``confidence``,
            ``input_features`` and ``derived_features``. On failure the
            dictionary holds a single ``error`` key instead; this method
            does not raise.
        """
        if not self.is_loaded:
            return {"error": "Model not loaded. Please load model components first."}

        try:
            # Remember whether the caller supplied the value *before* it is
            # overwritten, so the result can report it truthfully.
            reviews_estimated = reviews_count is None
            if reviews_estimated:
                reviews_count = max(1, int(ratings_count / self._RATINGS_PER_REVIEW))

            # Negative counts would otherwise surface as NaN features or a
            # division by zero further down.
            if ratings_count < 0 or reviews_count < 0:
                raise ValueError("ratings_count and reviews_count must be non-negative")

            # Derived features (+1 keeps the ratio finite for zero reviews).
            rating_to_review_ratio = ratings_count / (reviews_count + 1)
            log_ratings_count = float(np.log1p(ratings_count))
            log_reviews_count = float(np.log1p(reviews_count))

            author_book_count = self._get_author_book_count(author)
            author_encoded = self._encode_author(author)

            # Feature order must match the order used to fit the scaler/model.
            features = np.array([[
                ratings_count,
                reviews_count,
                rating_to_review_ratio,
                log_ratings_count,
                log_reviews_count,
                author_book_count,
                author_encoded,
            ]])

            features_scaled = self.scaler.transform(features)
            # Convert to a built-in float so the result is plain Python data.
            prediction = float(self.model.predict(features_scaled)[0])

            confidence = self._calculate_confidence(ratings_count, author)

            return {
                "predicted_rating": round(prediction, 2),
                "confidence": confidence,
                "input_features": {
                    "author": author,
                    "ratings_count": ratings_count,
                    "reviews_count": reviews_count,
                    "estimated_reviews": reviews_estimated,
                },
                "derived_features": {
                    "rating_to_review_ratio": round(rating_to_review_ratio, 2),
                    "log_ratings_count": round(log_ratings_count, 2),
                    "log_reviews_count": round(log_reviews_count, 2),
                    "author_book_count": author_book_count,
                },
            }
        except Exception as e:
            # Boundary for the web app: report the failure instead of raising.
            return {"error": f"Prediction error: {str(e)}"}

    def _encode_author(self, author: str) -> int:
        """Encode an author name to the numeric label used by the model.

        Authors unknown to the encoder fall back to the first entry of
        ``top_authors``.

        Raises:
            ValueError: If the author is unknown and ``top_authors`` is empty.
        """
        if author in self.author_encoder.classes_:
            label = author
        elif self.top_authors:
            # Default to the first (most common) top author.
            label = self.top_authors[0]
        else:
            raise ValueError(
                "Cannot encode unknown author: top authors list is empty"
            )
        return int(self.author_encoder.transform([label])[0])

    def _get_author_book_count(self, author: str) -> int:
        """Return the stored book count for ``author`` (2 when unknown)."""
        return self._AUTHOR_BOOK_COUNTS.get(author, self._UNKNOWN_AUTHOR_BOOK_COUNT)

    def _calculate_confidence(self, ratings_count: int, author: str) -> str:
        """Rate prediction confidence as "High", "Medium" or "Low".

        The score is the sum of a ratings-volume factor (10-40), an author
        factor (5-30) and a fixed model-performance factor (20).
        """
        confidence_score = 0

        # Ratings count factor
        if ratings_count >= 100000:
            confidence_score += 40
        elif ratings_count >= 10000:
            confidence_score += 30
        elif ratings_count >= 1000:
            confidence_score += 20
        else:
            confidence_score += 10

        # Author popularity factor
        if author in self.top_authors[:5]:  # Top 5 authors
            confidence_score += 30
        elif author in self.top_authors[:10]:  # Top 10 authors
            confidence_score += 20
        elif author in self.top_authors:  # Remaining top authors
            confidence_score += 15
        else:
            confidence_score += 5

        # Model performance factor (based on typical R² score)
        confidence_score += 20

        if confidence_score >= 80:
            return "High"
        elif confidence_score >= 60:
            return "Medium"
        else:
            return "Low"

    def get_top_authors(self) -> list:
        """Return the top authors for dropdown selection ([] until loaded).

        A copy is returned so callers cannot mutate the predictor's state.
        """
        return list(self.top_authors) if self.is_loaded else []

    def get_rating_distribution_info(self) -> Dict[str, float]:
        """Return fixed summary statistics of the rating distribution."""
        return {
            "min_rating": 3.04,
            "max_rating": 4.81,
            "mean_rating": 4.10,
            "median_rating": 4.10,
        }

    def validate_inputs(self, author: str, ratings_count: int,
                        reviews_count: Optional[int] = None) -> Dict[str, Any]:
        """Validate user inputs.

        Returns:
            A dictionary with ``valid`` (True when there are no errors),
            ``errors`` and ``warnings`` (lists of messages).
        """
        errors = []
        warning_msgs = []

        # Validate author
        if not author:
            errors.append("Author name is required")
        elif author not in self.top_authors:
            warning_msgs.append(
                f"Author '{author}' not in top authors list. Prediction may be less accurate."
            )

        # Validate ratings count (a missing value is reported, not raised)
        if ratings_count is None:
            errors.append("Ratings count is required")
        elif ratings_count <= 0:
            errors.append("Ratings count must be positive")
        elif ratings_count < 100:
            warning_msgs.append("Very low ratings count may result in less reliable prediction")
        elif ratings_count > 10000000:
            warning_msgs.append("Very high ratings count - are you sure this is correct?")

        # Validate reviews count if provided
        if reviews_count is not None:
            if reviews_count < 0:
                errors.append("Reviews count cannot be negative")
            elif ratings_count is not None and reviews_count > ratings_count:
                errors.append("Reviews count cannot exceed ratings count")
            elif reviews_count == 0 and ratings_count is not None and ratings_count > 0:
                warning_msgs.append("No reviews but has ratings - this is unusual")

        return {
            "valid": len(errors) == 0,
            "errors": errors,
            "warnings": warning_msgs,
        }
# Utility functions for the Streamlit app
def format_number(num: int) -> str:
    """Render ``num`` with comma thousands separators (1234567 -> '1,234,567')."""
    return format(num, ",")
def get_rating_color(rating: float) -> str:
    """Map a rating to its display color.

    Ratings of at least 4.5 are "green", at least 4.0 "blue", at least 3.5
    "orange"; anything lower is "red".
    """
    # Thresholds are ordered from highest to lowest; the first match wins.
    tiers = ((4.5, "green"), (4.0, "blue"), (3.5, "orange"))
    return next((color for floor, color in tiers if rating >= floor), "red")
def get_confidence_color(confidence: str) -> str:
    """Map a confidence label to its display color ("gray" when unrecognised)."""
    palette = dict(High="green", Medium="orange", Low="red")
    try:
        return palette[confidence]
    except KeyError:
        # Any label outside High/Medium/Low gets the neutral color.
        return "gray"
def generate_prediction_explanation(prediction_result: Dict[str, Any]) -> str:
    """Generate a human-readable (Markdown) explanation of a prediction.

    Args:
        prediction_result: Dictionary with either an ``error`` key, or
            ``predicted_rating``, ``confidence`` and ``input_features``
            (holding ``author`` and ``ratings_count``).

    Returns:
        ``"Error: <message>"`` for an error result, otherwise a multi-line
        summary; a reliability note is appended when confidence is "Low".
    """
    if "error" in prediction_result:
        return f"Error: {prediction_result['error']}"

    rating = prediction_result["predicted_rating"]
    confidence = prediction_result["confidence"]
    inputs = prediction_result["input_features"]
    author = inputs["author"]
    ratings_count = inputs["ratings_count"]

    # The emoji below were mis-decoded (mojibake) in the original text; they
    # are restored to the intended characters. The leading and trailing
    # empty entries reproduce the newlines that framed the original block.
    explanation = "\n".join([
        "",
        "Based on the machine learning model analysis:",
        f"📚 **Author**: {author}",
        f"📊 **Expected Ratings**: {format_number(ratings_count)}",
        f"⭐ **Predicted Average Rating**: {rating}/5.0",
        f"🎯 **Confidence Level**: {confidence}",
        "This prediction is based on patterns learned from 990+ books in the Goodreads dataset,",
        "considering factors like author popularity, expected engagement levels, and historical rating patterns.",
        "",
    ])

    if confidence == "Low":
        explanation += "\n".join([
            "",
            "⚠️ **Note**: Low confidence predictions may be less reliable.",
            "This could be due to the author not being in our top authors list or unusual rating patterns.",
            "",
        ])

    return explanation