Spaces:

fguryel
/

goodreads

Sleeping

App Files Files Community

goodreads / src /prediction_utils.py

fguryel

Deploy ML project

ce92e54 5 months ago

raw

history blame contribute delete

10.7 kB

	"""
	Prediction utilities for the book popularity predictor web app.
	This module contains helper functions for making predictions and formatting results.
	"""

	import joblib
	import numpy as np
	import pandas as pd
	from typing import Optional, Dict, Any

	class BookPopularityPredictor:
	    """Predict a book's average rating from author and engagement counts.

	    The predictor relies on four artifacts stored in ``models_dir``: the
	    model, a feature scaler, an author encoder and the list of top authors.
	    Call :meth:`load_model_components` before :meth:`predict_book_rating`.
	    """

	    # Ratings-per-review ratio used to estimate a missing reviews count
	    # (per the original author's note: the average ratio in the training data).
	    _RATINGS_PER_REVIEW = 8.33

	    # Book count assumed for authors missing from the table below.
	    _DEFAULT_AUTHOR_BOOK_COUNT = 2

	    # Simplified, hard-coded author -> book-count table. Ideally this would be
	    # persisted next to the other training artifacts instead of living here.
	    _AUTHOR_BOOK_COUNTS = {
	        'Stephen King': 15,
	        'Sarah J. Maas': 12,
	        'William Shakespeare': 12,
	        'Rick Riordan': 11,
	        'Cassandra Clare': 10,
	        'J.K. Rowling': 10,
	        'C.S. Lewis': 9,
	        'J.R.R. Tolkien': 7,
	        'Jane Austen': 7,
	        'Richelle Mead': 7,
	        'Dr. Seuss': 6,
	        'Dan Brown': 6,
	        'Terry Pratchett': 6,
	        'J.R. Ward': 6,
	        'Diana Gabaldon': 6,
	        'P.C. Cast': 6,
	        'Charlaine Harris': 6,
	        'Charles Dickens': 6,
	        'Neil Gaiman': 6,
	        'Marissa Meyer': 5,
	    }

	    def __init__(self, models_dir='models'):
	        """Create an unloaded predictor.

	        Args:
	            models_dir: Directory that holds the ``.pkl`` artifacts.
	        """
	        self.models_dir = models_dir
	        self.model = None
	        self.scaler = None
	        self.author_encoder = None
	        self.top_authors = []
	        self.is_loaded = False

	    def load_model_components(self):
	        """Load the model, scaler, author encoder and top-author list.

	        All four artifacts are read into locals first and only published on
	        the instance once every load succeeded, so a failed (re)load never
	        leaves the predictor holding a mix of old and new components.

	        Returns:
	            True if every artifact was loaded, False otherwise (the error is
	            printed).
	        """
	        # NOTE: joblib.load unpickles its input; only point models_dir at
	        # trusted files.
	        try:
	            model = joblib.load(f'{self.models_dir}/book_popularity_model.pkl')
	            scaler = joblib.load(f'{self.models_dir}/scaler.pkl')
	            author_encoder = joblib.load(f'{self.models_dir}/author_encoder.pkl')
	            top_authors = joblib.load(f'{self.models_dir}/top_authors.pkl')
	        except Exception as e:
	            # Deliberately broad: callers only look at the boolean result.
	            print(f"Error loading model components: {e}")
	            return False

	        self.model = model
	        self.scaler = scaler
	        self.author_encoder = author_encoder
	        self.top_authors = top_authors
	        self.is_loaded = True
	        return True

	    def predict_book_rating(self,
	                            author: str,
	                            ratings_count: int,
	                            reviews_count: Optional[int] = None) -> Dict[str, Any]:
	        """Predict the average rating for a book.

	        Args:
	            author: Author name (ideally one from the top authors list).
	            ratings_count: Number of ratings the book has.
	            reviews_count: Number of reviews; estimated from ``ratings_count``
	                when omitted.

	        Returns:
	            On success, a dict with ``predicted_rating``, ``confidence``,
	            ``input_features`` and ``derived_features``. The
	            ``input_features["estimated_reviews"]`` flag is True exactly when
	            ``reviews_count`` was not supplied and had to be estimated.
	            On failure, a dict with a single ``error`` message.
	        """
	        if not self.is_loaded:
	            return {"error": "Model not loaded. Please load model components first."}

	        try:
	            # Remember whether the caller supplied a value *before* replacing
	            # it with the estimate, so the flag in the result is accurate.
	            reviews_estimated = reviews_count is None
	            if reviews_estimated:
	                reviews_count = max(1, int(ratings_count / self._RATINGS_PER_REVIEW))

	            # Derived features; the +1 keeps the ratio finite for zero reviews.
	            rating_to_review_ratio = ratings_count / (reviews_count + 1)
	            log_ratings_count = np.log1p(ratings_count)
	            log_reviews_count = np.log1p(reviews_count)

	            author_book_count = self._get_author_book_count(author)
	            author_encoded = self._encode_author(author)

	            # Single-row feature matrix, in the order used by the scaler.
	            features = np.array([[
	                ratings_count,
	                reviews_count,
	                rating_to_review_ratio,
	                log_ratings_count,
	                log_reviews_count,
	                author_book_count,
	                author_encoded,
	            ]])

	            features_scaled = self.scaler.transform(features)

	            # Convert to a builtin float so the result holds no NumPy scalar.
	            prediction = float(self.model.predict(features_scaled)[0])

	            confidence = self._calculate_confidence(ratings_count, author)

	            return {
	                "predicted_rating": round(prediction, 2),
	                "confidence": confidence,
	                "input_features": {
	                    "author": author,
	                    "ratings_count": ratings_count,
	                    "reviews_count": reviews_count,
	                    "estimated_reviews": reviews_estimated,
	                },
	                "derived_features": {
	                    "rating_to_review_ratio": round(rating_to_review_ratio, 2),
	                    "log_ratings_count": round(float(log_ratings_count), 2),
	                    "log_reviews_count": round(float(log_reviews_count), 2),
	                    "author_book_count": author_book_count,
	                },
	            }

	        except Exception as e:
	            # Report any failure as an error dict instead of raising.
	            return {"error": f"Prediction error: {str(e)}"}

	    def _encode_author(self, author: str) -> int:
	        """Encode an author name with the loaded author encoder.

	        Authors unknown to the encoder are encoded as the first entry of
	        ``top_authors``.
	        """
	        if author in self.author_encoder.classes_:
	            return self.author_encoder.transform([author])[0]
	        return self.author_encoder.transform([self.top_authors[0]])[0]

	    def _get_author_book_count(self, author: str) -> int:
	        """Return the book count recorded for ``author`` (default for unknowns)."""
	        return self._AUTHOR_BOOK_COUNTS.get(author, self._DEFAULT_AUTHOR_BOOK_COUNT)

	    def _calculate_confidence(self, ratings_count: int, author: str) -> str:
	        """Return a heuristic confidence label: "High", "Medium" or "Low".

	        The score is the sum of a ratings-count factor (10-40), an author
	        factor based on the author's position in ``top_authors`` (5-30) and a
	        fixed model-performance factor (20).
	        """
	        confidence_score = 0

	        # Ratings count factor
	        if ratings_count >= 100000:
	            confidence_score += 40
	        elif ratings_count >= 10000:
	            confidence_score += 30
	        elif ratings_count >= 1000:
	            confidence_score += 20
	        else:
	            confidence_score += 10

	        # Author factor: earlier positions in top_authors score higher.
	        if author in self.top_authors[:5]:
	            confidence_score += 30
	        elif author in self.top_authors[:10]:
	            confidence_score += 20
	        elif author in self.top_authors:
	            confidence_score += 15
	        else:
	            confidence_score += 5

	        # Fixed model performance factor
	        confidence_score += 20

	        if confidence_score >= 80:
	            return "High"
	        elif confidence_score >= 60:
	            return "Medium"
	        else:
	            return "Low"

	    def get_top_authors(self) -> list:
	        """Return the top authors list, or an empty list if nothing is loaded."""
	        return self.top_authors if self.is_loaded else []

	    def get_rating_distribution_info(self) -> Dict[str, float]:
	        """Return hard-coded summary statistics of the rating distribution."""
	        return {
	            "min_rating": 3.04,
	            "max_rating": 4.81,
	            "mean_rating": 4.10,
	            "median_rating": 4.10,
	        }

	    def validate_inputs(self, author: str, ratings_count: int,
	                        reviews_count: Optional[int] = None) -> Dict[str, Any]:
	        """Validate user inputs.

	        Returns:
	            A dict with ``valid`` (True when there are no errors), ``errors``
	            (blocking problems) and ``warnings`` (non-blocking remarks).
	        """
	        errors = []
	        warnings = []

	        # Validate author
	        if not author:
	            errors.append("Author name is required")
	        elif author not in self.top_authors:
	            warnings.append(f"Author '{author}' not in top authors list. Prediction may be less accurate.")

	        # Validate ratings count
	        if ratings_count <= 0:
	            errors.append("Ratings count must be positive")
	        elif ratings_count < 100:
	            warnings.append("Very low ratings count may result in less reliable prediction")
	        elif ratings_count > 10000000:
	            warnings.append("Very high ratings count - are you sure this is correct?")

	        # Validate reviews count if provided
	        if reviews_count is not None:
	            if reviews_count < 0:
	                errors.append("Reviews count cannot be negative")
	            elif reviews_count > ratings_count:
	                errors.append("Reviews count cannot exceed ratings count")
	            elif reviews_count == 0 and ratings_count > 0:
	                warnings.append("No reviews but has ratings - this is unusual")

	        return {
	            "valid": len(errors) == 0,
	            "errors": errors,
	            "warnings": warnings,
	        }

	# Utility functions for the Streamlit app
	def format_number(num: int) -> str:
	    """Render ``num`` with comma thousands separators (1234567 -> "1,234,567")."""
	    # Same "," format spec the f-string form applies.
	    return format(num, ",")

	def get_rating_color(rating: float) -> str:
	    """Map a rating to a display color.

	    Ratings of at least 4.5 are "green", at least 4.0 "blue", at least 3.5
	    "orange", and anything lower "red".
	    """
	    # Thresholds in descending order; the first one the rating reaches wins.
	    bands = ((4.5, "green"), (4.0, "blue"), (3.5, "orange"))
	    for floor, color in bands:
	        if rating >= floor:
	            return color
	    return "red"

	def get_confidence_color(confidence: str) -> str:
	    """Map a confidence label to a display color; unknown labels are "gray"."""
	    palette = dict(High="green", Medium="orange", Low="red")
	    if confidence in palette:
	        return palette[confidence]
	    return "gray"

	def generate_prediction_explanation(prediction_result: Dict[str, Any]) -> str:
	    """Build the user-facing text that explains a prediction result.

	    A result containing an "error" key is rendered as a one-line error
	    message. Otherwise the author, ratings count, predicted rating and
	    confidence level are summarised, with an extra caveat appended when the
	    confidence is "Low".
	    """
	    # Failed predictions are reported verbatim.
	    if "error" in prediction_result:
	        return f"Error: {prediction_result['error']}"

	    predicted = prediction_result["predicted_rating"]
	    level = prediction_result["confidence"]
	    inputs = prediction_result["input_features"]
	    writer = inputs["author"]
	    n_ratings = inputs["ratings_count"]

	    summary = f"""
	Based on the machine learning model analysis:

	📚 Author: {writer}
	📊 Expected Ratings: {format_number(n_ratings)}
	⭐ Predicted Average Rating: {predicted}/5.0
	🎯 Confidence Level: {level}

	This prediction is based on patterns learned from 990+ books in the Goodreads dataset,
	considering factors like author popularity, expected engagement levels, and historical rating patterns.
	"""

	    if level == "Low":
	        # Low-confidence results get an additional reliability caveat.
	        return summary + """

	⚠️ Note: Low confidence predictions may be less reliable.
	This could be due to the author not being in our top authors list or unusual rating patterns.
	"""

	    return summary