Spaces:

fguryel
/

goodreads

Sleeping

App Files Files Community

goodreads / streamlit_app.py

fguryel

Deploy ML project

ce92e54 7 months ago

raw

history blame contribute delete

14.7 kB

	"""
	Book Popularity Predictor - Streamlit Web Application
	=====================================================

	A machine learning web application that predicts a book's average rating
	based on its characteristics using Random Forest Regression.

	Author: Created for ML Portfolio Project
	"""

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import sys
	import os

	# Add src directory to Python path for imports
	sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))

	from src.prediction_utils import (
	BookPopularityPredictor,
	format_number,
	get_rating_color,
	get_confidence_color,
	generate_prediction_explanation
	)

	# Page configuration
	# The settings are gathered in one mapping and expanded into keyword
	# arguments, so the call itself stays a single line.
	_PAGE_SETTINGS = {
	    "page_title": "Book Popularity Predictor 📚",
	    "page_icon": "📚",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Custom CSS for better styling
	# The stylesheet is bound to a name first and then injected through
	# st.markdown with raw HTML enabled. It defines the header banner and the
	# prediction / metric / warning / error / info box classes.
	_CUSTOM_CSS = """
	<style>
	.main-header {
	text-align: center;
	padding: 1rem 0;
	background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
	border-radius: 10px;
	color: white;
	margin-bottom: 2rem;
	}

	.prediction-box {
	background: #f8f9fa;
	padding: 1.5rem;
	border-radius: 10px;
	border-left: 4px solid #007bff;
	margin: 1rem 0;
	}

	.metric-box {
	background: white;
	padding: 1rem;
	border-radius: 8px;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	margin: 0.5rem 0;
	}

	.warning-box {
	background: #fff3cd;
	padding: 1rem;
	border-radius: 8px;
	border-left: 4px solid #ffc107;
	margin: 1rem 0;
	}

	.error-box {
	background: #f8d7da;
	padding: 1rem;
	border-radius: 8px;
	border-left: 4px solid #dc3545;
	margin: 1rem 0;
	}

	.info-box {
	background: #d1ecf1;
	padding: 1rem;
	border-radius: 8px;
	border-left: 4px solid #17a2b8;
	margin: 1rem 0;
	}
	</style>
	"""
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	# Initialize the predictor
	class _ModelLoadError(Exception):
	    """Raised internally when the model components fail to load."""


	@st.cache_resource
	def _load_predictor_cached():
	    """Create the predictor and load its components, caching only on success.

	    Raising instead of returning None keeps a failed load out of the
	    resource cache, so the next rerun tries again.

	    Returns:
	        The BookPopularityPredictor whose components were loaded.

	    Raises:
	        _ModelLoadError: if load_model_components() reports failure.
	    """
	    predictor = BookPopularityPredictor(models_dir='models')
	    if not predictor.load_model_components():
	        raise _ModelLoadError("model components could not be loaded")
	    return predictor


	def load_predictor():
	    """Load and cache the prediction model

	    A successful load is cached via st.cache_resource. A failed load is
	    reported with st.error and is not cached, so the app recovers on a later
	    rerun once the model files are available.

	    Returns:
	        The loaded predictor, or None when the model components could not
	        be loaded.
	    """
	    try:
	        return _load_predictor_cached()
	    except _ModelLoadError:
	        st.error("❌ Failed to load model components. Please ensure the model is trained.")
	        return None

	def create_rating_visualization(prediction_result):
	    """Create visualization for the prediction result

	    Builds a Plotly gauge (0-5 scale) showing the predicted rating, with a
	    delta against 4.1 and a red threshold marker at 4.5.

	    Args:
	        prediction_result: mapping returned by the predictor. It must contain
	            "predicted_rating" unless it contains "error".

	    Returns:
	        A plotly Figure, or None when prediction_result carries an "error" key.
	    """
	    if "error" in prediction_result:
	        return None

	    # Only the rating is drawn; the confidence value is not needed here, so
	    # it is no longer looked up.
	    predicted_rating = prediction_result["predicted_rating"]

	    # Create gauge chart for rating
	    gauge = go.Indicator(
	        mode="gauge+number+delta",
	        value=predicted_rating,
	        domain={'x': [0, 1], 'y': [0, 1]},
	        title={'text': "Predicted Rating"},
	        delta={'reference': 4.1},  # Average rating from dataset
	        gauge={
	            'axis': {'range': [0, 5]},
	            'bar': {'color': get_rating_color(predicted_rating)},
	            # Background bands for the rating ranges
	            'steps': [
	                {'range': [0, 2], 'color': "lightgray"},
	                {'range': [2, 3], 'color': "orange"},
	                {'range': [3, 4], 'color': "yellow"},
	                {'range': [4, 5], 'color': "lightgreen"},
	            ],
	            'threshold': {
	                'line': {'color': "red", 'width': 4},
	                'thickness': 0.75,
	                'value': 4.5,
	            },
	        },
	    )
	    fig = go.Figure(gauge)

	    fig.update_layout(
	        font={'color': "darkblue", 'family': "Arial"},
	        height=300,
	    )

	    return fig

	def create_feature_importance_chart():
	    """Return a horizontal bar chart of the hard-coded feature importances.

	    The feature names and their scores are fixed values embedded in this
	    function; nothing is read from the loaded model.
	    """
	    # (display name, importance score) pairs, in the order they are plotted
	    scored_features = [
	        ('Author Identity', 0.3366),
	        ('Author Book Count', 0.1393),
	        ('Reviews Count', 0.1184),
	        ('Ratings Count', 0.1099),
	        ('Log Reviews Count', 0.0987),
	        ('Log Ratings Count', 0.0949),
	        ('Rating/Review Ratio', 0.1022),
	    ]
	    names = [name for name, _ in scored_features]
	    scores = [score for _, score in scored_features]

	    chart = px.bar(
	        x=scores,
	        y=names,
	        orientation='h',
	        title="Model Feature Importance",
	        labels={'x': 'Importance Score', 'y': 'Features'},
	        color=scores,
	        color_continuous_scale='Blues',
	    )
	    chart.update_layout(height=400, showlegend=False)

	    return chart

	def create_rating_distribution_chart(seed=42):
	    """Create a chart showing rating distribution from the dataset

	    The histogram is drawn from simulated values (normal with mean 4.1 and
	    standard deviation 0.233, clipped to 3.04-4.81), not from the dataset
	    itself. A locally seeded generator is used so the chart is identical on
	    every rerun and the global NumPy random state is left untouched.

	    Args:
	        seed: seed for the local random generator. The default gives a
	            stable chart; pass None for a fresh sample on each call.

	    Returns:
	        A plotly Figure with the histogram and a dashed line at the mean.
	    """
	    # Simulated data based on actual dataset statistics
	    rng = np.random.default_rng(seed)
	    ratings = rng.normal(4.1, 0.233, 1000)
	    ratings = np.clip(ratings, 3.04, 4.81)

	    fig = px.histogram(
	        x=ratings,
	        nbins=30,
	        title="Distribution of Book Ratings in Dataset",
	        labels={'x': 'Average Rating', 'y': 'Number of Books'},
	        color_discrete_sequence=['#636EFA'],
	    )

	    fig.add_vline(
	        x=4.1,
	        line_dash="dash",
	        line_color="red",
	        annotation_text="Dataset Mean: 4.1",
	    )

	    fig.update_layout(height=300)

	    return fig

	def _render_sidebar_inputs(predictor):
	    """Render the sidebar widgets and return what the user entered.

	    Returns:
	        A tuple (top_authors, selected_author, ratings_count, reviews_count).
	        reviews_count is None when the optional checkbox is not ticked.
	    """
	    st.sidebar.header("📝 Book Information")

	    # Author selection
	    top_authors = predictor.get_top_authors()
	    selected_author = st.sidebar.selectbox(
	        "Select Author",
	        options=top_authors,
	        help="Choose from the top 20 most popular authors in our dataset"
	    )

	    # Ratings count input
	    ratings_count = st.sidebar.number_input(
	        "Expected Number of Ratings",
	        min_value=1,
	        max_value=20000000,
	        value=50000,
	        step=1000,
	        help="How many ratings do you expect this book to receive?"
	    )

	    # Reviews count input (optional)
	    if st.sidebar.checkbox("Specify reviews count (optional)"):
	        # The reviews input is capped at the ratings count and defaults to
	        # one eighth of it.
	        reviews_count = st.sidebar.number_input(
	            "Expected Number of Reviews",
	            min_value=0,
	            max_value=int(ratings_count),
	            value=int(ratings_count // 8),
	            step=100,
	            help="Number of written reviews (typically much less than ratings)"
	        )
	    else:
	        reviews_count = None
	        st.sidebar.info("📊 Reviews count will be estimated automatically based on typical ratios")

	    return top_authors, selected_author, ratings_count, reviews_count


	def _render_validation_messages(validation):
	    """Show the validation errors (when invalid) and any warnings as boxes."""
	    # NOTE(review): messages are interpolated into raw HTML unescaped; this
	    # assumes the predictor only returns trusted text -- confirm.
	    if not validation["valid"]:
	        for error in validation["errors"]:
	            st.markdown(f"""
	<div class="error-box">
	❌ <strong>Error:</strong> {error}
	</div>
	""", unsafe_allow_html=True)

	    if validation["warnings"]:
	        for warning in validation["warnings"]:
	            st.markdown(f"""
	<div class="warning-box">
	⚠️ <strong>Warning:</strong> {warning}
	</div>
	""", unsafe_allow_html=True)


	def _render_prediction_result(prediction_result, selected_author, ratings_count):
	    """Render a successful prediction: headline, metrics, gauge, details."""
	    predicted_rating = prediction_result["predicted_rating"]
	    confidence = prediction_result["confidence"]

	    # Main prediction display
	    st.markdown(f"""
	<div class="prediction-box">
	<h2 style="color: {get_rating_color(predicted_rating)};">
	⭐ Predicted Rating: {predicted_rating}/5.0
	</h2>
	<h4 style="color: {get_confidence_color(confidence)};">
	🎯 Confidence: {confidence}
	</h4>
	</div>
	""", unsafe_allow_html=True)

	    # Metrics display
	    col_a, col_b, col_c = st.columns(3)

	    with col_a:
	        st.metric(
	            "📖 Author",
	            selected_author,
	            delta=f"Book #{prediction_result['derived_features']['author_book_count']} in dataset"
	        )

	    with col_b:
	        st.metric(
	            "👥 Expected Ratings",
	            format_number(ratings_count),
	            delta="User Input"
	        )

	    with col_c:
	        actual_reviews = prediction_result["input_features"]["reviews_count"]
	        st.metric(
	            "💬 Expected Reviews",
	            format_number(actual_reviews),
	            delta="Estimated" if prediction_result["input_features"]["estimated_reviews"] else "User Input"
	        )

	    # Rating gauge visualization
	    fig_gauge = create_rating_visualization(prediction_result)
	    if fig_gauge:
	        st.plotly_chart(fig_gauge, use_container_width=True)

	    # Detailed explanation
	    explanation = generate_prediction_explanation(prediction_result)
	    st.markdown(f"""
	<div class="info-box">
	{explanation}
	</div>
	""", unsafe_allow_html=True)

	    # Technical details (expandable)
	    with st.expander("🔧 Technical Details"):
	        st.write("Derived Features:")
	        derived = prediction_result["derived_features"]

	        tech_col1, tech_col2 = st.columns(2)
	        with tech_col1:
	            st.write(f"- Rating to Review Ratio: {derived['rating_to_review_ratio']}")
	            st.write(f"- Log Ratings Count: {derived['log_ratings_count']}")

	        with tech_col2:
	            st.write(f"- Log Reviews Count: {derived['log_reviews_count']}")
	            st.write(f"- Author Book Count: {derived['author_book_count']}")

	        st.write("Model Information:")
	        st.write("- Algorithm: Random Forest Regression")
	        st.write("- Training Data: 990+ books from Goodreads")
	        st.write("- Features: 7 engineered features including author encoding")


	def _render_model_insights(top_authors):
	    """Render the right-hand column: charts, fixed model stats, top authors."""
	    st.header("📊 Model Insights")

	    # Feature importance chart
	    fig_importance = create_feature_importance_chart()
	    st.plotly_chart(fig_importance, use_container_width=True)

	    # Rating distribution
	    fig_dist = create_rating_distribution_chart()
	    st.plotly_chart(fig_dist, use_container_width=True)

	    # Model stats
	    st.subheader("🎯 Model Performance")
	    st.metric("Test R² Score", "0.544", delta="Medium Accuracy")
	    st.metric("Mean Absolute Error", "0.121", delta="±0.12 rating points")

	    # Model limitations (transparent reporting)
	    st.subheader("⚠️ Model Limitations")
	    st.markdown("""
	<div style="background: #fff3cd; padding: 1rem; border-radius: 8px; border-left: 4px solid #ffc107; margin: 1rem 0;">
	<h4 style="color: #856404; margin-top: 0;">Educational/Portfolio Project</h4>
	<ul style="color: #856404; margin-bottom: 0;">
	<li><strong>Overfitting Detected</strong>: Training R² (0.889) >> Test R² (0.544)</li>
	<li><strong>Small Dataset</strong>: Only 159 books (filtered for top authors)</li>
	<li><strong>Author Dependency</strong>: 34% of prediction based on author identity</li>
	<li><strong>CV Variability</strong>: Cross-validation scores show high variance</li>
	</ul>
	</div>
	""", unsafe_allow_html=True)

	    # Dataset info
	    st.subheader("📚 Dataset Info")
	    st.write("- Books: 990+")
	    st.write("- Authors: 608 unique")
	    st.write("- Rating Range: 3.04 - 4.81")
	    st.write("- Average Rating: 4.10")

	    # Top authors info
	    st.subheader("🌟 Top Authors")
	    for rank, author in enumerate(top_authors[:5], start=1):
	        st.write(f"{rank}. {author}")


	def main():
	    """Main application function

	    Renders the header, loads the predictor (stopping with instructions when
	    it is unavailable), collects inputs from the sidebar, and lays out two
	    columns: prediction results on the left and model insights on the right.
	    A prediction is only made on the run in which the button is pressed.
	    """

	    # Header
	    st.markdown("""
	<div class="main-header">
	<h1>📚 Book Popularity Predictor</h1>
	<p>Predict a book's average rating using machine learning</p>
	</div>
	""", unsafe_allow_html=True)

	    # Load predictor
	    predictor = load_predictor()

	    if predictor is None:
	        st.error("🚫 Application cannot start without the trained model. Please train the model first.")
	        st.code("python src/model_training.py", language="bash")
	        return

	    # Sidebar for inputs
	    top_authors, selected_author, ratings_count, reviews_count = _render_sidebar_inputs(predictor)

	    # Main content area
	    col1, col2 = st.columns([2, 1])

	    with col1:
	        st.header("🎯 Prediction Results")

	        # Predict button
	        if st.button("🔮 Predict Book Rating", type="primary", use_container_width=True):

	            # Validate inputs, then show errors and warnings if any
	            validation = predictor.validate_inputs(selected_author, ratings_count, reviews_count)
	            _render_validation_messages(validation)

	            # Make prediction if inputs are valid
	            if validation["valid"]:
	                with st.spinner("🤔 Analyzing book characteristics..."):
	                    prediction_result = predictor.predict_book_rating(
	                        selected_author, ratings_count, reviews_count
	                    )

	                if "error" in prediction_result:
	                    st.markdown(f"""
	<div class="error-box">
	❌ <strong>Prediction Error:</strong> {prediction_result['error']}
	</div>
	""", unsafe_allow_html=True)
	                else:
	                    # Display prediction results
	                    _render_prediction_result(prediction_result, selected_author, ratings_count)

	    with col2:
	        _render_model_insights(top_authors)

	    # Footer
	    # Plain "|" separators: a backslash before "|" is not a valid Python
	    # string escape and would be emitted literally into the HTML.
	    st.markdown("---")
	    st.markdown("""
	<div style="text-align: center; color: gray;">
	📚 Book Popularity Predictor | Built with Streamlit & Scikit-learn<br>
	🤖 Machine Learning Portfolio Project | Created for educational purposes
	</div>
	""", unsafe_allow_html=True)

	# Script entry point: call main() only when this file is run as the main
	# module, not when it is imported.
	if __name__ == "__main__":
	main()