# goodreads / streamlit_app.py
# Uploaded by fguryel (page residue: "fguryel's picture")
# Commit ce92e54: "Deploy ML project"
"""
Book Popularity Predictor - Streamlit Web Application
=====================================================
A machine learning web application that predicts a book's average rating
based on its characteristics using Random Forest Regression.
Author: Created for ML Portfolio Project
"""
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys
import os
# Add src directory to Python path for imports
sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
from src.prediction_utils import (
BookPopularityPredictor,
format_number,
get_rating_color,
get_confidence_color,
generate_prediction_explanation
)
# Page configuration
# Runs at module import, ahead of every other Streamlit call visible in this
# file. Sets the browser tab title/icon, uses the wide layout, and starts with
# the sidebar expanded.
st.set_page_config(
page_title="Book Popularity Predictor 📚",
page_icon="📚",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS for better styling
# Injects one <style> element defining the classes that main() references in
# its raw-HTML markdown: .main-header, .prediction-box, .warning-box,
# .error-box and .info-box. (.metric-box is defined here but not referenced
# anywhere in the visible code.) unsafe_allow_html=True is needed so the
# <style> tag is emitted as HTML rather than shown as text.
st.markdown("""
<style>
.main-header {
text-align: center;
padding: 1rem 0;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
border-radius: 10px;
color: white;
margin-bottom: 2rem;
}
.prediction-box {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 10px;
border-left: 4px solid #007bff;
margin: 1rem 0;
}
.metric-box {
background: white;
padding: 1rem;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
margin: 0.5rem 0;
}
.warning-box {
background: #fff3cd;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #ffc107;
margin: 1rem 0;
}
.error-box {
background: #f8d7da;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #dc3545;
margin: 1rem 0;
}
.info-box {
background: #d1ecf1;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #17a2b8;
margin: 1rem 0;
}
</style>
""", unsafe_allow_html=True)
# Initialize the predictor
class _ModelLoadError(RuntimeError):
    """Raised when the predictor reports that its model components failed to load."""


@st.cache_resource
def _load_predictor_cached():
    """Build the predictor and load its model components, caching on success.

    Failure is signalled by raising instead of returning ``None``: a value
    returned from a ``st.cache_resource`` function is stored and reused on
    every rerun, so returning ``None`` would pin the app in the "model
    missing" state until the cache is cleared or the server restarts. An
    exception is not stored, so the load is retried on the next rerun.

    Returns:
        The ``BookPopularityPredictor`` whose ``load_model_components()``
        call returned a truthy value.

    Raises:
        _ModelLoadError: if ``load_model_components()`` returns a falsy value.
    """
    predictor = BookPopularityPredictor(models_dir='models')
    if not predictor.load_model_components():
        raise _ModelLoadError("model components could not be loaded")
    return predictor


def load_predictor():
    """Load and cache the prediction model.

    Returns:
        The cached predictor instance, or ``None`` (after showing an error
        message in the app) when the model components cannot be loaded.
        Only successful loads are cached; a failed load is retried on the
        next call.
    """
    try:
        return _load_predictor_cached()
    except _ModelLoadError:
        st.error("❌ Failed to load model components. Please ensure the model is trained.")
        return None
def create_rating_visualization(prediction_result, reference_rating=4.1):
    """Create a gauge chart for a prediction result.

    Args:
        prediction_result: Mapping produced by the predictor. Only the
            ``"error"`` key (presence check) and ``"predicted_rating"`` are
            read here.
        reference_rating: Baseline the gauge's delta is computed against.
            Defaults to 4.1, the dataset's average rating as hard-coded in
            this app.

    Returns:
        A Plotly ``Figure`` holding a single gauge indicator on a 0-5 axis,
        or ``None`` when ``prediction_result`` contains an ``"error"`` key.
    """
    if "error" in prediction_result:
        return None

    predicted_rating = prediction_result["predicted_rating"]

    # Create gauge chart for rating
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=predicted_rating,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Predicted Rating"},
        delta={'reference': reference_rating},
        gauge={
            'axis': {'range': [0, 5]},
            # Bar colour comes from the shared helper so it matches the
            # headline colour used elsewhere in the app.
            'bar': {'color': get_rating_color(predicted_rating)},
            # Background bands for the rating ranges.
            'steps': [
                {'range': [0, 2], 'color': "lightgray"},
                {'range': [2, 3], 'color': "orange"},
                {'range': [3, 4], 'color': "yellow"},
                {'range': [4, 5], 'color': "lightgreen"},
            ],
            # Red marker line drawn at 4.5.
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 4.5,
            },
        },
    ))
    fig.update_layout(
        font={'color': "darkblue", 'family': "Arial"},
        height=300,
    )
    return fig
def create_feature_importance_chart():
    """Build a horizontal bar chart of the model's feature-importance scores.

    The feature names and scores are hard-coded in this function; nothing is
    read from the loaded model.

    Returns:
        A Plotly Express bar figure, 400 px tall, with bars coloured by score
        on the 'Blues' continuous scale.
    """
    # (label, importance score) pairs, in display order.
    scored_features = [
        ('Author Identity', 0.3366),
        ('Author Book Count', 0.1393),
        ('Reviews Count', 0.1184),
        ('Ratings Count', 0.1099),
        ('Log Reviews Count', 0.0987),
        ('Log Ratings Count', 0.0949),
        ('Rating/Review Ratio', 0.1022),
    ]
    names = [label for label, _ in scored_features]
    scores = [score for _, score in scored_features]

    bar_options = {
        'x': scores,
        'y': names,
        'orientation': 'h',
        'title': "Model Feature Importance",
        'labels': {'x': 'Importance Score', 'y': 'Features'},
        'color': scores,
        'color_continuous_scale': 'Blues',
    }
    chart = px.bar(**bar_options)
    chart.update_layout(height=400, showlegend=False)
    return chart
def create_rating_distribution_chart(seed=42):
    """Create a histogram approximating the dataset's rating distribution.

    The plotted values are simulated, not read from the dataset: 1000 draws
    from a normal distribution (mean 4.1, standard deviation 0.233) clipped
    to the range 3.04-4.81.

    Args:
        seed: Seed for the local random generator. A fixed seed keeps the
            histogram identical across Streamlit reruns (the script is
            re-executed on every widget interaction) and avoids touching
            NumPy's global random state. Pass ``None`` for a fresh sample
            on every call.

    Returns:
        A Plotly Express histogram figure, 300 px tall, with a dashed red
        vertical line at 4.1.
    """
    # Simulated data based on actual dataset statistics
    rng = np.random.default_rng(seed)
    ratings = rng.normal(4.1, 0.233, 1000)
    ratings = np.clip(ratings, 3.04, 4.81)

    fig = px.histogram(
        x=ratings,
        nbins=30,
        title="Distribution of Book Ratings in Dataset",
        labels={'x': 'Average Rating', 'y': 'Number of Books'},
        color_discrete_sequence=['#636EFA'],
    )
    fig.add_vline(
        x=4.1,
        line_dash="dash",
        line_color="red",
        annotation_text="Dataset Mean: 4.1",
    )
    fig.update_layout(height=300)
    return fig
def _render_header():
    """Render the gradient page header (title and tagline)."""
    st.markdown("""
<div class="main-header">
<h1>📚 Book Popularity Predictor</h1>
<p>Predict a book's average rating using machine learning</p>
</div>
""", unsafe_allow_html=True)


def _render_message_box(css_class, heading, message):
    """Render one dynamic message inside a styled HTML box.

    Args:
        css_class: CSS class of the wrapping ``<div>`` (e.g. ``"error-box"``).
        heading: Trusted HTML fragment placed before the message.
        message: Dynamic text. It is HTML-escaped (``&``, ``<``, ``>``)
            because the box is rendered with ``unsafe_allow_html=True``;
            unescaped, a message containing angle brackets would be
            interpreted as markup instead of being shown.
    """
    import html

    safe_message = html.escape(str(message), quote=False)
    st.markdown(f"""
<div class="{css_class}">
{heading} {safe_message}
</div>
""", unsafe_allow_html=True)


def _collect_sidebar_inputs(predictor):
    """Render the sidebar widgets and return the values the user entered.

    Args:
        predictor: Loaded predictor; only ``get_top_authors()`` is called.

    Returns:
        A tuple ``(top_authors, selected_author, ratings_count,
        reviews_count)``. ``reviews_count`` is ``None`` unless the user
        ticked the checkbox to specify it.
    """
    st.sidebar.header("📝 Book Information")

    # Author selection
    top_authors = predictor.get_top_authors()
    selected_author = st.sidebar.selectbox(
        "Select Author",
        options=top_authors,
        help="Choose from the top 20 most popular authors in our dataset",
    )

    # Ratings count input
    ratings_count = st.sidebar.number_input(
        "Expected Number of Ratings",
        min_value=1,
        max_value=20000000,
        value=50000,
        step=1000,
        help="How many ratings do you expect this book to receive?",
    )

    # Reviews count input (optional)
    use_custom_reviews = st.sidebar.checkbox("Specify reviews count (optional)")
    if use_custom_reviews:
        # Capped at the ratings count; the default is one eighth of it.
        reviews_count = st.sidebar.number_input(
            "Expected Number of Reviews",
            min_value=0,
            max_value=int(ratings_count),
            value=int(ratings_count // 8),
            step=100,
            help="Number of written reviews (typically much less than ratings)",
        )
    else:
        reviews_count = None
        st.sidebar.info("📊 Reviews count will be estimated automatically based on typical ratios")

    return top_authors, selected_author, ratings_count, reviews_count


def _render_prediction_result(prediction_result, selected_author, ratings_count):
    """Render a successful prediction: headline, metrics, gauge, details.

    Args:
        prediction_result: Mapping returned by the predictor. Keys read here:
            ``predicted_rating``, ``confidence``, ``derived_features`` and
            ``input_features``.
        selected_author: Author chosen in the sidebar.
        ratings_count: Ratings count entered in the sidebar.
    """
    predicted_rating = prediction_result["predicted_rating"]
    confidence = prediction_result["confidence"]
    derived = prediction_result["derived_features"]
    inputs = prediction_result["input_features"]

    # Main prediction display
    st.markdown(f"""
<div class="prediction-box">
<h2 style="color: {get_rating_color(predicted_rating)};">
⭐ Predicted Rating: {predicted_rating}/5.0
</h2>
<h4 style="color: {get_confidence_color(confidence)};">
🎯 Confidence: {confidence}
</h4>
</div>
""", unsafe_allow_html=True)

    # Metrics display
    col_a, col_b, col_c = st.columns(3)
    with col_a:
        st.metric(
            "📖 Author",
            selected_author,
            delta=f"Book #{derived['author_book_count']} in dataset",
        )
    with col_b:
        st.metric(
            "👥 Expected Ratings",
            format_number(ratings_count),
            delta="User Input",
        )
    with col_c:
        # Show the reviews count the predictor actually used, which may be
        # an estimate when the user did not supply one.
        st.metric(
            "💬 Expected Reviews",
            format_number(inputs["reviews_count"]),
            delta="Estimated" if inputs["estimated_reviews"] else "User Input",
        )

    # Rating gauge visualization
    fig_gauge = create_rating_visualization(prediction_result)
    if fig_gauge:
        st.plotly_chart(fig_gauge, use_container_width=True)

    # Detailed explanation
    # NOTE(review): the explanation is inserted unescaped; presumably the
    # helper returns intentional markup - confirm before escaping it.
    explanation = generate_prediction_explanation(prediction_result)
    st.markdown(f"""
<div class="info-box">
{explanation}
</div>
""", unsafe_allow_html=True)

    # Technical details (expandable)
    with st.expander("🔧 Technical Details"):
        st.write("**Derived Features:**")
        tech_col1, tech_col2 = st.columns(2)
        with tech_col1:
            st.write(f"- Rating to Review Ratio: {derived['rating_to_review_ratio']}")
            st.write(f"- Log Ratings Count: {derived['log_ratings_count']}")
        with tech_col2:
            st.write(f"- Log Reviews Count: {derived['log_reviews_count']}")
            st.write(f"- Author Book Count: {derived['author_book_count']}")
        st.write("**Model Information:**")
        st.write("- Algorithm: Random Forest Regression")
        st.write("- Training Data: 990+ books from Goodreads")
        st.write("- Features: 7 engineered features including author encoding")


def _render_prediction_panel(predictor, selected_author, ratings_count, reviews_count):
    """Render the left column: predict button, validation messages, result.

    Nothing below the button is drawn unless it was clicked on this run.

    Args:
        predictor: Loaded predictor; ``validate_inputs`` and
            ``predict_book_rating`` are called on it.
        selected_author: Author chosen in the sidebar.
        ratings_count: Ratings count entered in the sidebar.
        reviews_count: Reviews count entered in the sidebar, or ``None``.
    """
    st.header("🎯 Prediction Results")

    # Predict button
    if not st.button("🔮 Predict Book Rating", type="primary", use_container_width=True):
        return

    # Validate inputs
    validation = predictor.validate_inputs(selected_author, ratings_count, reviews_count)

    # Show errors if any
    if not validation["valid"]:
        for error in validation["errors"]:
            _render_message_box("error-box", "❌ <strong>Error:</strong>", error)

    # Show warnings if any (warnings do not block the prediction)
    for warning in validation["warnings"] or ():
        _render_message_box("warning-box", "⚠️ <strong>Warning:</strong>", warning)

    # Make prediction if inputs are valid
    if not validation["valid"]:
        return

    with st.spinner("🤔 Analyzing book characteristics..."):
        prediction_result = predictor.predict_book_rating(
            selected_author, ratings_count, reviews_count
        )

    if "error" in prediction_result:
        _render_message_box(
            "error-box",
            "❌ <strong>Prediction Error:</strong>",
            prediction_result['error'],
        )
    else:
        _render_prediction_result(prediction_result, selected_author, ratings_count)


def _render_insights_panel(top_authors):
    """Render the right column: static model and dataset information.

    Args:
        top_authors: Sequence of author names; the first five are listed.
    """
    st.header("📊 Model Insights")

    # Feature importance chart
    fig_importance = create_feature_importance_chart()
    st.plotly_chart(fig_importance, use_container_width=True)

    # Rating distribution
    fig_dist = create_rating_distribution_chart()
    st.plotly_chart(fig_dist, use_container_width=True)

    # Model stats
    st.subheader("🎯 Model Performance")
    st.metric("Test R² Score", "0.544", delta="Medium Accuracy")
    st.metric("Mean Absolute Error", "0.121", delta="±0.12 rating points")

    # Model limitations (transparent reporting)
    st.subheader("⚠️ Model Limitations")
    st.markdown("""
<div style="background: #fff3cd; padding: 1rem; border-radius: 8px; border-left: 4px solid #ffc107; margin: 1rem 0;">
<h4 style="color: #856404; margin-top: 0;">Educational/Portfolio Project</h4>
<ul style="color: #856404; margin-bottom: 0;">
<li><strong>Overfitting Detected</strong>: Training R² (0.889) >> Test R² (0.544)</li>
<li><strong>Small Dataset</strong>: Only 159 books (filtered for top authors)</li>
<li><strong>Author Dependency</strong>: 34% of prediction based on author identity</li>
<li><strong>CV Variability</strong>: Cross-validation scores show high variance</li>
</ul>
</div>
""", unsafe_allow_html=True)

    # Dataset info
    st.subheader("📚 Dataset Info")
    st.write("- **Books**: 990+")
    st.write("- **Authors**: 608 unique")
    st.write("- **Rating Range**: 3.04 - 4.81")
    st.write("- **Average Rating**: 4.10")

    # Top authors info
    st.subheader("🌟 Top Authors")
    for rank, author in enumerate(top_authors[:5], start=1):
        st.write(f"{rank}. {author}")


def _render_footer():
    """Render the horizontal rule and the centred footer text."""
    st.markdown("---")
    st.markdown("""
<div style="text-align: center; color: gray;">
📚 Book Popularity Predictor | Built with Streamlit & Scikit-learn<br>
🤖 Machine Learning Portfolio Project | Created for educational purposes
</div>
""", unsafe_allow_html=True)


def main():
    """Main application function.

    Renders the header, loads the predictor (stopping with instructions if
    it is unavailable), collects the sidebar inputs, then draws the
    prediction panel and the model-insights panel side by side, followed by
    the footer.
    """
    # Header
    _render_header()

    # Load predictor
    predictor = load_predictor()
    if predictor is None:
        st.error("🚫 Application cannot start without the trained model. Please train the model first.")
        st.code("python src/model_training.py", language="bash")
        return

    # Sidebar for inputs
    top_authors, selected_author, ratings_count, reviews_count = (
        _collect_sidebar_inputs(predictor)
    )

    # Main content area: prediction panel twice as wide as the insights panel
    col1, col2 = st.columns([2, 1])
    with col1:
        _render_prediction_panel(predictor, selected_author, ratings_count, reviews_count)
    with col2:
        _render_insights_panel(top_authors)

    # Footer
    _render_footer()
# Entry point: build the page only when this module is run as the main
# script, not when it is imported.
if __name__ == "__main__":
    main()