| """ |
| Book Popularity Predictor - Streamlit Web Application |
| ===================================================== |
| |
| A machine learning web application that predicts a book's average rating |
| based on its characteristics using Random Forest Regression. |
| |
| Author: Created for ML Portfolio Project |
| """ |
|
|
| import streamlit as st |
| import pandas as pd |
| import numpy as np |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from plotly.subplots import make_subplots |
| import sys |
| import os |
|
|
| |
# Make the bundled ``src`` directory importable. Streamlit re-executes this
# script in the same process on every user interaction, so the entry is only
# added when it is missing; an unconditional append would leave one more
# duplicate on ``sys.path`` after each rerun.
SRC_DIR = os.path.join(os.path.dirname(__file__), 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
|
|
| from src.prediction_utils import ( |
| BookPopularityPredictor, |
| format_number, |
| get_rating_color, |
| get_confidence_color, |
| generate_prediction_explanation |
| ) |
|
|
| |
# Page-level settings (browser tab title/icon, wide layout, sidebar open),
# applied before any other element of the app is rendered.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="📚",
    page_title="Book Popularity Predictor 📚",
)
|
|
| |
# Stylesheet for the custom HTML fragments the app emits: the page header,
# the prediction summary and the warning / error / info message boxes.
_CUSTOM_CSS = """
<style>
    .main-header {
        text-align: center;
        padding: 1rem 0;
        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
        border-radius: 10px;
        color: white;
        margin-bottom: 2rem;
    }

    .prediction-box {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        border-left: 4px solid #007bff;
        margin: 1rem 0;
    }

    .metric-box {
        background: white;
        padding: 1rem;
        border-radius: 8px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        margin: 0.5rem 0;
    }

    .warning-box {
        background: #fff3cd;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #ffc107;
        margin: 1rem 0;
    }

    .error-box {
        background: #f8d7da;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #dc3545;
        margin: 1rem 0;
    }

    .info-box {
        background: #d1ecf1;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #17a2b8;
        margin: 1rem 0;
    }
</style>
"""

# unsafe_allow_html is needed so the <style> element reaches the page as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
|
| |
class _ModelLoadError(RuntimeError):
    """Raised when the predictor's model components cannot be loaded."""


@st.cache_resource
def _load_predictor_cached():
    """Build the predictor and load its model components (cached on success).

    Raises:
        _ModelLoadError: If ``load_model_components()`` reports failure.
            Raising, rather than returning ``None``, keeps the failed attempt
            out of the resource cache so a later rerun can try again.
    """
    predictor = BookPopularityPredictor(models_dir='models')
    if not predictor.load_model_components():
        raise _ModelLoadError("model components could not be loaded")
    return predictor


def load_predictor():
    """Load and cache the prediction model.

    Returns:
        The loaded ``BookPopularityPredictor``, or ``None`` when its model
        components could not be loaded (an error message is shown in the app
        in that case).

    Only successful loads are cached. Caching the ``None`` failure result
    would keep the app in its "model missing" state even after the model has
    been trained, until the cache is cleared or the server restarted.
    """
    try:
        return _load_predictor_cached()
    except _ModelLoadError:
        st.error("❌ Failed to load model components. Please ensure the model is trained.")
        return None


# Keep the cache-clearing hook that the decorated function used to expose.
load_predictor.clear = _load_predictor_cached.clear
|
|
def create_rating_visualization(prediction_result, reference_rating=4.1,
                                threshold_rating=4.5):
    """Create a gauge chart for a prediction result.

    Args:
        prediction_result: Mapping produced by the predictor. It must contain
            ``"predicted_rating"`` unless it carries an ``"error"`` key.
        reference_rating: Value the gauge's delta is measured against.
            Defaults to 4.1, the dataset mean used elsewhere in this app.
        threshold_rating: Position of the red threshold marker on the gauge.

    Returns:
        A Plotly figure, or ``None`` when ``prediction_result`` contains an
        ``"error"`` key.
    """
    if "error" in prediction_result:
        return None

    # Only the rating is drawn; the confidence label is not read here, so a
    # result without that key can still be visualised.
    predicted_rating = prediction_result["predicted_rating"]

    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=predicted_rating,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "Predicted Rating"},
        delta={'reference': reference_rating},
        gauge={
            'axis': {'range': [0, 5]},
            'bar': {'color': get_rating_color(predicted_rating)},
            # Background colour bands for the 0-5 rating scale.
            'steps': [
                {'range': [0, 2], 'color': "lightgray"},
                {'range': [2, 3], 'color': "orange"},
                {'range': [3, 4], 'color': "yellow"},
                {'range': [4, 5], 'color': "lightgreen"},
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': threshold_rating,
            },
        },
    ))

    fig.update_layout(
        font={'color': "darkblue", 'family': "Arial"},
        height=300,
    )

    return fig
|
|
def create_feature_importance_chart():
    """Return a horizontal bar chart of the model's feature importances.

    The scores are fixed values embedded in this function; they are not read
    from the loaded model.
    """
    importance_by_feature = [
        ('Author Identity', 0.3366),
        ('Author Book Count', 0.1393),
        ('Reviews Count', 0.1184),
        ('Ratings Count', 0.1099),
        ('Log Reviews Count', 0.0987),
        ('Log Ratings Count', 0.0949),
        ('Rating/Review Ratio', 0.1022),
    ]
    feature_names = [name for name, _ in importance_by_feature]
    scores = [score for _, score in importance_by_feature]

    chart = px.bar(
        x=scores,
        y=feature_names,
        orientation='h',
        title="Model Feature Importance",
        labels={'x': 'Importance Score', 'y': 'Features'},
        # Bars are shaded by their own score.
        color=scores,
        color_continuous_scale='Blues',
    )
    chart.update_layout(showlegend=False, height=400)

    return chart
|
|
def create_rating_distribution_chart(seed=42):
    """Create a histogram illustrating the dataset's rating distribution.

    The plotted values are simulated, not read from the dataset: 1000 samples
    are drawn from a normal distribution (mean 4.1, standard deviation 0.233)
    and clipped to the range 3.04-4.81.

    Args:
        seed: Seed for the local random generator. A fixed seed keeps the
            chart identical across Streamlit reruns, and a local generator
            leaves NumPy's global random state untouched.

    Returns:
        A Plotly histogram figure with a dashed line at the dataset mean.
    """
    dataset_mean = 4.1

    rng = np.random.default_rng(seed)
    ratings = rng.normal(dataset_mean, 0.233, 1000)
    ratings = np.clip(ratings, 3.04, 4.81)

    fig = px.histogram(
        x=ratings,
        nbins=30,
        title="Distribution of Book Ratings in Dataset",
        labels={'x': 'Average Rating', 'y': 'Number of Books'},
        color_discrete_sequence=['#636EFA'],
    )

    fig.add_vline(
        x=dataset_mean,
        line_dash="dash",
        line_color="red",
        annotation_text="Dataset Mean: 4.1",
    )

    fig.update_layout(height=300)

    return fig
|
|
def _render_message_box(css_class, inner_html):
    """Render ``inner_html`` inside a ``<div>`` styled by ``css_class``.

    The markup is assembled without leading indentation, which Markdown could
    otherwise interpret as an indented code block.
    """
    st.markdown(
        f'<div class="{css_class}">\n{inner_html}\n</div>',
        unsafe_allow_html=True,
    )


def _render_page_header():
    """Render the gradient banner at the top of the page."""
    st.markdown("""
    <div class="main-header">
        <h1>📚 Book Popularity Predictor</h1>
        <p>Predict a book's average rating using machine learning</p>
    </div>
    """, unsafe_allow_html=True)


def _render_sidebar_inputs(predictor):
    """Render the sidebar form and return the values the user chose.

    Returns:
        A ``(top_authors, selected_author, ratings_count, reviews_count)``
        tuple. ``reviews_count`` is ``None`` when the user leaves the
        estimate to the predictor.
    """
    st.sidebar.header("📝 Book Information")

    top_authors = predictor.get_top_authors()
    selected_author = st.sidebar.selectbox(
        "Select Author",
        options=top_authors,
        help="Choose from the top 20 most popular authors in our dataset"
    )

    ratings_count = st.sidebar.number_input(
        "Expected Number of Ratings",
        min_value=1,
        max_value=20000000,
        value=50000,
        step=1000,
        help="How many ratings do you expect this book to receive?"
    )

    use_custom_reviews = st.sidebar.checkbox("Specify reviews count (optional)")

    if use_custom_reviews:
        # Reviews are capped at the ratings count; the default is one eighth
        # of the ratings.
        reviews_count = st.sidebar.number_input(
            "Expected Number of Reviews",
            min_value=0,
            max_value=int(ratings_count),
            value=int(ratings_count // 8),
            step=100,
            help="Number of written reviews (typically much less than ratings)"
        )
    else:
        reviews_count = None
        st.sidebar.info("📊 Reviews count will be estimated automatically based on typical ratios")

    return top_authors, selected_author, ratings_count, reviews_count


def _render_technical_details(derived):
    """Render the collapsible panel listing derived features and model facts."""
    with st.expander("🔧 Technical Details"):
        st.write("**Derived Features:**")

        left, right = st.columns(2)
        with left:
            st.write(f"- Rating to Review Ratio: {derived['rating_to_review_ratio']}")
            st.write(f"- Log Ratings Count: {derived['log_ratings_count']}")

        with right:
            st.write(f"- Log Reviews Count: {derived['log_reviews_count']}")
            st.write(f"- Author Book Count: {derived['author_book_count']}")

        st.write("**Model Information:**")
        st.write("- Algorithm: Random Forest Regression")
        st.write("- Training Data: 990+ books from Goodreads")
        st.write("- Features: 7 engineered features including author encoding")


def _render_prediction_result(prediction_result, selected_author, ratings_count):
    """Render the summary box, metrics, gauge, explanation and details."""
    predicted_rating = prediction_result["predicted_rating"]
    confidence = prediction_result["confidence"]

    st.markdown(
        '<div class="prediction-box">\n'
        f'<h2 style="color: {get_rating_color(predicted_rating)};">\n'
        f'⭐ Predicted Rating: {predicted_rating}/5.0\n'
        '</h2>\n'
        f'<h4 style="color: {get_confidence_color(confidence)};">\n'
        f'🎯 Confidence: {confidence}\n'
        '</h4>\n'
        '</div>',
        unsafe_allow_html=True,
    )

    derived = prediction_result["derived_features"]
    inputs = prediction_result["input_features"]

    # The deltas below are descriptive captions, not numeric changes, so they
    # are drawn in neutral grey instead of the default "increase" green.
    author_col, ratings_col, reviews_col = st.columns(3)

    with author_col:
        st.metric(
            "📖 Author",
            selected_author,
            delta=f"Book #{derived['author_book_count']} in dataset",
            delta_color="off",
        )

    with ratings_col:
        st.metric(
            "👥 Expected Ratings",
            format_number(ratings_count),
            delta="User Input",
            delta_color="off",
        )

    with reviews_col:
        st.metric(
            "💬 Expected Reviews",
            format_number(inputs["reviews_count"]),
            delta="Estimated" if inputs["estimated_reviews"] else "User Input",
            delta_color="off",
        )

    fig_gauge = create_rating_visualization(prediction_result)
    if fig_gauge:
        st.plotly_chart(fig_gauge, use_container_width=True)

    _render_message_box("info-box", generate_prediction_explanation(prediction_result))

    _render_technical_details(derived)


def _run_prediction(predictor, selected_author, ratings_count, reviews_count):
    """Validate the inputs, run the model and render the outcome.

    Validation errors and warnings are always shown; the model is only
    invoked when validation reports the inputs as valid.
    """
    validation = predictor.validate_inputs(selected_author, ratings_count, reviews_count)
    is_valid = validation["valid"]

    if not is_valid:
        for error in validation["errors"]:
            _render_message_box("error-box", f"❌ <strong>Error:</strong> {error}")

    if validation["warnings"]:
        for warning in validation["warnings"]:
            _render_message_box("warning-box", f"⚠️ <strong>Warning:</strong> {warning}")

    if not is_valid:
        return

    with st.spinner("🤔 Analyzing book characteristics..."):
        prediction_result = predictor.predict_book_rating(
            selected_author, ratings_count, reviews_count
        )

    if "error" in prediction_result:
        _render_message_box(
            "error-box",
            f"❌ <strong>Prediction Error:</strong> {prediction_result['error']}",
        )
        return

    _render_prediction_result(prediction_result, selected_author, ratings_count)


def _render_model_insights(top_authors):
    """Render the right-hand column: charts, metrics, caveats, dataset facts."""
    st.header("📊 Model Insights")

    st.plotly_chart(create_feature_importance_chart(), use_container_width=True)
    st.plotly_chart(create_rating_distribution_chart(), use_container_width=True)

    # Captions, not changes: keep the delta text neutral (see st.metric docs).
    st.subheader("🎯 Model Performance")
    st.metric("Test R² Score", "0.544", delta="Medium Accuracy", delta_color="off")
    st.metric("Mean Absolute Error", "0.121", delta="±0.12 rating points", delta_color="off")

    st.subheader("⚠️ Model Limitations")
    st.markdown("""
    <div style="background: #fff3cd; padding: 1rem; border-radius: 8px; border-left: 4px solid #ffc107; margin: 1rem 0;">
    <h4 style="color: #856404; margin-top: 0;">Educational/Portfolio Project</h4>
    <ul style="color: #856404; margin-bottom: 0;">
    <li><strong>Overfitting Detected</strong>: Training R² (0.889) >> Test R² (0.544)</li>
    <li><strong>Small Dataset</strong>: Only 159 books (filtered for top authors)</li>
    <li><strong>Author Dependency</strong>: 34% of prediction based on author identity</li>
    <li><strong>CV Variability</strong>: Cross-validation scores show high variance</li>
    </ul>
    </div>
    """, unsafe_allow_html=True)

    st.subheader("📚 Dataset Info")
    st.write("- **Books**: 990+")
    st.write("- **Authors**: 608 unique")
    st.write("- **Rating Range**: 3.04 - 4.81")
    st.write("- **Average Rating**: 4.10")

    st.subheader("🌟 Top Authors")
    for i, author in enumerate(top_authors[:5]):
        st.write(f"{i+1}. {author}")


def _render_footer():
    """Render the divider and the centred footer line."""
    st.markdown("---")
    st.markdown("""
    <div style="text-align: center; color: gray;">
    📚 Book Popularity Predictor | Built with Streamlit & Scikit-learn<br>
    🤖 Machine Learning Portfolio Project | Created for educational purposes
    </div>
    """, unsafe_allow_html=True)


def main():
    """Main application function.

    Renders the page header, loads the cached predictor (stopping with
    instructions when it is unavailable), collects the book details from the
    sidebar, and lays out the prediction results next to the static model
    insights, followed by the footer.
    """
    _render_page_header()

    predictor = load_predictor()
    if predictor is None:
        st.error("🚫 Application cannot start without the trained model. Please train the model first.")
        st.code("python src/model_training.py", language="bash")
        return

    top_authors, selected_author, ratings_count, reviews_count = (
        _render_sidebar_inputs(predictor)
    )

    results_col, insights_col = st.columns([2, 1])

    with results_col:
        st.header("🎯 Prediction Results")
        # The button is only True on the run triggered by the click, so the
        # prediction is computed and drawn within that single run.
        if st.button("🔮 Predict Book Rating", type="primary", use_container_width=True):
            _run_prediction(predictor, selected_author, ratings_count, reviews_count)

    with insights_col:
        _render_model_insights(top_authors)

    _render_footer()
|
|
if __name__ == "__main__":
    # Build the page only when this module is executed as the entry script,
    # not when it is imported.
    main()