Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import json | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc | |
| from sklearn.preprocessing import StandardScaler | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| from plotly.subplots import make_subplots | |
| import kagglehub | |
| from pathlib import Path | |
| import time | |
| import os | |
# Page configuration: must be the first Streamlit command executed by the script.
st.set_page_config(page_title="Pizza Prediction Dashboard", layout="wide", page_icon="π")

# Title
st.title("π Pizza Prediction - Live Training Dashboard")
st.markdown("---")

# Sidebar for model parameters
st.sidebar.header("ποΈ Model Parameters")
n_estimators = st.sidebar.slider("Number of Trees", 10, 200, 100, 10)
max_depth = st.sidebar.slider("Max Depth", 3, 20, 10)
test_size = st.sidebar.slider("Test Size", 0.1, 0.4, 0.2, 0.05)
# The seed is passed to train_test_split and RandomForestClassifier, which only
# accept integers in [0, 2**32 - 1]; bound the widget so an out-of-range value
# cannot be entered and crash training later.
random_state = st.sidebar.number_input(
    "Random State",
    min_value=0,
    max_value=2**32 - 1,
    value=42,
    step=1,
)
# True only on the script run triggered by clicking the button.
start_training = st.sidebar.button("π Start Training", type="primary")
# Load data function
def _read_training_json(json_path):
    """Parse the JSON file at *json_path* and return it as a DataFrame."""
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which differs between systems and can fail on non-ASCII text.
    with open(json_path, 'r', encoding='utf-8') as f:
        return pd.DataFrame(json.load(f))


def load_data():
    """Load the training data as a pandas DataFrame.

    A ``train.json`` file in the current working directory is used when it
    exists (for easier deployment). Otherwise the dataset is downloaded with
    ``kagglehub`` and ``train.json`` is read from the returned directory.

    Returns:
        pandas.DataFrame built from the parsed JSON content.

    Raises:
        Whatever the file read, JSON parsing, or the Kaggle download raises;
        nothing is caught here, so the caller decides how to report failures.
    """
    # Try loading from local file first (for easier deployment)
    local_path = Path("train.json")
    if local_path.exists():
        with st.spinner("π₯ Loading dataset from local file..."):
            return _read_training_json(local_path)

    # Fall back to Kaggle download if local file doesn't exist
    with st.spinner("π₯ Downloading dataset from Kaggle..."):
        dataset_path = kagglehub.dataset_download("kaggle/random-acts-of-pizza")
        return _read_training_json(Path(dataset_path) / "train.json")
# Load the dataset once per script run; on any failure show the error in the
# main page and halt the script so nothing below runs without data.
try:
    df = load_data()
    sample_count = df.shape[0]
    st.sidebar.success(f"β Data loaded: {sample_count} samples")
except Exception as load_error:
    st.error(f"Error loading data: {load_error}")
    st.stop()
# Columns used as model inputs.
# NOTE(review): the three "*_at_retrieval" columns look like they are measured
# after the request was posted — confirm they do not leak the outcome.
numeric_features = [
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'number_of_upvotes_of_request_at_retrieval',
    'number_of_downvotes_of_request_at_retrieval',
    'request_number_of_comments_at_retrieval',
]

# Feature matrix: the selected columns with missing values replaced by 0.
feature_frame = df.loc[:, numeric_features]
X = feature_frame.fillna(0)

# Target vector: the outcome column cast to integers.
target_series = df['requester_received_pizza']
y = target_series.astype(int)
# Headline dataset numbers, one metric card per column, left to right.
received_total = y.sum()
summary_cards = [
    ("π Total Samples", len(df)),
    ("β Pizza Received", received_total),
    ("β No Pizza", len(y) - received_total),
    ("π Success Rate", f"{y.mean()*100:.1f}%"),
]
for card_column, (card_label, card_value) in zip(st.columns(4), summary_cards):
    card_column.metric(card_label, card_value)

st.markdown("---")
# Main page body: run a full training pass when the sidebar button was clicked
# on this script run, otherwise show a preview of the data.
if start_training:
    # Split data (stratified so both splits keep the class ratio of y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Scale features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create placeholders for live updates. Each st.empty() slot holds exactly
    # ONE element: writing a second element into it replaces the first.
    progress_bar = st.progress(0)
    status_text = st.empty()
    col1, col2 = st.columns(2)
    with col1:
        metrics_placeholder = st.empty()
    with col2:
        tree_progress_placeholder = st.empty()
    st.markdown("---")

    # Training section
    st.subheader("π― Model Performance")
    perf_col1, perf_col2 = st.columns(2)
    with perf_col1:
        accuracy_placeholder = st.empty()
        report_placeholder = st.empty()
    with perf_col2:
        confusion_placeholder = st.empty()
    st.markdown("---")
    viz_col1, viz_col2 = st.columns(2)
    with viz_col1:
        feature_imp_placeholder = st.empty()
    with viz_col2:
        roc_placeholder = st.empty()

    # Train model with progress tracking
    status_text.text("ποΈ Training Random Forest Model...")
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,
        warm_start=True  # Allow incremental training
    )

    # Train incrementally to show progress: with warm_start=True, raising
    # n_estimators and calling fit() again adds trees to the existing forest.
    trees_per_batch = max(5, n_estimators // 20)
    # History of (tree count, training accuracy) so the chart shows the whole
    # curve rather than only the most recent point.
    tree_counts = []
    train_accuracies = []
    for batch_start in range(1, n_estimators + 1, trees_per_batch):
        # The last batch is clamped so the forest never exceeds n_estimators.
        current_trees = min(batch_start + trees_per_batch - 1, n_estimators)
        model.n_estimators = current_trees
        model.fit(X_train_scaled, y_train)

        # Update progress
        progress_bar.progress(current_trees / n_estimators)
        status_text.text(f"π² Building trees: {current_trees}/{n_estimators}")

        # Show intermediate metrics
        y_pred_train = model.predict(X_train_scaled)
        train_acc = accuracy_score(y_train, y_pred_train)
        tree_counts.append(current_trees)
        train_accuracies.append(train_acc)
        metrics_placeholder.metric("Training Accuracy", f"{train_acc:.4f}")

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=tree_counts,
            y=train_accuracies,
            mode='lines+markers',
            marker=dict(size=10, color='green'),
            name='Training Accuracy'
        ))
        fig.update_layout(
            title="Training Progress",
            xaxis_title="Number of Trees",
            yaxis_title="Training Accuracy",
            yaxis_range=[0.5, 1.0],
            xaxis_range=[0, n_estimators]
        )
        # A distinct key per batch keeps the element IDs of successive charts unique.
        tree_progress_placeholder.plotly_chart(
            fig, use_container_width=True, key=f"progress_{current_trees}"
        )
        time.sleep(0.1)  # Small delay for visual effect

    # Final predictions
    status_text.text("π Generating predictions...")
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(
        y_test,
        y_pred,
        target_names=['No Pizza', 'Pizza Received'],
        output_dict=True
    )
    status_text.text("β Training Complete!")
    progress_bar.progress(1.0)

    # Display accuracy in its own slot so later elements cannot overwrite it
    accuracy_placeholder.success(f"### π― Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # Classification report: caption + table are two elements, so they go into
    # a container inside the single-element report slot.
    with report_placeholder.container():
        st.text("Classification Report:")
        report_df = pd.DataFrame(class_report).transpose()
        st.dataframe(report_df.style.background_gradient(cmap='RdYlGn'), use_container_width=True)

    # Confusion matrix heatmap
    fig_cm = px.imshow(
        conf_matrix,
        labels=dict(x="Predicted", y="Actual", color="Count"),
        x=['No Pizza', 'Pizza Received'],
        y=['No Pizza', 'Pizza Received'],
        text_auto=True,
        color_continuous_scale='Blues'
    )
    fig_cm.update_layout(title="Confusion Matrix")
    confusion_placeholder.plotly_chart(fig_cm, use_container_width=True)

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': numeric_features,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    fig_imp = px.bar(
        feature_importance.head(10),
        x='importance',
        y='feature',
        orientation='h',
        title="Top 10 Most Important Features",
        color='importance',
        color_continuous_scale='Viridis'
    )
    fig_imp.update_layout(yaxis={'categoryorder': 'total ascending'})
    feature_imp_placeholder.plotly_chart(fig_imp, use_container_width=True)

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    fig_roc = go.Figure()
    fig_roc.add_trace(go.Scatter(
        x=fpr, y=tpr,
        mode='lines',
        name=f'ROC Curve (AUC = {roc_auc:.3f})',
        line=dict(color='darkorange', width=2)
    ))
    # Diagonal reference line: the expected curve of a random classifier
    fig_roc.add_trace(go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        name='Random Classifier',
        line=dict(color='navy', width=2, dash='dash')
    ))
    fig_roc.update_layout(
        title='ROC Curve',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain')
    )
    roc_placeholder.plotly_chart(fig_roc, use_container_width=True)

    # Success message
    st.balloons()
    st.success("π Training completed successfully!")
else:
    st.info("π Adjust parameters in the sidebar and click 'Start Training' to begin!")

    # Show sample data
    st.subheader("π Sample Data Preview")
    st.dataframe(df[numeric_features + ['requester_received_pizza']].head(10), use_container_width=True)

    # Show feature distributions - Pizza-themed pie chart!
    st.subheader("π Target Distribution")
    fig_dist = go.Figure(data=[go.Pie(
        labels=['No Pizza π’', 'Pizza Received π'],
        values=[len(y) - y.sum(), y.sum()],
        hole=0.3,  # Donut chart like a pizza crust
        marker=dict(
            colors=['#8B4513', '#FFA500'],  # Brown crust, Orange cheese
            line=dict(color='#D2691E', width=3)  # Pizza crust outline
        ),
        textfont=dict(size=16, color='white'),
        pull=[0, 0.1]  # Pull out the "Pizza Received" slice
    )])
    fig_dist.update_layout(
        title={
            'text': 'π Pizza Request Outcomes π',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20}
        },
        # Centered label inside the donut hole showing the positive count
        annotations=[
            dict(
                text=f'{y.sum()}<br>Pizzas!',
                x=0.5, y=0.5,
                font_size=20,
                showarrow=False
            )
        ],
        showlegend=True
    )
    st.plotly_chart(fig_dist, use_container_width=True)