import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)


# ==== LLM Setup with Caching ====
@st.cache_resource(show_spinner=False)  # Disable the default spinner
def get_llm():
    """Cached LLM initialization to prevent reloading on every rerun."""
    import os

    from langchain_google_genai import ChatGoogleGenerativeAI
    from langchain_groq import ChatGroq

    try:
        return ChatGroq(
            model="gemma2-9b-it",
            groq_api_key=os.getenv("GROQ_API_KEY"),
        )
    except Exception:
        try:
            return ChatGoogleGenerativeAI(
                model="gemini-2.0-flash-lite-preview-02-05",
                google_api_key=os.getenv("GEMINI_API_KEY"),
            )
        except Exception:
            # No LLM available; callers fall back to static explanations
            return None


llm_insights = get_llm()


# ==== Cached Metric Calculations ====
@st.cache_data(show_spinner=False)  # Cache this heavy computation
def _compute_classification_metrics(y_test, y_pred):
    """Cached metric computation for classification."""
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average="weighted", zero_division=0),
        'recall': recall_score(y_test, y_pred, average="weighted", zero_division=0),
        'f1': f1_score(y_test, y_pred, average="weighted", zero_division=0),
        'cm': confusion_matrix(y_test, y_pred),
    }


@st.cache_data
def _compute_regression_metrics(y_test, y_pred):
    """Cached metric computation for regression."""
    mse = mean_squared_error(y_test, y_pred)
    return {
        'mae': mean_absolute_error(y_test, y_pred),
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2_score(y_test, y_pred),
    }


# ==== Cached Visualization Generation ====
@st.cache_data(show_spinner=False)  # Cache this heavy computation
def _plot_confusion_matrix(cm, classes):
    """Cached confusion matrix plotting; returns the rendered PNG as bytes."""
    fig, ax = plt.subplots(figsize=(2, 2), dpi=200)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        annot_kws={"size": 8},
        ax=ax,
    )
    ax.tick_params(axis="both", labelsize=5)
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", dpi=200)
    plt.close(fig)  # Free the figure so repeated reruns don't accumulate memory
    buf.seek(0)
    # Return raw bytes (not the BytesIO object) so st.cache_data can pickle the result
    return buf.getvalue()


# ==== Optimized Insights Generation ====
@st.cache_data(show_spinner=False)  # Cache this heavy computation
def _get_insights_classification(accuracy, precision, recall, f1, cm_shape):
    """Cached insights generation based on metrics."""
    fallback = (
        f"### Classification Metrics Explained\n\n"
        f"**Accuracy** ({accuracy:.3f}): Correct predictions ratio\n"
        f"**Precision** ({precision:.3f}): Positive prediction accuracy\n"
        f"**Recall** ({recall:.3f}): Actual positives found\n"
        f"**F1 Score** ({f1:.3f}): Precision-Recall balance\n"
        f"Confusion Matrix ({cm_shape[0]}x{cm_shape[1]}): Prediction vs Actual distribution"
    )

    if llm_insights is None:
        return fallback

    try:
        response = llm_insights.invoke(
            f"Briefly explain these classification metrics "
            f"(accuracy={accuracy:.3f}, precision={precision:.3f}, "
            f"recall={recall:.3f}, f1={f1:.3f}) and the "
            f"{cm_shape[0]}x{cm_shape[1]} confusion matrix. "
            f"Use markdown bullet points."
        )
        return response.content.strip()
    except Exception:
        # LLM call failed; show the basic metrics explanation instead
        return fallback


def display_test_results(trained_model, X_test, y_test, task_type, label_encoder=None):
    """
    Displays test results, including metrics, the confusion matrix (for
    classification), and LLM-based or fallback insights about the metrics.
""" # Create a placeholder for the loading message at the top of the page st.markdown("## Test Results") loading_placeholder = st.empty() # Show initial loading message with loading_placeholder.container(): st.info("⏳ Evaluating model performance on test data. This may take a moment for large datasets.") progress_bar = st.progress(0) # Set a flag to track if results have been calculated if "test_results_calculated" not in st.session_state: st.session_state.test_results_calculated = False # Only perform calculations if they haven't been done yet if not st.session_state.test_results_calculated: sampling_message = None MAX_SAMPLES = 5000 # Increased from 50 to 5000 # Update progress - Starting evaluation with loading_placeholder.container(): progress_bar.progress(10) if len(X_test) <= MAX_SAMPLES: # Use all test data X_test_sample = X_test y_test_sample = y_test st.info("🔍 Using all test data for evaluation...") else: # Use sampling for large datasets sampling_message = f"📊 Using {MAX_SAMPLES} samples from the test set for visualization (out of {len(X_test)} total)" st.info("🔍 Sampling test data for evaluation...") # Simple random sampling idx = np.random.choice(len(X_test.index if hasattr(X_test, 'index') else X_test), size=MAX_SAMPLES, replace=False) X_test_sample = X_test.iloc[idx] if hasattr(X_test, 'iloc') else X_test[idx] y_test_sample = y_test.iloc[idx] if hasattr(y_test, 'iloc') else y_test[idx] # Generate predictions with loading_placeholder.container(): progress_bar.progress(30) st.info("🔄 Generating predictions... Please wait") # Add a spinner for visual feedback during prediction with st.spinner("Model working..."): if task_type == "regression": y_pred = trained_model.predict(X_test_sample) elif task_type == "classification": pipeline, enc = trained_model if label_encoder is None else (trained_model, label_encoder) y_pred = pipeline.predict(X_test_sample) # Decode if label_encoder is used if enc: y_pred = enc.inverse_transform(y_pred) y_test_decoded = enc.inverse_transform(y_test_sample) else: y_test_decoded = y_test_sample # Update progress - Computing metrics with loading_placeholder.container(): progress_bar.progress(60) st.info("📊 Computing metrics...") # Compute metrics if task_type == "regression": metrics = _compute_regression_metrics(y_test_sample, y_pred) else: metrics = _compute_classification_metrics(y_test_decoded, y_pred) # Update progress - Preparing visualizations with loading_placeholder.container(): progress_bar.progress(90) st.info("📈 Preparing visualizations...") # For classification, pre-calculate confusion matrix before showing "ready" message if task_type == "classification": # Pre-calculate confusion matrix (this is the slow part) _ = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded)) # Pre-calculate insights (also potentially slow with LLM) _ = _get_insights_classification( metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1'], metrics['cm'].shape ) # Update progress - Complete (only after all calculations are done) with loading_placeholder.container(): progress_bar.progress(100) st.success("✅ Test results ready!") # Mark results as calculated st.session_state.test_results_calculated = True # Store results in session state for reuse st.session_state.test_metrics = metrics if task_type == "classification": st.session_state.test_y_pred = y_pred st.session_state.test_y_test = y_test_decoded else: st.session_state.test_y_pred = y_pred st.session_state.test_y_test = y_test_sample # Store sampling message 
        st.session_state.sampling_message = sampling_message

        # Import time only when needed (moved from global to local scope)
        import time
        time.sleep(0.5)  # Short delay to show the "Test results ready!" message

    # Display the sampling message if it exists
    if "sampling_message" in st.session_state and st.session_state.sampling_message:
        st.info(st.session_state.sampling_message)

    # Display the results using stored values
    if task_type == "regression":
        st.subheader("🔍 Regression Metrics")

        # Get metrics from session state or use the ones we just calculated
        if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
            metrics = st.session_state.test_metrics
            y_pred = st.session_state.test_y_pred
            y_test = st.session_state.test_y_test
            mae, mse, rmse, r2 = metrics['mae'], metrics['mse'], metrics['rmse'], metrics['r2']

            col1, col2, col3, col4 = st.columns(4)
            col1.metric("📉 MAE", f"{mae:.4f}")
            col2.metric("📊 MSE", f"{mse:.4f}")
            col3.metric("📈 RMSE", f"{rmse:.4f}")
            col4.metric("📌 R² Score", f"{r2:.4f}")

            # Add the regression visualization
            st.subheader("📈 Prediction vs Actual")
            df_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
            fig = px.scatter(
                df_results,
                x='Actual',
                y='Predicted',
                title='Predicted vs Actual Values',
                labels={'Actual': 'Actual Values', 'Predicted': 'Predicted Values'},
            )
            # Reference line where predicted == actual
            fig.add_shape(
                type='line',
                x0=min(y_test), y0=min(y_test),
                x1=max(y_test), y1=max(y_test),
                line=dict(color='red', dash='dash'),
            )
            st.plotly_chart(fig, use_container_width=True)

    elif task_type == "classification":
        st.subheader("🔍 Classification Metrics")

        # Get metrics from session state or use the ones we just calculated
        if "test_metrics" in st.session_state and st.session_state.test_results_calculated:
            metrics = st.session_state.test_metrics
            y_pred = st.session_state.test_y_pred
            y_test_decoded = st.session_state.test_y_test
            accuracy, precision, recall, f1 = (
                metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1'],
            )

            col1, col2, col3, col4 = st.columns(4)
            col1.metric("✅ Accuracy", f"{accuracy:.4f}")
            col2.metric("🎯 Precision", f"{precision:.4f}")
            col3.metric("📢 Recall", f"{recall:.4f}")
            col4.metric("🔥 F1 Score", f"{f1:.4f}")

            st.subheader("📊 Confusion Matrix")
            # Use the cached function for the confusion matrix visualization
            cm_image = _plot_confusion_matrix(metrics['cm'], np.unique(y_test_decoded))
            st.image(cm_image, width=450)

            # === Additional Insights Section ===
            st.markdown("---")
            st.markdown("#### Test Insights")
            classification_insights = _get_insights_classification(
                accuracy, precision, recall, f1, metrics['cm'].shape
            )
            st.markdown(classification_insights)
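

# --- Usage sketch ---
# A minimal, illustrative example of wiring display_test_results into a page
# when this module is run directly with `streamlit run`. The dataset, model,
# and train/test split below are placeholder choices for demonstration only,
# not the app's actual pipeline or data flow.
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    # Toy classification data and a simple pipeline (hypothetical stand-ins)
    X, y = load_iris(return_X_y=True, as_frame=True)
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    X_train, X_test_demo, y_train, y_test_demo = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )
    demo_pipeline = Pipeline(
        [("scaler", StandardScaler()), ("clf", RandomForestClassifier(random_state=42))]
    )
    demo_pipeline.fit(X_train, y_train)

    # trained_model is passed as a (pipeline, encoder) tuple when no separate
    # label_encoder argument is supplied (see the unpacking logic above).
    display_test_results((demo_pipeline, encoder), X_test_demo, y_test_demo, "classification")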