Spaces:
Sleeping
Sleeping
| """ | |
| Streamlit UI for RGB RAG Evaluation Pipeline | |
| Provides interactive interface to run evaluations and visualize results | |
| """ | |
| import streamlit as st | |
| import json | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| import plotly.express as px | |
| from datetime import datetime | |
| from pathlib import Path | |
| import time | |
| import threading | |
| import queue | |
| import os | |
| from io import BytesIO | |
| from src.pipeline import RGBEvaluationPipeline | |
| from src.config import DEFAULT_MODELS, ALL_MODELS | |
# Page config: browser-tab title/icon and a wide layout with the sidebar open.
# NOTE(review): the page_icon glyph looks mis-encoded in this copy of the file;
# confirm it is a valid emoji before deploying.
st.set_page_config(
    page_title="RGB RAG Evaluation",
    page_icon="๐",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Custom CSS: declares the .metric-card, .success-card and .header-main classes.
# unsafe_allow_html=True is needed so the <style> element is emitted as HTML
# rather than escaped as text.
st.markdown("""
<style>
.metric-card {
background-color: #f0f2f6;
padding: 20px;
border-radius: 10px;
margin: 10px 0;
}
.success-card {
background-color: #d1e7dd;
padding: 20px;
border-radius: 10px;
margin: 10px 0;
}
.header-main {
font-size: 32px;
font-weight: bold;
margin-bottom: 10px;
}
</style>
""", unsafe_allow_html=True)
def load_results_from_file(filepath):
    """Load a saved evaluation from a JSON file.

    Args:
        filepath: Path (``str`` or ``pathlib.Path``) of the JSON file.

    Returns:
        The decoded JSON payload, or ``None`` when the file is missing,
        unreadable, or does not contain valid JSON. Callers treat ``None``
        as "could not load".
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError covers FileNotFoundError / permission problems; ValueError
        # covers json.JSONDecodeError and UnicodeDecodeError, e.g. a file that
        # is still being written by the background evaluation thread.
        return None
def save_results_to_file(results, filepath):
    """Serialize ``results`` as indented JSON at ``filepath``.

    Args:
        results: Any JSON-serializable object.
        filepath: Destination as ``str`` or ``pathlib.Path``; missing parent
            directories are created.

    Raises:
        TypeError: If ``results`` is not JSON-serializable.
        OSError: If the destination cannot be written.
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place so a concurrent
    # reader never observes a half-written JSON document.
    tmp_target = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_target, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        tmp_target.replace(target)
    finally:
        # Only still present when dump/replace failed; never leave debris.
        if tmp_target.exists():
            tmp_target.unlink()
def format_results_dataframe(results):
    """Convert a list of result dicts into a DataFrame for display.

    Args:
        results: Iterable of dicts with the optional keys ``task_type``,
            ``model_name``, ``total_samples``, ``accuracy``,
            ``rejection_rate``, ``error_detection_rate`` and
            ``error_correction_rate``.

    Returns:
        A DataFrame with one row per result. Metrics are rounded to two
        decimals; a metric that is absent or ``None`` is shown as ``'N/A'``.
        The columns are always present, even for empty input.
    """
    columns = [
        'Task',
        'Model',
        'Total Samples',
        'Accuracy (%)',
        'Rejection Rate (%)',
        'Error Detection (%)',
        'Error Correction (%)',
    ]

    def metric(result, key):
        # A key that is present but None must not reach round().
        value = result.get(key)
        return round(value, 2) if value is not None else 'N/A'

    data = [
        {
            'Task': result.get('task_type', 'N/A'),
            'Model': result.get('model_name', 'N/A'),
            'Total Samples': result.get('total_samples', 0),
            'Accuracy (%)': metric(result, 'accuracy'),
            'Rejection Rate (%)': metric(result, 'rejection_rate'),
            'Error Detection (%)': metric(result, 'error_detection_rate'),
            'Error Correction (%)': metric(result, 'error_correction_rate'),
        }
        for result in results
    ]
    return pd.DataFrame(data, columns=columns)
def plot_accuracy_by_noise(results_df):
    """Plot accuracy against noise level, one line per model.

    Args:
        results_df: DataFrame with ``'Task'``, ``'Model'`` and
            ``'Accuracy (%)'`` columns. Noise tasks are rows whose task name
            contains ``noise_robustness``; the noise level is the integer
            preceding ``%`` in that name.

    Returns:
        A Plotly figure, or ``None`` when no row has both a parsable noise
        level and a numeric accuracy.
    """
    is_noise_task = results_df['Task'].str.contains('noise_robustness', na=False)
    noise_data = results_df[is_noise_task].copy()
    if noise_data.empty:
        return None
    # Extract noise percentage from task name. Names without "<digits>%"
    # become NaN here instead of crashing an integer cast.
    noise_data['Noise %'] = pd.to_numeric(
        noise_data['Task'].str.extract(r'(\d+)%', expand=False),
        errors='coerce',
    )
    # Missing accuracies are stored as the string 'N/A'; coerce them to NaN.
    noise_data['Accuracy (%)'] = pd.to_numeric(noise_data['Accuracy (%)'], errors='coerce')
    noise_data = noise_data.dropna(subset=['Noise %', 'Accuracy (%)'])
    if noise_data.empty:
        return None
    noise_data['Noise %'] = noise_data['Noise %'].astype(int)
    # Lines are drawn in row order, so sort to keep each model's line monotone in x.
    noise_data = noise_data.sort_values(['Model', 'Noise %'])
    fig = px.line(
        noise_data,
        x='Noise %',
        y='Accuracy (%)',
        color='Model',
        title='Noise Robustness: Accuracy Across Noise Levels',
        markers=True,
        line_shape='linear'
    )
    fig.update_layout(
        xaxis_title='Noise Level (%)',
        yaxis_title='Accuracy (%)',
        hovermode='x unified',
        height=400
    )
    return fig
def plot_metric_comparison(results_df, metric_col):
    """Plot one metric as grouped bars (x = model, colour = task).

    Args:
        results_df: DataFrame with ``'Model'`` and ``'Task'`` columns plus the
            metric column.
        metric_col: Name of the metric column to plot.

    Returns:
        A Plotly figure, or ``None`` when the column is absent or holds no
        numeric value.
    """
    if metric_col not in results_df.columns:
        return None
    plot_data = results_df.copy()
    # Missing metrics are stored as the string 'N/A', which notna() would keep;
    # coerce to numbers so those rows are dropped.
    plot_data[metric_col] = pd.to_numeric(plot_data[metric_col], errors='coerce')
    plot_data = plot_data.dropna(subset=[metric_col])
    if plot_data.empty:
        return None
    fig = px.bar(
        plot_data,
        x='Model',
        y=metric_col,
        color='Task',
        barmode='group',
        title=f'{metric_col} by Model',
        height=400
    )
    fig.update_layout(
        xaxis_title='Model',
        yaxis_title=metric_col,
        hovermode='x'
    )
    return fig
def get_api_key():
    """Return the Groq API key from Streamlit secrets or the environment.

    Lookup order:
        1. ``st.secrets["GROQ_API_KEY"]``
        2. the ``GROQ_API_KEY`` environment variable

    Returns:
        The key as a string, or ``None`` when neither source provides a
        non-empty value.
    """
    import os

    # Try Streamlit secrets first (for HF Spaces and local)
    try:
        if "GROQ_API_KEY" in st.secrets:
            secret = st.secrets["GROQ_API_KEY"]
            if secret:
                return secret
    except Exception:
        # Accessing st.secrets can raise when no secrets file is configured,
        # and the exception class differs between Streamlit releases. This is
        # a best-effort lookup, so fall through to the environment variable.
        pass
    # Try environment variable (for local development)
    return os.getenv("GROQ_API_KEY") or None
def run_evaluation_background(selected_models, selected_tasks, max_samples, api_key, result_queue):
    """Run the evaluation pipeline and report the outcome through a queue.

    Intended as a ``threading.Thread`` target. The function never raises;
    exactly one message is put on ``result_queue``:

    * ``{"status": "completed", "data": <results dict>, "file": <path str>}``
    * ``{"status": "error", "error": <message>}``

    Args:
        selected_models: Model names passed to ``RGBEvaluationPipeline``.
        selected_tasks: Task names passed to ``run_full_evaluation``.
        max_samples: Value for ``max_samples_per_task``.
        api_key: Groq API key resolved by the UI, or ``None``.
        result_queue: ``queue.Queue`` that receives the outcome message.
    """
    try:
        if api_key:
            # NOTE(review): the pipeline constructor takes no key here, so it
            # presumably reads GROQ_API_KEY from the environment - confirm.
            # setdefault never overrides a key that is already exported.
            os.environ.setdefault("GROQ_API_KEY", api_key)
        pipeline = RGBEvaluationPipeline(models=selected_models)
        results = pipeline.run_full_evaluation(
            max_samples_per_task=max_samples,
            tasks=selected_tasks
        )
        # One clock reading so the filename and the stored timestamp agree.
        finished_at = datetime.now()
        results_file = Path("results") / f"evaluation_{finished_at.strftime('%Y%m%d_%H%M%S')}.json"
        results_data = {
            "timestamp": finished_at.isoformat(),
            "models": selected_models,
            "tasks": selected_tasks,
            "max_samples": max_samples,
            "results": [
                {
                    "task_type": r.task_type,
                    "model_name": r.model_name,
                    "total_samples": r.total_samples,
                    "correct": r.correct,
                    "incorrect": r.incorrect,
                    "accuracy": r.accuracy,
                    "rejected": r.rejected,
                    "rejection_rate": r.rejection_rate,
                    "error_detection_count": r.errors_detected,
                    "error_detection_rate": r.error_detection_rate,
                    "error_correction_count": r.errors_corrected,
                    "error_correction_rate": r.error_correction_rate,
                }
                for r in results
            ]
        }
        # Save results
        save_results_to_file(results_data, results_file)
        result_queue.put({"status": "completed", "data": results_data, "file": str(results_file)})
    except Exception as e:
        # Thread boundary: an escaping exception would vanish silently, so
        # hand it to the UI. Include the type because str(e) can be empty.
        result_queue.put({"status": "error", "error": f"{type(e).__name__}: {e}"})
def generate_pdf_report(results_data):
    """Generate a PDF report from evaluation results.

    Args:
        results_data: Dict with the optional keys ``timestamp``, ``models``,
            ``tasks``, ``max_samples`` and ``results`` (a list of per-task
            result dicts).

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the PDF, or ``None``
        (after showing a Streamlit warning) when reportlab is not installed.
    """
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.units import inch
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
        from reportlab.lib import colors
    except ImportError:
        st.warning("reportlab not installed. PDF export not available.")
        return None

    def fmt_metric(value):
        # Metrics may be null in saved JSON; formatting None with ".2f" raises.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return f"{value:.2f}"
        return "N/A"

    # Create PDF
    buffer = BytesIO()
    doc = SimpleDocTemplate(buffer, pagesize=letter, topMargin=0.5*inch)
    elements = []
    styles = getSampleStyleSheet()
    # Title
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=24,
        textColor=colors.HexColor('#1f77b4'),
        spaceAfter=12,
        alignment=1
    )
    elements.append(Paragraph("RGB RAG Evaluation Report", title_style))
    elements.append(Spacer(1, 0.3*inch))
    # Summary info
    info_data = [
        ["Timestamp", str(results_data.get("timestamp", "N/A"))],
        ["Models Evaluated", ", ".join(str(m) for m in results_data.get("models") or [])],
        ["Tasks", ", ".join(str(t) for t in results_data.get("tasks") or [])],
        ["Samples per Task", str(results_data.get("max_samples", "N/A"))],
    ]
    info_table = Table(info_data, colWidths=[2*inch, 4*inch])
    info_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
        ('TEXTCOLOR', (0, 0), (-1, -1), colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 12),
        ('GRID', (0, 0), (-1, -1), 1, colors.grey),
    ]))
    elements.append(info_table)
    elements.append(Spacer(1, 0.3*inch))
    # Results table
    elements.append(Paragraph("Detailed Results", styles['Heading2']))
    elements.append(Spacer(1, 0.2*inch))
    results = results_data.get("results") or []
    if results:
        # Create results table
        table_data = [["Task", "Model", "Accuracy (%)", "Rejection Rate (%)", "Error Detection (%)", "Error Correction (%)"]]
        for result in results:
            table_data.append([
                # Task names are cut to 20 characters to fit the column.
                str(result.get("task_type") or "N/A")[:20],
                str(result.get("model_name") or "N/A"),
                fmt_metric(result.get('accuracy', 0)),
                fmt_metric(result.get('rejection_rate', 0)),
                fmt_metric(result.get('error_detection_rate', 0)),
                fmt_metric(result.get('error_correction_rate', 0)),
            ])
        results_table = Table(table_data, colWidths=[1.5*inch, 1.2*inch, 1*inch, 1.2*inch, 1.2*inch, 1.2*inch])
        results_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#1f77b4')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 9),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('FONTSIZE', (0, 1), (-1, -1), 8),
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.HexColor('#f0f0f0')]),
        ]))
        elements.append(results_table)
    # Build PDF
    doc.build(elements)
    buffer.seek(0)
    return buffer
# Metric columns shared by every results table in the UI.
_METRIC_COLUMNS = [
    'Accuracy (%)',
    'Rejection Rate (%)',
    'Error Detection (%)',
    'Error Correction (%)',
]

# Per-task presentation data for the result grids (insertion order = display order).
_TASK_CONFIGS = {
    'noise_robustness': {
        'title': '๐ Noise Robustness',
        'icon': '๐',
        'description': 'Accuracy across different noise levels'
    },
    'negative_rejection': {
        'title': '๐ซ Negative Rejection',
        'icon': 'โ',
        'description': 'System ability to reject invalid questions'
    },
    'information_integration': {
        'title': '๐ Information Integration',
        'icon': '๐',
        'description': 'Multi-document synthesis ability'
    },
    'counterfactual_robustness': {
        'title': 'โก Counterfactual Robustness',
        'icon': '๐',
        'description': 'Error detection and correction ability'
    }
}


def _rounded_or_na(value):
    """Round a metric to two decimals, or return 'N/A' when it is None."""
    return round(value, 2) if value is not None else 'N/A'


def _percent_label(value):
    """Format a table cell for st.metric: '12.5%' for numbers, 'N/A' otherwise."""
    return f"{value}%" if value != 'N/A' else 'N/A'


def _results_to_frame(results_list):
    """Build the display DataFrame (one row per result dict, fixed columns)."""
    rows = [
        {
            'Task': r.get('task_type', 'N/A'),
            'Model': r.get('model_name', 'N/A'),
            'Total Samples': r.get('total_samples', 0),
            'Accuracy (%)': _rounded_or_na(r.get('accuracy')),
            'Rejection Rate (%)': _rounded_or_na(r.get('rejection_rate')),
            'Error Detection (%)': _rounded_or_na(r.get('error_detection_rate')),
            'Error Correction (%)': _rounded_or_na(r.get('error_correction_rate')),
        }
        for r in results_list
    ]
    # Explicit columns keep the frame usable even when there are no rows.
    return pd.DataFrame(rows, columns=['Task', 'Model', 'Total Samples', *_METRIC_COLUMNS])


def _render_task_grids(results_df, task_heading_prefix, model_label_format):
    """Render one row of per-model metric cards for each task present in the frame."""
    for task_key, task_config in _TASK_CONFIGS.items():
        task_data = results_df[results_df['Task'].str.contains(task_key, case=False, na=False)]
        if task_data.empty:
            continue
        st.markdown(f"{task_heading_prefix} {task_config['icon']} {task_config['title']}")
        st.markdown(f"*{task_config['description']}*")
        # One column per result row of this task.
        cols = st.columns(len(task_data))
        for col, (_, row) in zip(cols, task_data.iterrows()):
            with col:
                st.markdown(model_label_format.format(model=row['Model']))
                st.metric("Total Samples", f"{int(row['Total Samples'])}")
                # Show appropriate metrics based on task type
                if task_key == 'noise_robustness':
                    # Noise level is the last "_" segment of names such as
                    # "noise_robustness_20%"; shorter names have none.
                    parts = str(row['Task']).split('_')
                    noise_level = parts[-1] if len(parts) >= 3 else "N/A"
                    st.metric("Noise Level", noise_level)
                    st.metric("Accuracy", _percent_label(row['Accuracy (%)']))
                elif task_key == 'negative_rejection':
                    st.metric("Rejection Rate", _percent_label(row['Rejection Rate (%)']))
                elif task_key == 'information_integration':
                    st.metric("Accuracy", _percent_label(row['Accuracy (%)']))
                elif task_key == 'counterfactual_robustness':
                    detection_col, correction_col = st.columns(2)
                    with detection_col:
                        st.metric(
                            "Detection",
                            _percent_label(row['Error Detection (%)']),
                            label_visibility="collapsed"
                        )
                    with correction_col:
                        st.metric(
                            "Correction",
                            _percent_label(row['Error Correction (%)']),
                            label_visibility="collapsed"
                        )
        st.markdown("---")


def _render_chart_or_note(fig, empty_message):
    """Show a Plotly figure, or an info note when the plot helper returned None."""
    if fig is not None:
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info(empty_message)


def _render_visualizations(results_df):
    """Render the chart tabs and the per-model / per-task summary tables."""
    st.markdown("---")
    st.markdown("## ๐ Detailed Visualizations & Analytics")
    tab1, tab2, tab3, tab4, tab5 = st.tabs([
        "Noise Robustness",
        "Rejection Rate",
        "Error Detection",
        "Error Correction",
        "Summary"
    ])
    with tab1:
        _render_chart_or_note(
            plot_accuracy_by_noise(results_df),
            "No noise robustness data available")
    with tab2:
        _render_chart_or_note(
            plot_metric_comparison(results_df, 'Rejection Rate (%)'),
            "No rejection rate data available")
    with tab3:
        _render_chart_or_note(
            plot_metric_comparison(results_df, 'Error Detection (%)'),
            "No error detection data available")
    with tab4:
        _render_chart_or_note(
            plot_metric_comparison(results_df, 'Error Correction (%)'),
            "No error correction data available")
    with tab5:
        # Summary statistics. Cells may hold the string 'N/A', which mean()
        # cannot aggregate, so coerce the metric columns to numbers first.
        numeric_metrics = results_df[_METRIC_COLUMNS].apply(pd.to_numeric, errors='coerce')
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("### Model Performance Summary")
            if len(results_df) > 0:
                model_summary = numeric_metrics.groupby(results_df['Model']).mean().round(2)
                st.dataframe(model_summary, use_container_width=True)
        with col2:
            st.markdown("### Task Performance Summary")
            if len(results_df) > 0:
                task_summary = numeric_metrics.groupby(results_df['Task']).mean().round(2)
                st.dataframe(task_summary, use_container_width=True)


def _render_exports(results_data, results_df):
    """Render the CSV / JSON / PDF download buttons for the current results."""
    st.markdown("---")
    st.markdown("## ๐พ Export Results")
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    col1, col2, col3 = st.columns(3)
    with col1:
        # Export as CSV
        st.download_button(
            label="๐ฅ CSV Report",
            data=results_df.to_csv(index=False),
            file_name=f"rgb_results_{stamp}.csv",
            mime="text/csv",
            use_container_width=True
        )
    with col2:
        # Export as JSON
        st.download_button(
            label="๐ฅ JSON Report",
            data=json.dumps(results_data, indent=2),
            file_name=f"rgb_results_{stamp}.json",
            mime="application/json",
            use_container_width=True
        )
    with col3:
        # Export as PDF (if reportlab is available). PDF is optional, so a
        # failure is reported here rather than aborting the page.
        try:
            pdf_buffer = generate_pdf_report(results_data)
        except Exception as exc:
            pdf_buffer = None
            st.info(f"PDF export not available: {exc}")
        if pdf_buffer:
            st.download_button(
                label="๐ฅ PDF Report",
                data=pdf_buffer,
                file_name=f"rgb_results_{stamp}.pdf",
                mime="application/pdf",
                use_container_width=True
            )


def _list_past_result_files(results_dir):
    """Return saved evaluation files, newest first.

    Evaluations started from this UI are saved as ``evaluation_<timestamp>.json``;
    ``results_<timestamp>.json`` is still matched for backward compatibility.
    """
    found = {*results_dir.glob("evaluation_*.json"), *results_dir.glob("results_*.json")}
    # Sort on the timestamp part so both prefixes interleave chronologically.
    return sorted(found, key=lambda p: p.stem.split("_", 1)[-1], reverse=True)


def _render_past_results():
    """Render the browser for evaluations previously saved under ./results."""
    st.markdown("---")
    st.markdown("## ๐ Past Results")
    results_dir = Path("results")
    if not results_dir.exists():
        st.warning(f"๐ญ Results directory not found at: {results_dir.absolute()}")
        st.info("Run an evaluation to create the results directory.")
        return
    result_files = _list_past_result_files(results_dir)
    # Debug info in expander
    with st.expander("๐ Debug Info"):
        st.write(f"**Results directory:** `{results_dir.absolute()}`")
        st.write(f"**Files found:** {len(result_files)}")
        if result_files:
            st.write("**Files:**")
            for f in result_files:
                st.write(f" - {f.name}")
    if not result_files:
        st.info("๐ญ No past evaluations found. Run an evaluation first!")
        return
    st.info(f"๐ Found {len(result_files)} previous evaluation(s)")
    col1, col2 = st.columns([2, 1])
    with col1:
        selected_file = st.selectbox(
            "๐ Select a past evaluation to view",
            options=result_files,
            format_func=lambda x: x.stem.replace("evaluation_", ""),
            key="past_results_select"
        )
    if not selected_file:
        return
    # Load and display selected result
    past_results = load_results_from_file(selected_file)
    if not past_results:
        st.error("Failed to load results file")
        return
    st.markdown(f"**File:** `{selected_file.name}`")
    st.markdown(f"**Timestamp:** {past_results.get('timestamp', 'N/A')}")
    st.markdown(f"**Models:** {', '.join(past_results.get('models', []))}")
    st.markdown(f"**Tasks:** {', '.join(past_results.get('tasks', []))}")
    past_results_df = _results_to_frame(past_results.get("results", []))
    st.markdown("---")
    st.markdown("### Results by Task")
    _render_task_grids(past_results_df, "####", "**{model}**")
    # Download options for past result
    st.markdown("### ๐พ Export Results")
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            label="๐ฅ Download as CSV",
            data=past_results_df.to_csv(index=False),
            file_name=f"{selected_file.stem}.csv",
            mime="text/csv",
            use_container_width=True
        )
    with col2:
        st.download_button(
            label="๐ฅ Download as JSON",
            data=json.dumps(past_results, indent=2),
            file_name=f"{selected_file.stem}.json",
            mime="application/json",
            use_container_width=True
        )


def main():
    """Render the dashboard: sidebar configuration, run controls, current
    results, past results and footer.

    Returns early (rendering only the sidebar) when no API key is configured
    or when no model / no task is selected.
    """
    # Check API key availability
    api_key = get_api_key()
    # Sidebar configuration
    st.sidebar.markdown("## โ๏ธ Evaluation Configuration")
    # Show API key status
    if api_key:
        st.sidebar.success("โ API Key Configured")
    else:
        st.sidebar.error("โ GROQ_API_KEY not found")
        st.sidebar.info(
            "**For Hugging Face Spaces:**\n"
            "1. Go to Space Settings\n"
            "2. Add Secret: GROQ_API_KEY\n"
            "3. Value: Your Groq API key\n"
            "\n"
            "**For Local Development:**\n"
            "1. Create .streamlit/secrets.toml\n"
            "2. Add: GROQ_API_KEY = \"your_key\"\n"
        )
        return
    # Model selection
    st.sidebar.markdown("### ๐ Models")
    selected_models = st.sidebar.multiselect(
        "Select Models to Evaluate",
        options=ALL_MODELS,
        default=DEFAULT_MODELS,
        help="Choose which LLM models to evaluate. First 5 are primary, rest are additional."
    )
    if not selected_models:
        st.sidebar.warning("Please select at least one model")
        return
    # Task selection
    st.sidebar.markdown("### ๐ฏ Tasks")
    tasks = [
        "noise_robustness",
        "negative_rejection",
        "information_integration",
        "counterfactual_robustness"
    ]
    selected_tasks = st.sidebar.multiselect(
        "Select Tasks to Evaluate",
        options=tasks,
        default=tasks,
        help="Choose which RAG abilities to evaluate"
    )
    if not selected_tasks:
        st.sidebar.warning("Please select at least one task")
        return
    # Sample size
    st.sidebar.markdown("### ๐ Sample Size")
    max_samples = st.sidebar.select_slider(
        "Samples per Task",
        options=[1, 5, 10, 20, 50, 100, 300],
        value=5,
        help="Number of samples to evaluate per task"
    )
    # Main content
    col1, col2 = st.columns([3, 1])
    with col1:
        st.markdown("# ๐ RGB RAG Evaluation Dashboard")
    with col2:
        st.markdown(f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    st.markdown("---")
    # Status information
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Models Selected", len(selected_models))
    with col2:
        st.metric("Tasks Selected", len(selected_tasks))
    with col3:
        st.metric("Samples per Task", max_samples)
    with col4:
        st.metric("Total Evaluations", len(selected_models) * len(selected_tasks))
    st.markdown("---")
    # Run evaluation button
    col1, col2, col3 = st.columns([1, 1, 2])
    with col1:
        run_button = st.button("โถ๏ธ Run Evaluation", use_container_width=True, type="primary")
    with col2:
        clear_button = st.button("๐ Clear Results", use_container_width=True)
    # Initialize session state for background evaluation
    if "evaluation_running" not in st.session_state:
        st.session_state.evaluation_running = False
    if "evaluation_thread" not in st.session_state:
        st.session_state.evaluation_thread = None
    if "result_queue" not in st.session_state:
        st.session_state.result_queue = queue.Queue()
    if "evaluation_results" not in st.session_state:
        st.session_state.evaluation_results = None
    if clear_button:
        st.session_state.evaluation_results = None
        st.session_state.evaluation_running = False
        st.rerun()
    if run_button:
        # api_key is known to be set here: the sidebar check above returned otherwise.
        st.session_state.evaluation_running = True
        # Fresh queue so a message from an earlier run cannot be mistaken for this one.
        st.session_state.result_queue = queue.Queue()
        # Start evaluation in background thread
        thread = threading.Thread(
            target=run_evaluation_background,
            args=(selected_models, selected_tasks, max_samples, api_key, st.session_state.result_queue),
            daemon=True
        )
        thread.start()
        st.session_state.evaluation_thread = thread
    # Check if evaluation is running and display status
    if st.session_state.evaluation_running:
        with st.container():
            with st.spinner("โณ Evaluation running in background..."):
                st.info(
                    "**Evaluation Status**: Running\n"
                    "- You can close this page or refresh without interrupting the evaluation\n"
                    "- Results will be automatically displayed when complete\n"
                    "- Check back in a few moments for results\n"
                )
                # Wait briefly for a result; no rerun is triggered on timeout.
                try:
                    result = st.session_state.result_queue.get(timeout=2)
                except queue.Empty:
                    result = None
            if result is None:
                # Don't rerun, just show status message
                st.info("โณ Still evaluating... Please wait or refresh the page in a few moments.")
            else:
                st.session_state.evaluation_running = False
                if result["status"] == "completed":
                    st.session_state.evaluation_results = result["data"]
                    st.success("โ Evaluation completed! Scroll down to view results.")
                    time.sleep(1)  # Brief pause before displaying results
                elif result["status"] == "error":
                    st.error(f"โ Error during evaluation: {result['error']}")
    # Display results if available
    if st.session_state.evaluation_results:
        results_data = st.session_state.evaluation_results
        results_df = _results_to_frame(results_data.get("results", []))
        st.markdown("---")
        st.markdown("## ๐ Evaluation Results by Task")
        _render_task_grids(results_df, "###", "#### {model}")
        _render_visualizations(results_df)
        _render_exports(results_data, results_df)
    # Past Results Section
    _render_past_results()
    # Footer
    st.markdown("---")
    st.markdown(
        "<div style=\"text-align: center; color: #888;\">\n"
        "<p>RGB RAG Evaluation Dashboard | Streamlit UI</p>\n"
        "<p>For more information, see the <a href=\"https://github.com/chen700564/RGB\">RGB Benchmark Repository</a></p>\n"
        "</div>\n",
        unsafe_allow_html=True,
    )
# Script entry point: seed the session-state slot for results, then render the
# page.
if __name__ == "__main__":
    # Initialize session state
    # main() also creates this key when it is missing, so this check is a
    # defensive duplicate.
    if "evaluation_results" not in st.session_state:
        st.session_state.evaluation_results = None
    main()