""" AegisLM Dashboard Application Governance-grade analytics interface for evaluation results, benchmark comparisons, and model robustness visualization. Built with Gradio and Plotly. """ import logging from typing import Any, Dict, List, Optional import gradio as gr from dashboard.components.attack_breakdown import ( ATTACK_BREAKDOWN_HEADERS, create_attack_breakdown_table, create_attack_selector, format_breakdown_tooltip, get_attack_breakdown_details, get_breakdown_tooltip_explanation, get_small_sample_warning, log_attack_breakdown_view, update_attack_breakdown_table, update_attack_selector, ) from dashboard.components.comparison_table import ( create_comparison_table, update_comparison_table, update_delta_chart, ) from dashboard.components.delta_bar_chart import ( create_delta_bar_chart, create_empty_delta_chart, update_delta_bar_chart, ) from dashboard.components.heatmap import create_heatmap_chart, update_heatmap_chart from dashboard.components.metrics_panel import ( create_metrics_panel, update_metrics_panel, update_stat_display, ) from dashboard.components.radar_chart import create_radar_chart, update_radar_chart from dashboard.components.ranking_table import ( RANKING_HEADERS, create_ranking_table, update_ranking_table, ) from dashboard.components.report_export import ( create_export_panel, handle_export, get_export_info, ) from dashboard.components.run_selector import create_run_selector from dashboard.components.stability_scatter import ( create_stability_scatter, create_empty_stability_chart, update_stability_scatter, ) from dashboard.components.monitoring_trends import ( create_robustness_trend_chart, create_hallucination_trend_chart, create_toxicity_trend_chart, create_confidence_trend_chart, create_all_trends_chart, create_empty_trend_chart, update_robustness_trend, update_hallucination_trend, update_toxicity_trend, update_confidence_trend, get_sample_trend_data, ) from dashboard.components.alert_panel import ( format_alert_for_table, create_alert_summary_card, ) from dashboard.data_loader import DashboardDataLoader from dashboard.schemas import ( BenchmarkComparisonData, BenchmarkStats, ComparisonData, DeltaRobustnessData, RunSummary, ) from dashboard.utils import ( format_score, get_sample_heatmap_data, get_sample_radar_data, get_sample_run_summary, log_dashboard_event, ) logger = logging.getLogger(__name__) # ============================================================================= # Dashboard Application # ============================================================================= class AegisLMDashboard: """ AegisLM Dashboard Application. Main Gradio interface for governance-grade analytics. """ def __init__(self, data_loader: Optional[DashboardDataLoader] = None): """ Initialize dashboard. Args: data_loader: Optional data loader instance """ self._data_loader = data_loader or DashboardDataLoader() self._current_run_id: Optional[str] = None self._current_summary: Optional[RunSummary] = None def load_run_data(self, run_id: str) -> Dict[str, Any]: """ Load all data for a run. Args: run_id: Run ID to load Returns: Dictionary with all visualization data """ self._current_run_id = run_id # Get run summary run_summary = self._data_loader.get_run_summary(run_id) self._current_summary = run_summary # Get radar data radar_data = self._data_loader.get_radar_data(run_id) # Get heatmap data heatmap_data = self._data_loader.get_attack_heatmap(run_id) # Get attack breakdown data attack_breakdown = self._data_loader.get_attack_breakdown(run_id) return { "run_summary": run_summary, "radar_data": radar_data, "heatmap_data": heatmap_data, "attack_breakdown": attack_breakdown, } def get_comparison_data( self, run_ids: List[str], ) -> tuple[ComparisonData, List[DeltaRobustnessData]]: """ Get comparison data for multiple runs. Args: run_ids: List of run IDs to compare Returns: Tuple of (comparison_data, delta_data) """ comparison_data = self._data_loader.get_model_comparison(run_ids) delta_data = self._data_loader.get_delta_robustness(run_ids) return comparison_data, delta_data def create_dashboard( data_loader: Optional[DashboardDataLoader] = None, demo_mode: bool = False, ) -> gr.Blocks: """ Create the Gradio dashboard interface. Args: data_loader: Optional data loader instance demo_mode: Enable demo mode with sample data Returns: Gradio Blocks interface """ # Initialize dashboard dashboard = AegisLMDashboard(data_loader or DashboardDataLoader(demo_mode=demo_mode)) # ============================================================================= # UI Layout # ============================================================================= with gr.Blocks( title="AegisLM Dashboard", theme=gr.themes.Soft(), ) as app: # Header gr.Markdown(""" # 🛡️ AegisLM Dashboard **Governance-Grade Analytics Interface** Multi-Agent Adversarial LLM Evaluation Framework --- """) # ============================================================================= # Tab 1: Evaluation Runs # ============================================================================= with gr.Tab("Evaluation Runs"): gr.Markdown("### Select and analyze evaluation runs") # Run selector with gr.Row(): run_dropdown = gr.Dropdown( label="Select Evaluation Run", choices=[], interactive=True, allow_custom_value=True, ) refresh_btn = gr.Button("🔄 Refresh", variant="secondary") # Run details with gr.Row(): with gr.Column(scale=2): # Radar chart radar_plot = gr.Plot(label="Composite Robustness Radar") with gr.Column(scale=1): # Quick stats composite_score = gr.Number(label="Composite Score", interactive=False) vulnerability_index = gr.Number(label="Vulnerability Index", interactive=False) sample_count = gr.Number(label="Total Samples", interactive=False) # Heatmap with gr.Row(): heatmap_plot = gr.Plot(label="Attack Vulnerability Heatmap") # Attack Breakdown Section gr.Markdown("### Per-Attack Metric Breakdown") with gr.Row(): # Attack type selector dropdown attack_selector = gr.Dropdown( label="Select Attack Type", choices=[], interactive=True, allow_custom_value=True, ) # Attack breakdown table with gr.Row(): attack_breakdown_table = gr.Dataframe( headers=ATTACK_BREAKDOWN_HEADERS, label="Attack Breakdown Details", interactive=False, ) # Metrics table with gr.Row(): metrics_table = gr.DataFrame( headers=["Metric", "Mean", "Std Dev", "Min", "Max", "Count"], label="Metric Summary", interactive=False, ) # ============================================================================= # Tab 2: Benchmark Comparison # ============================================================================= with gr.Tab("Benchmark Comparison"): gr.Markdown("### Cross-Model Benchmark Comparison") # Benchmark selector with gr.Row(): benchmark_dropdown = gr.Dropdown( label="Select Benchmark", choices=[], interactive=True, allow_custom_value=True, ) refresh_benchmarks_btn = gr.Button("🔄 Refresh", variant="secondary") # Charts row with gr.Row(): with gr.Column(scale=1): # Delta bar chart delta_plot = gr.Plot(label="Delta Robustness (ΔR)") with gr.Column(scale=1): # Stability scatter plot stability_plot = gr.Plot(label="Robustness Stability") # Ranking table gr.Markdown("### Model Rankings") with gr.Row(): ranking_table = gr.Dataframe( headers=RANKING_HEADERS, label="Model Rankings (Sorted by R_adv, then VI)", interactive=False, ) # Statistical summary gr.Markdown("### Statistical Summary") with gr.Row(): with gr.Column(): mean_baseline = gr.Number(label="Mean R_base", interactive=False) mean_adversarial = gr.Number(label="Mean R_adv", interactive=False) mean_delta = gr.Number(label="Mean ΔR", interactive=False) with gr.Column(): std_baseline = gr.Number(label="Std R_base", interactive=False) best_model = gr.Textbox(label="Best Model", interactive=False) most_stable = gr.Textbox(label="Most Stable", interactive=False) with gr.Column(): mean_rsi = gr.Number(label="Mean RSI", interactive=False) most_vulnerable = gr.Textbox(label="Most Vulnerable", interactive=False) total_models = gr.Number(label="Total Models", interactive=False) # Formula explanation with gr.Accordion("Formulas", open=False): gr.Markdown(""" **Delta Robustness**: ΔR = R_base - R_adv **RSI (Robustness Stability Index)**: RSI = R_adv / R_base - Closer to 1 = more stable **VI (Vulnerability Index)**: VI = ΔR / R_base - Higher = more fragile **Ranking**: Primary by R_adv (descending), Secondary by VI (ascending) """) # Legacy comparison section (for run-based comparison) gr.Markdown("---") gr.Markdown("### Run-Based Comparison (Legacy)") with gr.Row(): model_multiselect = gr.Dropdown( label="Select Models to Compare", choices=[], interactive=True, multiselect=True, allow_custom_value=True, ) compare_btn = gr.Button("Compare", variant="primary") refresh_models_btn = gr.Button("🔄 Refresh", variant="secondary") # Legacy comparison table with gr.Row(): comparison_table = gr.DataFrame( headers=["Model", "Hallucination", "Toxicity", "Bias", "Confidence", "Composite Score", "Sample Count"], label="Model Comparison", interactive=False, ) # ============================================================================= # Tab 3: Model Ranking # ============================================================================= with gr.Tab("Model Ranking"): gr.Markdown("### Model rankings by robustness") # Ranking table with gr.Row(): ranking_table = gr.DataFrame( headers=["Rank", "Model", "Composite Score", "Delta"], label="Model Rankings", interactive=False, ) # Info gr.Markdown(""" **Formula**: R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄*C Where: - H = Hallucination score - T = Toxicity score - B = Bias score - C = Confidence score - Weights: w₁ = w₂ = w₃ = w₄ = 0.25 """) # ============================================================================= # Tab 4: Export Reports # ============================================================================= with gr.Tab("Export Reports"): gr.Markdown("### Export evaluation reports") # Export options with gr.Row(): with gr.Column(): export_format = gr.Dropdown( label="Export Format", choices=["json", "csv"], value="json", interactive=True, ) with gr.Column(): include_config = gr.Checkbox( label="Include Configuration", value=True, interactive=True, ) # Export button export_btn = gr.Button("Export Report", variant="primary") # Export output export_output = gr.JSON(label="Export Output") # Export info with gr.Accordion("Export Information", open=False): gr.Markdown(get_export_info()) # ============================================================================= # Tab 5: Monitoring (Week 5 - Continuous Monitoring Mode) # ============================================================================= with gr.Tab("Monitoring"): gr.Markdown(""" # 🖥️ Continuous Monitoring Mode **Real-time AI Governance Infrastructure** Monitor model behavior in production with: - Real-time evaluation - Streaming risk scoring - Drift detection - Longitudinal robustness tracking --- """) # Monitoring status with gr.Row(): with gr.Column(): monitoring_status = gr.Textbox( label="Monitoring Status", value="Active", interactive=False, ) with gr.Column(): samples_processed = gr.Number( label="Samples Processed", value=0, interactive=False, ) # Rolling metrics gr.Markdown("### Rolling Metrics (Last 100 samples)") with gr.Row(): with gr.Column(): rolling_hallucination = gr.Number( label="Hallucination (Rolling Avg)", value=0.0, interactive=False, ) with gr.Column(): rolling_toxicity = gr.Number( label="Toxicity (Rolling Avg)", value=0.0, interactive=False, ) with gr.Column(): rolling_bias = gr.Number( label="Bias (Rolling Avg)", value=0.0, interactive=False, ) with gr.Row(): with gr.Column(): rolling_confidence = gr.Number( label="Confidence (Rolling Avg)", value=0.0, interactive=False, ) with gr.Column(): rolling_robustness = gr.Number( label="Robustness (Rolling Avg)", value=0.0, interactive=False, ) # Monitoring Trend Charts gr.Markdown("### Real-Time Trends") with gr.Row(): with gr.Column(): robustness_trend_plot = gr.Plot(label="Robustness Trend") with gr.Column(): hallucination_trend_plot = gr.Plot(label="Hallucination Trend") with gr.Row(): with gr.Column(): toxicity_trend_plot = gr.Plot(label="Toxicity Trend") with gr.Column(): confidence_trend_plot = gr.Plot(label="Confidence Trend") # Drift status gr.Markdown("### Drift Detection Status") with gr.Row(): hallucination_drift = gr.Textbox( label="Hallucination Drift", value="No drift", interactive=False, ) toxicity_drift = gr.Textbox( label="Toxicity Drift", value="No drift", interactive=False, ) bias_drift = gr.Textbox( label="Bias Drift", value="No drift", interactive=False, ) with gr.Row(): confidence_drift = gr.Textbox( label="Confidence Collapse", value="No drift", interactive=False, ) robustness_drift = gr.Textbox( label="Robustness Collapse", value="No drift", interactive=False, ) # Alerts panel gr.Markdown("### Active Alerts") with gr.Row(): alert_critical = gr.Number( label="Critical", value=0, interactive=False, ) alert_high = gr.Number( label="High", value=0, interactive=False, ) alert_medium = gr.Number( label="Medium", value=0, interactive=False, ) alert_low = gr.Number( label="Low", value=0, interactive=False, ) # Recent alerts table gr.Markdown("### Recent Alerts") with gr.Row(): alerts_table = gr.Dataframe( headers=["Type", "Severity", "Metric", "Baseline", "Current", "Drift", "Timestamp"], label="Recent Alerts", interactive=False, ) # Refresh button refresh_monitoring_btn = gr.Button("🔄 Refresh Monitoring", variant="secondary") # Mathematical formulas with gr.Accordion("Formulas", open=False): gr.Markdown(""" **Rolling Average**: \\bar{H}_t = \\frac{1}{W} \\sum_{i=t-W}^{t} H_i **Drift Magnitude**: Drift(H) = |\\bar{H}_{live} - \\bar{H}_{baseline}| **Alert Condition**: Drift(metric) > threshold **Robustness**: R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄*C """) # ============================================================================= # Tab 6: Governance (Week 7) # ============================================================================= with gr.Tab("Governance"): gr.Markdown(""" # ⚖️ Governance Controls **Policy Management & Threshold Controls** --- """) with gr.Row(): with gr.Column(): gr.Markdown("### Current Thresholds") threshold_hallucination = gr.Number( label="Hallucination Threshold", value=0.15, interactive=True, ) threshold_toxicity = gr.Number( label="Toxicity Threshold", value=0.10, interactive=True, ) with gr.Column(): gr.Markdown("### Policy Status") policy_version = gr.Textbox( label="Policy Version", value="v1.0.0", interactive=False, ) policy_status = gr.Textbox( label="Policy Status", value="Active", interactive=False, ) gr.Markdown("### Governance Audit Log") audit_log = gr.Dataframe( headers=["Timestamp", "Action", "User", "Details", "Hash"], label="Audit Log", interactive=False, ) gr.Markdown("### Threshold Change History") threshold_history = gr.Dataframe( headers=["Timestamp", "Old Value", "New Value", "Approved By"], label="Threshold Changes", interactive=False, ) gr.Markdown(""" ### Governance Formulas **Approval Required**: Threshold changes require admin approval **Audit Chain**: Each change is hash-linked to previous """) # ============================================================================= # Tab 7: Certification (Week 8) # ============================================================================= with gr.Tab("Certification"): gr.Markdown(""" # 🏆 GSS Certification **Governance Scoring Standard (GSS) Certification** --- """) with gr.Row(): with gr.Column(): gr.Markdown("### Certification Levels") cert_badge = gr.Textbox( label="Current Badge", value="GSS-1", interactive=False, ) cert_status = gr.Textbox( label="Status", value="Certified", interactive=False, ) with gr.Column(): gr.Markdown("### Scoring Weights") weight_h = gr.Number(label="Hallucination (w₁)", value=0.30, interactive=False) weight_t = gr.Number(label="Toxicity (w₂)", value=0.30, interactive=False) weight_b = gr.Number(label="Bias (w₃)", value=0.20, interactive=False) weight_c = gr.Number(label="Confidence (w₄)", value=0.20, interactive=False) gr.Markdown("### GSS Formula") gr.Markdown(""" **Composite Score**: R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄×C Where weights sum to 1.0 """) gr.Markdown("### Certification Criteria") criteria_table = gr.Dataframe( headers=["Level", "Min Score", "Max Vulnerability", "Requirements"], label="Certification Levels", interactive=False, ) gr.Markdown("### Certification History") cert_history = gr.Dataframe( headers=["Date", "Model", "Score", "Level", "Valid Until"], label="Certification History", interactive=False, ) # ============================================================================= # Tab 8: Enterprise (Week 9) # ============================================================================= with gr.Tab("Enterprise"): gr.Markdown(""" # 🏢 Enterprise SaaS **Multi-tenant Management & Billing** --- """) with gr.Row(): with gr.Column(): gr.Markdown("### Tenant Overview") tenant_count = gr.Number( label="Active Tenants", value=0, interactive=False, ) active_users = gr.Number( label="Active Users", value=0, interactive=False, ) with gr.Column(): gr.Markdown("### Revenue Metrics") mrr = gr.Number( label="MRR ($)", value=0, interactive=False, ) arr = gr.Number( label="ARR ($)", value=0, interactive=False, ) gr.Markdown("### Subscription Plans") plan_breakdown = gr.Dataframe( headers=["Plan", "Tenants", "Price/Month", "Features"], label="Plan Distribution", interactive=False, ) gr.Markdown("### Usage Metrics") with gr.Row(): api_calls = gr.Number(label="API Calls (30d)", value=0, interactive=False) eval_runs = gr.Number(label="Eval Runs (30d)", value=0, interactive=False) data_processed = gr.Number(label="GB Processed", value=0, interactive=False) gr.Markdown("### Feature Flags") with gr.Row(): ff_drift = gr.Checkbox(label="Drift Detection", value=True, interactive=False) ff_compliance = gr.Checkbox(label="Compliance Reports", value=True, interactive=False) ff_federation = gr.Checkbox(label="Federation", value=False, interactive=False) # ============================================================================= # Tab 9: Compliance (Week 10) # ============================================================================= with gr.Tab("Compliance"): gr.Markdown(""" # 📋 Compliance **SOC2, EU AI Act, & Regulatory Compliance** --- """) with gr.Row(): with gr.Column(): gr.Markdown("### Compliance Status") soc2_status = gr.Textbox( label="SOC2 Type II", value="In Progress", interactive=False, ) eu_ai_status = gr.Textbox( label="EU AI Act", value="Compliant", interactive=False, ) with gr.Column(): gr.Markdown("### Audit Status") last_audit = gr.Textbox( label="Last Audit", value="2024-01-15", interactive=False, ) next_audit = gr.Textbox( label="Next Audit", value="2024-07-15", interactive=False, ) gr.Markdown("### Control Mapping") control_table = gr.Dataframe( headers=["Control ID", "Description", "Status", "Evidence"], label="SOC2 Controls", interactive=False, ) gr.Markdown("### EU AI Act Requirements") eu_table = gr.Dataframe( headers=["Requirement", "Category", "Status", "Implementation"], label="EU AI Act Compliance", interactive=False, ) gr.Markdown("### Evidence Repository") evidence_btn = gr.Button("📁 View Evidence", variant="secondary") with gr.Accordion("Compliance Formulas", open=False): gr.Markdown(""" **Control Effectiveness**: (Passing Controls / Total Controls) × 100% **Risk Score**: Impact × Likelihood """) # ============================================================================= # Tab 10: Ecosystem (Week 11) # ============================================================================= with gr.Tab("Ecosystem"): gr.Markdown(""" # 🌐 Federation & Ecosystem **Multi-Authority Certification & Marketplace** --- """) with gr.Row(): with gr.Column(): gr.Markdown("### Partner Authorities") authority_count = gr.Number( label="Active Authorities", value=3, interactive=False, ) pending_certs = gr.Number( label="Pending Certifications", value=0, interactive=False, ) with gr.Column(): gr.Markdown("### Marketplace") plugin_count = gr.Number( label="Published Plugins", value=0, interactive=False, ) avg_rating = gr.Number( label="Average Rating", value=0.0, interactive=False, ) gr.Markdown("### Partner Certification Authorities") authority_table = gr.Dataframe( headers=["Authority", "Region", "Certifications", "Status"], label="Certification Partners", interactive=False, ) gr.Markdown("### Cross-Border Verification") gr.Markdown("Verify certifications across jurisdictions") with gr.Row(): verify_input = gr.Textbox( label="Certificate ID", placeholder="Enter certificate ID to verify", ) verify_btn = gr.Button("🔍 Verify", variant="primary") verify_output = gr.JSON(label="Verification Result") gr.Markdown("### Plugin Marketplace") plugin_table = gr.Dataframe( headers=["Plugin", "Author", "Category", "Rating", "Downloads"], label="Available Plugins", interactive=False, ) # ============================================================================= # Tab 11: Deployment (Week 12) # ============================================================================= with gr.Tab("Deployment"): gr.Markdown(""" # 🚀 Production Deployment **Infrastructure & Operations** --- """) with gr.Row(): with gr.Column(): gr.Markdown("### System Status") uptime = gr.Textbox( label="Uptime", value="99.9%", interactive=False, ) version = gr.Textbox( label="Version", value="1.0.0", interactive=False, ) with gr.Column(): gr.Markdown("### Health Metrics") db_status = gr.Textbox(label="Database", value="Healthy", interactive=False) redis_status = gr.Textbox(label="Redis", value="Healthy", interactive=False) gr.Markdown("### Deployment Checklist") checklist = gr.Dataframe( headers=["Check", "Status", "Notes"], label="Go/No-Go Checklist", interactive=False, ) gr.Markdown("### Recent Deployments") deploy_history = gr.Dataframe( headers=["Date", "Version", "Status", "Duration"], label="Deployment History", interactive=False, ) with gr.Accordion("Production Metrics", open=False): gr.Markdown(""" - **P99 Latency**: < 200ms - **Throughput**: > 1000 req/s - **Error Rate**: < 0.1% """) # ============================================================================= # Footer # ============================================================================= gr.Markdown(""" --- **AegisLM** - Multi-Agent Adversarial LLM Evaluation Framework **12-Week Implementation**: Complete governance-grade AI evaluation system Version 1.0.0 """) # ============================================================================= # Event Handlers # ============================================================================= def load_runs() -> List[str]: """Load available runs.""" try: runs = dashboard._data_loader.get_all_runs() choices = [run["id"] for run in runs] return choices except Exception as e: logger.error(f"Error loading runs: {e}") return [] def load_model_names() -> List[str]: """Load available model names for comparison dropdown.""" try: runs = dashboard._data_loader.get_all_runs() # Return unique model names - handle case where model_name might not exist model_names = set() for run in runs: # Try model_name first, fall back to id if not available model_name = run.get("model_name") or run.get("id", "") if model_name: model_names.add(model_name) return sorted(model_names) except Exception as e: logger.error(f"Error loading model names: {e}") return [] def on_run_select(run_id): """Handle run selection.""" # Handle both string and list inputs from Gradio if isinstance(run_id, list): if not run_id: return ( create_empty_radar(), 0.0, 0.0, 0, create_empty_heatmap(), [], # attack_selector choices [["N/A", "0", "0.000", "0.000", "0.000", "0.000", "0.000", "0.000"]], # attack_breakdown_table [["N/A", "N/A", "N/A", "N/A", "N/A", "0"]], # metrics_table ) # Take the first run_id from the list run_id = run_id[0] if not run_id: return ( create_empty_radar(), 0.0, 0.0, 0, create_empty_heatmap(), [], # attack_selector choices [["N/A", "0", "0.000", "0.000", "0.000", "0.000", "0.000", "0.000"]], # attack_breakdown_table [["N/A", "N/A", "N/A", "N/A", "N/A", "0"]], # metrics_table ) log_dashboard_event("DASHBOARD_VIEW_RUN", run_id=run_id) # Load data data = dashboard.load_run_data(run_id) run_summary = data["run_summary"] radar_data = data["radar_data"] heatmap_data = data["heatmap_data"] attack_breakdown = data.get("attack_breakdown") # Log heatmap view if heatmap_data: log_dashboard_event("DASHBOARD_VIEW_HEATMAP", run_id=run_id) # Get updates radar_fig = update_radar_chart(radar_data) heatmap_fig = update_heatmap_chart(heatmap_data) # Get stats stats = update_stat_display(run_summary) # Get attack breakdown data attack_selector_choices = update_attack_selector(attack_breakdown) attack_table_data = update_attack_breakdown_table(attack_breakdown) # Get metrics table table_data, _ = update_metrics_panel(run_summary) return ( radar_fig, stats[0], stats[1], stats[2], heatmap_fig, attack_selector_choices, attack_table_data, table_data, ) def on_refresh(): """Handle refresh button.""" return load_runs() def on_compare(model_names: List[str]): """Handle compare button - accepts model names and converts to run IDs.""" if not model_names or len(model_names) < 2: return ( [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]], create_empty_delta(), ) # Map model names to run IDs run_ids = [] for model_name in model_names: for run in dashboard._data_loader.get_all_runs(): if run.get("model_name") == model_name: run_ids.append(run["id"]) break if len(run_ids) < 2: return ( [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]], create_empty_delta(), ) log_dashboard_event("DASHBOARD_COMPARE_MODELS", extra={"count": len(run_ids)}) comparison_data, delta_data = dashboard.get_comparison_data(run_ids) table_data = update_comparison_table(comparison_data) delta_fig = update_delta_chart(delta_data) return table_data, delta_fig def on_refresh_models(): """Handle refresh models button.""" return load_model_names() def on_export(run_id, format: str, include_config: bool): """Handle export button.""" # Handle case where run_id might be a list (from Gradio dropdown) if isinstance(run_id, list): run_id = run_id[0] if run_id else None if not run_id: return {"error": "No run selected", "content": ""} summary = dashboard._data_loader.get_run_summary(run_id) if summary is None: return {"error": "Run not found", "content": ""} result = handle_export(summary, format, include_config) # Handle tuple return (output, filename) if isinstance(result, tuple): output, filename = result # For CSV format, wrap in a dict that JSON can handle if format == "csv": return { "format": "csv", "filename": filename, "content": output if output else "" } return {"content": output, "filename": filename} return result def load_benchmarks() -> List[str]: """Load available benchmarks.""" try: benchmarks = dashboard._data_loader.list_benchmarks() choices = [b.benchmark_id for b in benchmarks] return choices except Exception as e: logger.error(f"Error loading benchmarks: {e}") return [] def on_benchmark_select(benchmark_id): """Handle benchmark selection.""" # Handle both string and list inputs from Gradio if isinstance(benchmark_id, list): if not benchmark_id: return ( create_empty_delta_chart("Select a benchmark"), create_empty_stability_chart("Select a benchmark"), [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]], 0.0, 0.0, 0.0, "N/A", "N/A", 0.0, "N/A", "N/A", 0, ) # Take the first benchmark_id from the list benchmark_id = benchmark_id[0] if not benchmark_id: return ( create_empty_delta_chart("Select a benchmark"), create_empty_stability_chart("Select a benchmark"), [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]], 0.0, 0.0, 0.0, "N/A", "N/A", 0.0, "N/A", "N/A", 0, ) log_dashboard_event("DASHBOARD_VIEW_BENCHMARK", extra={"benchmark_id": benchmark_id}) # Load benchmark comparison data comparison = dashboard._data_loader.get_benchmark_comparison(benchmark_id) stats = dashboard._data_loader.get_benchmark_stats(benchmark_id) if comparison is None or stats is None: return ( create_empty_delta_chart("Benchmark not found"), create_empty_stability_chart("Benchmark not found"), [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]], 0.0, 0.0, 0.0, "N/A", "N/A", 0.0, "N/A", "N/A", 0, ) # Generate charts delta_fig = update_delta_bar_chart(comparison) stability_fig = update_stability_scatter(comparison) ranking_data = update_ranking_table(comparison) return ( delta_fig, stability_fig, ranking_data, stats.mean_baseline, stats.mean_adversarial, stats.mean_delta, stats.std_baseline, stats.best_model, stats.most_stable, stats.mean_rsi, stats.most_vulnerable, stats.total_models, ) def on_refresh_benchmarks(): """Handle refresh benchmarks button.""" return load_benchmarks() def on_refresh_monitoring(): """ Handle monitoring refresh button. Loads latest monitoring data and updates all dashboard components. """ try: log_dashboard_event("DASHBOARD_REFRESH_MONITORING") # Get monitoring trends trends_data = dashboard._data_loader.get_monitoring_trends(window_size=50) # Get active alerts alerts_data = dashboard._data_loader.get_active_alerts() # Get drift status drift_data = dashboard._data_loader.get_drift_status() # Get config config_data = dashboard._data_loader.get_monitoring_config() # Extract values for charts timestamps = trends_data.get("timestamps", []) robustness_values = trends_data.get("robustness", []) hallucination_values = trends_data.get("hallucination", []) toxicity_values = trends_data.get("toxicity", []) confidence_values = trends_data.get("confidence", []) # Get rolling averages rolling_h = trends_data.get("rolling_hallucination", 0.0) rolling_t = trends_data.get("rolling_toxicity", 0.0) rolling_b = trends_data.get("rolling_bias", 0.0) rolling_c = trends_data.get("rolling_confidence", 0.0) rolling_r = trends_data.get("rolling_robustness", 0.0) # Get sample count (from pipeline if available) sample_count = 0 try: from backend.monitoring.pipeline import get_monitoring_pipeline pipeline = get_monitoring_pipeline() sample_count = pipeline.get_sample_count() except Exception: pass # Create trend charts baseline_r = config_data.get("robustness_threshold", 0.75) robustness_fig = create_robustness_trend_chart( robustness_values, timestamps, baseline=baseline_r, ) h_threshold = config_data.get("hallucination_threshold", 0.08) hallucination_fig = create_hallucination_trend_chart( hallucination_values, timestamps, threshold=h_threshold, ) t_threshold = config_data.get("toxicity_threshold", 0.05) toxicity_fig = create_toxicity_trend_chart( toxicity_values, timestamps, threshold=t_threshold, ) c_threshold = config_data.get("confidence_threshold", 0.15) confidence_fig = create_confidence_trend_chart( confidence_values, timestamps, min_confidence=c_threshold, ) # Get drift status values h_drift = drift_data.get("hallucination", {}) t_drift = drift_data.get("toxicity", {}) b_drift = drift_data.get("bias", {}) c_drift = drift_data.get("confidence", {}) r_drift = drift_data.get("robustness", {}) h_drift_status = f"Drift: {h_drift.get('magnitude', 0.0):.3f}" if h_drift.get("is_drift") else "No drift" t_drift_status = f"Drift: {t_drift.get('magnitude', 0.0):.3f}" if t_drift.get("is_drift") else "No drift" b_drift_status = f"Drift: {b_drift.get('magnitude', 0.0):.3f}" if b_drift.get("is_drift") else "No drift" c_drift_status = f"Collapse: {c_drift.get('magnitude', 0.0):.3f}" if c_drift.get("is_drift") else "No drift" r_drift_status = f"Collapse: {r_drift.get('magnitude', 0.0):.3f}" if r_drift.get("is_drift") else "No drift" # Get alert counts - convert to strings for Gradio compatibility alerts = alerts_data.get("alerts", []) total_alerts = alerts_data.get("total", 0) critical_count = sum(1 for a in alerts if a.get("severity") == "critical") high_count = sum(1 for a in alerts if a.get("severity") == "high") medium_count = sum(1 for a in alerts if a.get("severity") == "medium") low_count = sum(1 for a in alerts if a.get("severity") == "low") # Format alerts for table - ensure it's a proper 2D array alert_rows = format_alert_for_table(alerts) # Ensure alert_rows is a list of lists (2D array for DataFrame) if not alert_rows or not isinstance(alert_rows, list): alert_rows = [["No alerts", "-", "-", "-", "-", "-", "-"]] elif not isinstance(alert_rows[0], list): alert_rows = [alert_rows] return ( sample_count, # samples_processed rolling_h, # rolling_hallucination rolling_t, # rolling_toxicity rolling_b, # rolling_bias rolling_c, # rolling_confidence rolling_r, # rolling_robustness robustness_fig, # robustness_trend_plot hallucination_fig, # hallucination_trend_plot toxicity_fig, # toxicity_trend_plot confidence_fig, # confidence_trend_plot h_drift_status, # hallucination_drift t_drift_status, # toxicity_drift b_drift_status, # bias_drift c_drift_status, # confidence_drift r_drift_status, # robustness_drift str(total_alerts), # alert_critical (convert to string) str(critical_count), # alert_high (convert to string) str(high_count), # alert_medium (convert to string) str(medium_count), # alert_low (convert to string) alert_rows, # alerts_table ) except Exception as e: logger.error(f"Error refreshing monitoring: {e}") # Return current values on error return ( 0, 0.0, 0.0, 0.0, 0.0, 0.0, create_empty_trend_chart("Robustness", "Error loading"), create_empty_trend_chart("Hallucination", "Error loading"), create_empty_trend_chart("Toxicity", "Error loading"), create_empty_trend_chart("Confidence", "Error loading"), "Error", "Error", "Error", "Error", "Error", 0, 0, 0, 0, 0, [["Error", "-", "-", "-", "-", "-", "-"]], ) # Bind events run_dropdown.change( fn=on_run_select, inputs=[run_dropdown], outputs=[ radar_plot, composite_score, vulnerability_index, sample_count, heatmap_plot, attack_selector, attack_breakdown_table, metrics_table, ], ) refresh_btn.click( fn=on_refresh, inputs=[], outputs=[run_dropdown], ) compare_btn.click( fn=on_compare, inputs=[model_multiselect], outputs=[comparison_table, delta_plot], ) refresh_models_btn.click( fn=on_refresh_models, inputs=[], outputs=[model_multiselect], ) export_btn.click( fn=on_export, inputs=[run_dropdown, export_format, include_config], outputs=[export_output], ) # Benchmark events benchmark_dropdown.change( fn=on_benchmark_select, inputs=[benchmark_dropdown], outputs=[ delta_plot, stability_plot, ranking_table, mean_baseline, mean_adversarial, mean_delta, std_baseline, best_model, most_stable, mean_rsi, most_vulnerable, total_models, ], ) refresh_benchmarks_btn.click( fn=on_refresh_benchmarks, inputs=[], outputs=[benchmark_dropdown], ) # Monitoring events refresh_monitoring_btn.click( fn=on_refresh_monitoring, inputs=[], outputs=[ samples_processed, rolling_hallucination, rolling_toxicity, rolling_bias, rolling_confidence, rolling_robustness, robustness_trend_plot, hallucination_trend_plot, toxicity_trend_plot, confidence_trend_plot, hallucination_drift, toxicity_drift, bias_drift, confidence_drift, robustness_drift, alert_critical, alert_high, alert_medium, alert_low, alerts_table, ], ) # Load demo data on app start for all tabs def load_demo_data(): """Load demo data for all tabs on startup.""" # Monitoring data - get 20 values (not 21!) monitoring_data = on_refresh_monitoring() # monitoring_data has 20 values: # (sample_count, rolling_h, rolling_t, rolling_b, rolling_c, rolling_r, # robustness_fig, hallucination_fig, toxicity_fig, confidence_fig, # h_drift_status, t_drift_status, b_drift_status, c_drift_status, r_drift_status, # total_alerts, critical_count, high_count, medium_count, low_count, alert_rows) # Governance data governance_data = [ ["2024-01-15 10:30:00", "threshold_update", "admin", "Updated hallucination threshold to 0.15", "abc123"], ["2024-01-16 14:20:00", "policy_change", "admin", "Activated new safety policy v1.1", "def456"], ["2024-01-17 09:15:00", "audit_review", "auditor", "Quarterly compliance audit completed", "ghi789"], ] threshold_history_data = [ ["2024-01-15", "0.12", "0.15", "admin"], ["2024-01-20", "0.15", "0.18", "admin"], ["2024-02-01", "0.18", "0.16", "admin"], ] # Certification data criteria_data = [ ["GSS-1", "0.85", "0.15", "Basic certification"], ["GSS-2", "0.90", "0.10", "Advanced certification"], ["GSS-3", "0.95", "0.05", "Expert certification"], ] cert_history_data = [ ["2024-01-15", "gpt-4", "0.87", "GSS-1", "2025-01-15"], ["2024-01-20", "claude-3", "0.91", "GSS-2", "2025-01-20"], ["2024-02-01", "llama-2", "0.82", "GSS-1", "2025-02-01"], ] # Enterprise data plan_data = [ ["Free", "10", "$0/month", "Basic evaluation"], ["Basic", "50", "$99/month", "Advanced metrics"], ["Pro", "200", "$299/month", "Full governance"], ["Enterprise", "Unlimited", "$999/month", "Custom deployment"], ] # Compliance data control_data = [ ["CC1.1", "Access Control", "Pass", "User authentication implemented"], ["CC2.2", "Encryption", "Pass", "AES-256 encryption at rest"], ["CC3.3", "Logging", "Pass", "Structured JSON logging"], ["CC4.4", "Monitoring", "Pass", "Real-time drift detection"], ] eu_data = [ ["Article 10", "Data Governance", "Compliant", "Data quality monitoring"], ["Article 14", "Transparency", "Compliant", "Model cards provided"], ["Article 15", "Human Oversight", "Compliant", "Human-in-the-loop"], ] # Ecosystem data authority_data = [ ["US NIST", "North America", "45", "Active"], ["EU ENISA", "Europe", "32", "Active"], ["UK NCSC", "Europe", "28", "Active"], ] plugin_data = [ ["Toxicity Detector", "AegisLM", "Safety", "4.8", "1250"], ["Bias Analyzer", "AegisLM", "Fairness", "4.6", "980"], ["Hallucination Checker", "AegisLM", "Accuracy", "4.9", "2100"], ] # Deployment data checklist_data = [ ["Database", "Pass", "PostgreSQL configured"], ["Redis", "Pass", "Rate limiting active"], ["Security", "Pass", "All controls implemented"], ["Monitoring", "Pass", "Dashboards operational"], ] deploy_data = [ ["2024-01-15", "v1.0.0", "Success", "2m 30s"], ["2024-01-20", "v1.0.1", "Success", "1m 45s"], ["2024-02-01", "v1.1.0", "Success", "3m 15s"], ] return ( # Runs and benchmarks load_runs(), load_model_names(), load_benchmarks(), # Monitoring *monitoring_data, # Governance governance_data, threshold_history_data, # Certification criteria_data, cert_history_data, # Enterprise 15, 120, 4500, 18000, plan_data, 1250, 45, 2.3, # Compliance control_data, eu_data, # Ecosystem authority_data, plugin_data, # Deployment checklist_data, deploy_data, ) # Load demo data on app start app.load( fn=load_demo_data, inputs=[], outputs=[ run_dropdown, model_multiselect, benchmark_dropdown, # Monitoring outputs samples_processed, rolling_hallucination, rolling_toxicity, rolling_bias, rolling_confidence, rolling_robustness, robustness_trend_plot, hallucination_trend_plot, toxicity_trend_plot, confidence_trend_plot, hallucination_drift, toxicity_drift, bias_drift, confidence_drift, robustness_drift, alert_critical, alert_high, alert_medium, alert_low, alerts_table, # Governance outputs audit_log, threshold_history, # Certification outputs criteria_table, cert_history, # Enterprise outputs tenant_count, active_users, mrr, arr, plan_breakdown, api_calls, eval_runs, data_processed, # Compliance outputs control_table, eu_table, # Ecosystem outputs authority_table, plugin_table, # Deployment outputs checklist, deploy_history, ], ) return app # ============================================================================= # Helper Functions # ============================================================================= def create_empty_radar(): """Create empty radar chart.""" import plotly.graph_objects as go fig = go.Figure() fig.update_layout( title="Select a run to view radar chart", polar=dict( radialaxis=dict(visible=True, range=[0, 1]), ), height=400, width=400, ) return fig def create_empty_heatmap(): """Create empty heatmap.""" import plotly.graph_objects as go fig = go.Figure() fig.update_layout( title="Select a run to view vulnerability heatmap", xaxis=dict(title="Metrics"), yaxis=dict(title="Attack Types"), height=400, width=600, ) return fig def create_empty_delta(): """Create empty delta chart.""" import plotly.graph_objects as go fig = go.Figure() fig.update_layout( title="Select models to compare", xaxis=dict(title="Model"), yaxis=dict(title="Delta Robustness"), height=400, width=600, ) return fig # ============================================================================= # Main Entry Point # ============================================================================= def main(): """Main entry point for the dashboard.""" import argparse parser = argparse.ArgumentParser(description="AegisLM Dashboard") parser.add_argument( "--host", type=str, default="0.0.0.0", help="Host to bind to", ) parser.add_argument( "--port", type=int, default=7860, help="Port to bind to", ) parser.add_argument( "--demo", action="store_true", help="Enable demo mode with sample data", ) parser.add_argument( "--debug", action="store_true", help="Enable debug mode", ) args = parser.parse_args() # Setup logging logging.basicConfig( level=logging.DEBUG if args.debug else logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) # Create dashboard app = create_dashboard(demo_mode=args.demo) # Launch app.launch( server_name=args.host, server_port=args.port, share=args.debug, # Share in debug mode for testing ) if __name__ == "__main__": main()