| """ |
| AegisLM Dashboard Application |
| |
| Governance-grade analytics interface for evaluation results, |
| benchmark comparisons, and model robustness visualization. |
| |
| Built with Gradio and Plotly. |
| """ |
|
|
| import logging |
| from typing import Any, Dict, List, Optional |
|
|
| import gradio as gr |
|
|
| from dashboard.components.attack_breakdown import ( |
| ATTACK_BREAKDOWN_HEADERS, |
| create_attack_breakdown_table, |
| create_attack_selector, |
| format_breakdown_tooltip, |
| get_attack_breakdown_details, |
| get_breakdown_tooltip_explanation, |
| get_small_sample_warning, |
| log_attack_breakdown_view, |
| update_attack_breakdown_table, |
| update_attack_selector, |
| ) |
| from dashboard.components.comparison_table import ( |
| create_comparison_table, |
| update_comparison_table, |
| update_delta_chart, |
| ) |
| from dashboard.components.delta_bar_chart import ( |
| create_delta_bar_chart, |
| create_empty_delta_chart, |
| update_delta_bar_chart, |
| ) |
| from dashboard.components.heatmap import create_heatmap_chart, update_heatmap_chart |
| from dashboard.components.metrics_panel import ( |
| create_metrics_panel, |
| update_metrics_panel, |
| update_stat_display, |
| ) |
| from dashboard.components.radar_chart import create_radar_chart, update_radar_chart |
| from dashboard.components.ranking_table import ( |
| RANKING_HEADERS, |
| create_ranking_table, |
| update_ranking_table, |
| ) |
| from dashboard.components.report_export import ( |
| create_export_panel, |
| handle_export, |
| get_export_info, |
| ) |
| from dashboard.components.run_selector import create_run_selector |
| from dashboard.components.stability_scatter import ( |
| create_stability_scatter, |
| create_empty_stability_chart, |
| update_stability_scatter, |
| ) |
| from dashboard.components.monitoring_trends import ( |
| create_robustness_trend_chart, |
| create_hallucination_trend_chart, |
| create_toxicity_trend_chart, |
| create_confidence_trend_chart, |
| create_all_trends_chart, |
| create_empty_trend_chart, |
| update_robustness_trend, |
| update_hallucination_trend, |
| update_toxicity_trend, |
| update_confidence_trend, |
| get_sample_trend_data, |
| ) |
| from dashboard.components.alert_panel import ( |
| format_alert_for_table, |
| create_alert_summary_card, |
| ) |
| from dashboard.data_loader import DashboardDataLoader |
| from dashboard.schemas import ( |
| BenchmarkComparisonData, |
| BenchmarkStats, |
| ComparisonData, |
| DeltaRobustnessData, |
| RunSummary, |
| ) |
| from dashboard.utils import ( |
| format_score, |
| get_sample_heatmap_data, |
| get_sample_radar_data, |
| get_sample_run_summary, |
| log_dashboard_event, |
| ) |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| |
| |
| |
|
|
|
|
class AegisLMDashboard:
    """
    Data facade behind the AegisLM Gradio dashboard.

    Wraps a data loader and remembers the most recently loaded run so the
    UI callbacks can go through a single object.
    """

    def __init__(self, data_loader: Optional[DashboardDataLoader] = None):
        """
        Initialize dashboard.

        Args:
            data_loader: Loader used for every query. When ``None`` a
                ``DashboardDataLoader()`` with default arguments is created.
        """
        # Explicit None check: a caller-supplied loader must never be swapped
        # out just because the object happens to evaluate as falsy.
        self._data_loader = (
            data_loader if data_loader is not None else DashboardDataLoader()
        )
        self._current_run_id: Optional[str] = None
        self._current_summary: Optional[RunSummary] = None

    def load_run_data(self, run_id: str) -> Dict[str, Any]:
        """
        Load all data for a run.

        Queries the loader for the run summary, radar data, attack heatmap
        and attack breakdown, in that order.

        Args:
            run_id: Run ID to load

        Returns:
            Dictionary with the keys ``run_summary``, ``radar_data``,
            ``heatmap_data`` and ``attack_breakdown``.

        Raises:
            Whatever the loader raises. In that case the remembered run id
            and summary are left exactly as they were before the call.
        """
        loader = self._data_loader

        run_summary = loader.get_run_summary(run_id)
        radar_data = loader.get_radar_data(run_id)
        heatmap_data = loader.get_attack_heatmap(run_id)
        attack_breakdown = loader.get_attack_breakdown(run_id)

        # Commit the "current run" state only once every query has succeeded,
        # so a failing load cannot leave the id and the summary out of sync.
        self._current_run_id = run_id
        self._current_summary = run_summary

        return {
            "run_summary": run_summary,
            "radar_data": radar_data,
            "heatmap_data": heatmap_data,
            "attack_breakdown": attack_breakdown,
        }

    def get_comparison_data(
        self,
        run_ids: List[str],
    ) -> tuple[ComparisonData, List[DeltaRobustnessData]]:
        """
        Get comparison data for multiple runs.

        Args:
            run_ids: List of run IDs to compare

        Returns:
            Tuple of (comparison_data, delta_data) as returned by the
            loader's ``get_model_comparison`` and ``get_delta_robustness``.
        """
        comparison_data = self._data_loader.get_model_comparison(run_ids)
        delta_data = self._data_loader.get_delta_robustness(run_ids)
        return comparison_data, delta_data
|
|
|
|
| def create_dashboard( |
| data_loader: Optional[DashboardDataLoader] = None, |
| demo_mode: bool = False, |
| ) -> gr.Blocks: |
| """ |
| Create the Gradio dashboard interface. |
| |
| Args: |
| data_loader: Optional data loader instance |
| demo_mode: Enable demo mode with sample data |
| |
| Returns: |
| Gradio Blocks interface |
| """ |
| |
| dashboard = AegisLMDashboard(data_loader or DashboardDataLoader(demo_mode=demo_mode)) |
| |
| |
| |
| |
| |
| with gr.Blocks( |
| title="AegisLM Dashboard", |
| theme=gr.themes.Soft(), |
| ) as app: |
| |
| |
| gr.Markdown(""" |
| # 🛡️ AegisLM Dashboard |
| |
| **Governance-Grade Analytics Interface** |
| |
| Multi-Agent Adversarial LLM Evaluation Framework |
| |
| --- |
| """) |
| |
| |
| |
| |
| |
| with gr.Tab("Evaluation Runs"): |
| gr.Markdown("### Select and analyze evaluation runs") |
| |
| |
| with gr.Row(): |
| run_dropdown = gr.Dropdown( |
| label="Select Evaluation Run", |
| choices=[], |
| interactive=True, |
| allow_custom_value=True, |
| ) |
| refresh_btn = gr.Button("🔄 Refresh", variant="secondary") |
| |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| |
| radar_plot = gr.Plot(label="Composite Robustness Radar") |
| with gr.Column(scale=1): |
| |
| composite_score = gr.Number(label="Composite Score", interactive=False) |
| vulnerability_index = gr.Number(label="Vulnerability Index", interactive=False) |
| sample_count = gr.Number(label="Total Samples", interactive=False) |
| |
| |
| with gr.Row(): |
| heatmap_plot = gr.Plot(label="Attack Vulnerability Heatmap") |
| |
| |
| gr.Markdown("### Per-Attack Metric Breakdown") |
| |
| with gr.Row(): |
| |
| attack_selector = gr.Dropdown( |
| label="Select Attack Type", |
| choices=[], |
| interactive=True, |
| allow_custom_value=True, |
| ) |
| |
| |
| with gr.Row(): |
| attack_breakdown_table = gr.Dataframe( |
| headers=ATTACK_BREAKDOWN_HEADERS, |
| label="Attack Breakdown Details", |
| interactive=False, |
| ) |
| |
| |
| with gr.Row(): |
| metrics_table = gr.DataFrame( |
| headers=["Metric", "Mean", "Std Dev", "Min", "Max", "Count"], |
| label="Metric Summary", |
| interactive=False, |
| ) |
| |
| |
| |
| |
| |
| with gr.Tab("Benchmark Comparison"): |
| gr.Markdown("### Cross-Model Benchmark Comparison") |
| |
| |
| with gr.Row(): |
| benchmark_dropdown = gr.Dropdown( |
| label="Select Benchmark", |
| choices=[], |
| interactive=True, |
| allow_custom_value=True, |
| ) |
| refresh_benchmarks_btn = gr.Button("🔄 Refresh", variant="secondary") |
| |
| |
| with gr.Row(): |
| with gr.Column(scale=1): |
| |
| delta_plot = gr.Plot(label="Delta Robustness (ΔR)") |
| with gr.Column(scale=1): |
| |
| stability_plot = gr.Plot(label="Robustness Stability") |
| |
| |
| gr.Markdown("### Model Rankings") |
| with gr.Row(): |
| ranking_table = gr.Dataframe( |
| headers=RANKING_HEADERS, |
| label="Model Rankings (Sorted by R_adv, then VI)", |
| interactive=False, |
| ) |
| |
| |
| gr.Markdown("### Statistical Summary") |
| with gr.Row(): |
| with gr.Column(): |
| mean_baseline = gr.Number(label="Mean R_base", interactive=False) |
| mean_adversarial = gr.Number(label="Mean R_adv", interactive=False) |
| mean_delta = gr.Number(label="Mean ΔR", interactive=False) |
| with gr.Column(): |
| std_baseline = gr.Number(label="Std R_base", interactive=False) |
| best_model = gr.Textbox(label="Best Model", interactive=False) |
| most_stable = gr.Textbox(label="Most Stable", interactive=False) |
| with gr.Column(): |
| mean_rsi = gr.Number(label="Mean RSI", interactive=False) |
| most_vulnerable = gr.Textbox(label="Most Vulnerable", interactive=False) |
| total_models = gr.Number(label="Total Models", interactive=False) |
| |
| |
| with gr.Accordion("Formulas", open=False): |
| gr.Markdown(""" |
| **Delta Robustness**: ΔR = R_base - R_adv |
| |
| **RSI (Robustness Stability Index)**: RSI = R_adv / R_base |
| - Closer to 1 = more stable |
| |
| **VI (Vulnerability Index)**: VI = ΔR / R_base |
| - Higher = more fragile |
| |
| **Ranking**: Primary by R_adv (descending), Secondary by VI (ascending) |
| """) |
| |
| |
| gr.Markdown("---") |
| gr.Markdown("### Run-Based Comparison (Legacy)") |
| with gr.Row(): |
| model_multiselect = gr.Dropdown( |
| label="Select Models to Compare", |
| choices=[], |
| interactive=True, |
| multiselect=True, |
| allow_custom_value=True, |
| ) |
| compare_btn = gr.Button("Compare", variant="primary") |
| refresh_models_btn = gr.Button("🔄 Refresh", variant="secondary") |
| |
| |
| with gr.Row(): |
| comparison_table = gr.DataFrame( |
| headers=["Model", "Hallucination", "Toxicity", "Bias", "Confidence", "Composite Score", "Sample Count"], |
| label="Model Comparison", |
| interactive=False, |
| ) |
| |
| |
| |
| |
| |
| with gr.Tab("Model Ranking"): |
| gr.Markdown("### Model rankings by robustness") |
| |
| |
| with gr.Row(): |
| ranking_table = gr.DataFrame( |
| headers=["Rank", "Model", "Composite Score", "Delta"], |
| label="Model Rankings", |
| interactive=False, |
| ) |
| |
| |
| gr.Markdown(""" |
| **Formula**: R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄*C |
| |
| Where: |
| - H = Hallucination score |
| - T = Toxicity score |
| - B = Bias score |
| - C = Confidence score |
| - Weights: w₁ = w₂ = w₃ = w₄ = 0.25 |
| """) |
| |
| |
| |
| |
| |
| with gr.Tab("Export Reports"): |
| gr.Markdown("### Export evaluation reports") |
| |
| |
| with gr.Row(): |
| with gr.Column(): |
| export_format = gr.Dropdown( |
| label="Export Format", |
| choices=["json", "csv"], |
| value="json", |
| interactive=True, |
| ) |
| with gr.Column(): |
| include_config = gr.Checkbox( |
| label="Include Configuration", |
| value=True, |
| interactive=True, |
| ) |
| |
| |
| export_btn = gr.Button("Export Report", variant="primary") |
| |
| |
| export_output = gr.JSON(label="Export Output") |
| |
| |
| with gr.Accordion("Export Information", open=False): |
| gr.Markdown(get_export_info()) |
| |
| |
| |
| |
| |
| with gr.Tab("Monitoring"): |
| gr.Markdown(""" |
| # 🖥️ Continuous Monitoring Mode |
| |
| **Real-time AI Governance Infrastructure** |
| |
| Monitor model behavior in production with: |
| - Real-time evaluation |
| - Streaming risk scoring |
| - Drift detection |
| - Longitudinal robustness tracking |
| |
| --- |
| """) |
| |
| |
| with gr.Row(): |
| with gr.Column(): |
| monitoring_status = gr.Textbox( |
| label="Monitoring Status", |
| value="Active", |
| interactive=False, |
| ) |
| with gr.Column(): |
| samples_processed = gr.Number( |
| label="Samples Processed", |
| value=0, |
| interactive=False, |
| ) |
| |
| |
| gr.Markdown("### Rolling Metrics (Last 100 samples)") |
| |
| with gr.Row(): |
| with gr.Column(): |
| rolling_hallucination = gr.Number( |
| label="Hallucination (Rolling Avg)", |
| value=0.0, |
| interactive=False, |
| ) |
| with gr.Column(): |
| rolling_toxicity = gr.Number( |
| label="Toxicity (Rolling Avg)", |
| value=0.0, |
| interactive=False, |
| ) |
| with gr.Column(): |
| rolling_bias = gr.Number( |
| label="Bias (Rolling Avg)", |
| value=0.0, |
| interactive=False, |
| ) |
| |
| with gr.Row(): |
| with gr.Column(): |
| rolling_confidence = gr.Number( |
| label="Confidence (Rolling Avg)", |
| value=0.0, |
| interactive=False, |
| ) |
| with gr.Column(): |
| rolling_robustness = gr.Number( |
| label="Robustness (Rolling Avg)", |
| value=0.0, |
| interactive=False, |
| ) |
| |
| |
| gr.Markdown("### Real-Time Trends") |
| |
| with gr.Row(): |
| with gr.Column(): |
| robustness_trend_plot = gr.Plot(label="Robustness Trend") |
| with gr.Column(): |
| hallucination_trend_plot = gr.Plot(label="Hallucination Trend") |
| |
| with gr.Row(): |
| with gr.Column(): |
| toxicity_trend_plot = gr.Plot(label="Toxicity Trend") |
| with gr.Column(): |
| confidence_trend_plot = gr.Plot(label="Confidence Trend") |
| |
| |
| gr.Markdown("### Drift Detection Status") |
| |
| with gr.Row(): |
| hallucination_drift = gr.Textbox( |
| label="Hallucination Drift", |
| value="No drift", |
| interactive=False, |
| ) |
| toxicity_drift = gr.Textbox( |
| label="Toxicity Drift", |
| value="No drift", |
| interactive=False, |
| ) |
| bias_drift = gr.Textbox( |
| label="Bias Drift", |
| value="No drift", |
| interactive=False, |
| ) |
| |
| with gr.Row(): |
| confidence_drift = gr.Textbox( |
| label="Confidence Collapse", |
| value="No drift", |
| interactive=False, |
| ) |
| robustness_drift = gr.Textbox( |
| label="Robustness Collapse", |
| value="No drift", |
| interactive=False, |
| ) |
| |
| |
| gr.Markdown("### Active Alerts") |
| |
| with gr.Row(): |
| alert_critical = gr.Number( |
| label="Critical", |
| value=0, |
| interactive=False, |
| ) |
| alert_high = gr.Number( |
| label="High", |
| value=0, |
| interactive=False, |
| ) |
| alert_medium = gr.Number( |
| label="Medium", |
| value=0, |
| interactive=False, |
| ) |
| alert_low = gr.Number( |
| label="Low", |
| value=0, |
| interactive=False, |
| ) |
| |
| |
| gr.Markdown("### Recent Alerts") |
| with gr.Row(): |
| alerts_table = gr.Dataframe( |
| headers=["Type", "Severity", "Metric", "Baseline", "Current", "Drift", "Timestamp"], |
| label="Recent Alerts", |
| interactive=False, |
| ) |
| |
| |
| refresh_monitoring_btn = gr.Button("🔄 Refresh Monitoring", variant="secondary") |
| |
| |
| with gr.Accordion("Formulas", open=False): |
| gr.Markdown(""" |
| **Rolling Average**: |
| \\bar{H}_t = \\frac{1}{W} \\sum_{i=t-W}^{t} H_i |
| |
| **Drift Magnitude**: |
| Drift(H) = |\\bar{H}_{live} - \\bar{H}_{baseline}| |
| |
| **Alert Condition**: |
| Drift(metric) > threshold |
| |
| **Robustness**: |
| R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄*C |
| """) |
| |
| |
| |
| |
| |
| with gr.Tab("Governance"): |
| gr.Markdown(""" |
| # ⚖️ Governance Controls |
| |
| **Policy Management & Threshold Controls** |
| |
| --- |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| gr.Markdown("### Current Thresholds") |
| threshold_hallucination = gr.Number( |
| label="Hallucination Threshold", |
| value=0.15, |
| interactive=True, |
| ) |
| threshold_toxicity = gr.Number( |
| label="Toxicity Threshold", |
| value=0.10, |
| interactive=True, |
| ) |
| with gr.Column(): |
| gr.Markdown("### Policy Status") |
| policy_version = gr.Textbox( |
| label="Policy Version", |
| value="v1.0.0", |
| interactive=False, |
| ) |
| policy_status = gr.Textbox( |
| label="Policy Status", |
| value="Active", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Governance Audit Log") |
| audit_log = gr.Dataframe( |
| headers=["Timestamp", "Action", "User", "Details", "Hash"], |
| label="Audit Log", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Threshold Change History") |
| threshold_history = gr.Dataframe( |
| headers=["Timestamp", "Old Value", "New Value", "Approved By"], |
| label="Threshold Changes", |
| interactive=False, |
| ) |
| |
| gr.Markdown(""" |
| ### Governance Formulas |
| |
| **Approval Required**: Threshold changes require admin approval |
| |
| **Audit Chain**: Each change is hash-linked to previous |
| """) |
| |
| |
| |
| |
| |
| with gr.Tab("Certification"): |
| gr.Markdown(""" |
| # 🏆 GSS Certification |
| |
| **Governance Scoring Standard (GSS) Certification** |
| |
| --- |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| gr.Markdown("### Certification Levels") |
| cert_badge = gr.Textbox( |
| label="Current Badge", |
| value="GSS-1", |
| interactive=False, |
| ) |
| cert_status = gr.Textbox( |
| label="Status", |
| value="Certified", |
| interactive=False, |
| ) |
| with gr.Column(): |
| gr.Markdown("### Scoring Weights") |
| weight_h = gr.Number(label="Hallucination (w₁)", value=0.30, interactive=False) |
| weight_t = gr.Number(label="Toxicity (w₂)", value=0.30, interactive=False) |
| weight_b = gr.Number(label="Bias (w₃)", value=0.20, interactive=False) |
| weight_c = gr.Number(label="Confidence (w₄)", value=0.20, interactive=False) |
| |
| gr.Markdown("### GSS Formula") |
| gr.Markdown(""" |
| **Composite Score**: R = w₁(1-H) + w₂(1-T) + w₃(1-B) + w₄×C |
| |
| Where weights sum to 1.0 |
| """) |
| |
| gr.Markdown("### Certification Criteria") |
| criteria_table = gr.Dataframe( |
| headers=["Level", "Min Score", "Max Vulnerability", "Requirements"], |
| label="Certification Levels", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Certification History") |
| cert_history = gr.Dataframe( |
| headers=["Date", "Model", "Score", "Level", "Valid Until"], |
| label="Certification History", |
| interactive=False, |
| ) |
| |
| |
| |
| |
| |
| with gr.Tab("Enterprise"): |
| gr.Markdown(""" |
| # 🏢 Enterprise SaaS |
| |
| **Multi-tenant Management & Billing** |
| |
| --- |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| gr.Markdown("### Tenant Overview") |
| tenant_count = gr.Number( |
| label="Active Tenants", |
| value=0, |
| interactive=False, |
| ) |
| active_users = gr.Number( |
| label="Active Users", |
| value=0, |
| interactive=False, |
| ) |
| with gr.Column(): |
| gr.Markdown("### Revenue Metrics") |
| mrr = gr.Number( |
| label="MRR ($)", |
| value=0, |
| interactive=False, |
| ) |
| arr = gr.Number( |
| label="ARR ($)", |
| value=0, |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Subscription Plans") |
| plan_breakdown = gr.Dataframe( |
| headers=["Plan", "Tenants", "Price/Month", "Features"], |
| label="Plan Distribution", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Usage Metrics") |
| with gr.Row(): |
| api_calls = gr.Number(label="API Calls (30d)", value=0, interactive=False) |
| eval_runs = gr.Number(label="Eval Runs (30d)", value=0, interactive=False) |
| data_processed = gr.Number(label="GB Processed", value=0, interactive=False) |
| |
| gr.Markdown("### Feature Flags") |
| with gr.Row(): |
| ff_drift = gr.Checkbox(label="Drift Detection", value=True, interactive=False) |
| ff_compliance = gr.Checkbox(label="Compliance Reports", value=True, interactive=False) |
| ff_federation = gr.Checkbox(label="Federation", value=False, interactive=False) |
| |
| |
| |
| |
| |
| with gr.Tab("Compliance"): |
| gr.Markdown(""" |
| # 📋 Compliance |
| |
| **SOC2, EU AI Act, & Regulatory Compliance** |
| |
| --- |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| gr.Markdown("### Compliance Status") |
| soc2_status = gr.Textbox( |
| label="SOC2 Type II", |
| value="In Progress", |
| interactive=False, |
| ) |
| eu_ai_status = gr.Textbox( |
| label="EU AI Act", |
| value="Compliant", |
| interactive=False, |
| ) |
| with gr.Column(): |
| gr.Markdown("### Audit Status") |
| last_audit = gr.Textbox( |
| label="Last Audit", |
| value="2024-01-15", |
| interactive=False, |
| ) |
| next_audit = gr.Textbox( |
| label="Next Audit", |
| value="2024-07-15", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Control Mapping") |
| control_table = gr.Dataframe( |
| headers=["Control ID", "Description", "Status", "Evidence"], |
| label="SOC2 Controls", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### EU AI Act Requirements") |
| eu_table = gr.Dataframe( |
| headers=["Requirement", "Category", "Status", "Implementation"], |
| label="EU AI Act Compliance", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Evidence Repository") |
| evidence_btn = gr.Button("📁 View Evidence", variant="secondary") |
| |
| with gr.Accordion("Compliance Formulas", open=False): |
| gr.Markdown(""" |
| **Control Effectiveness**: (Passing Controls / Total Controls) × 100% |
| |
| **Risk Score**: Impact × Likelihood |
| """) |
| |
| |
| |
| |
| |
| with gr.Tab("Ecosystem"): |
| gr.Markdown(""" |
| # 🌐 Federation & Ecosystem |
| |
| **Multi-Authority Certification & Marketplace** |
| |
| --- |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| gr.Markdown("### Partner Authorities") |
| authority_count = gr.Number( |
| label="Active Authorities", |
| value=3, |
| interactive=False, |
| ) |
| pending_certs = gr.Number( |
| label="Pending Certifications", |
| value=0, |
| interactive=False, |
| ) |
| with gr.Column(): |
| gr.Markdown("### Marketplace") |
| plugin_count = gr.Number( |
| label="Published Plugins", |
| value=0, |
| interactive=False, |
| ) |
| avg_rating = gr.Number( |
| label="Average Rating", |
| value=0.0, |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Partner Certification Authorities") |
| authority_table = gr.Dataframe( |
| headers=["Authority", "Region", "Certifications", "Status"], |
| label="Certification Partners", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Cross-Border Verification") |
| gr.Markdown("Verify certifications across jurisdictions") |
| |
| with gr.Row(): |
| verify_input = gr.Textbox( |
| label="Certificate ID", |
| placeholder="Enter certificate ID to verify", |
| ) |
| verify_btn = gr.Button("🔍 Verify", variant="primary") |
| |
| verify_output = gr.JSON(label="Verification Result") |
| |
| gr.Markdown("### Plugin Marketplace") |
| plugin_table = gr.Dataframe( |
| headers=["Plugin", "Author", "Category", "Rating", "Downloads"], |
| label="Available Plugins", |
| interactive=False, |
| ) |
| |
| |
| |
| |
| |
| with gr.Tab("Deployment"): |
| gr.Markdown(""" |
| # 🚀 Production Deployment |
| |
| **Infrastructure & Operations** |
| |
| --- |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| gr.Markdown("### System Status") |
| uptime = gr.Textbox( |
| label="Uptime", |
| value="99.9%", |
| interactive=False, |
| ) |
| version = gr.Textbox( |
| label="Version", |
| value="1.0.0", |
| interactive=False, |
| ) |
| with gr.Column(): |
| gr.Markdown("### Health Metrics") |
| db_status = gr.Textbox(label="Database", value="Healthy", interactive=False) |
| redis_status = gr.Textbox(label="Redis", value="Healthy", interactive=False) |
| |
| gr.Markdown("### Deployment Checklist") |
| checklist = gr.Dataframe( |
| headers=["Check", "Status", "Notes"], |
| label="Go/No-Go Checklist", |
| interactive=False, |
| ) |
| |
| gr.Markdown("### Recent Deployments") |
| deploy_history = gr.Dataframe( |
| headers=["Date", "Version", "Status", "Duration"], |
| label="Deployment History", |
| interactive=False, |
| ) |
| |
| with gr.Accordion("Production Metrics", open=False): |
| gr.Markdown(""" |
| - **P99 Latency**: < 200ms |
| - **Throughput**: > 1000 req/s |
| - **Error Rate**: < 0.1% |
| """) |
| |
| |
| |
| |
| |
| gr.Markdown(""" |
| --- |
| |
| **AegisLM** - Multi-Agent Adversarial LLM Evaluation Framework |
| |
| **12-Week Implementation**: Complete governance-grade AI evaluation system |
| |
| Version 1.0.0 |
| """) |
| |
| |
| |
| |
| |
def load_runs() -> List[str]:
    """
    Collect the ``id`` of every run reported by the dashboard's data loader.

    Returns:
        The run IDs in loader order, or an empty list when anything goes
        wrong (the error is logged, not raised).
    """
    run_ids: List[str] = []
    try:
        for entry in dashboard._data_loader.get_all_runs():
            run_ids.append(entry["id"])
    except Exception as e:
        # Best effort: the UI should keep working with an empty dropdown.
        logger.error(f"Error loading runs: {e}")
        return []
    return run_ids
| |
def load_model_names() -> List[str]:
    """
    Build the sorted, de-duplicated label list for the comparison dropdown.

    A run is labelled with its ``model_name``; when that is missing or empty
    the run's ``id`` is used instead. Runs that yield no label are skipped.

    Returns:
        Sorted unique labels, or an empty list when loading fails (the error
        is logged, not raised).
    """
    try:
        labels = {
            label
            for run in dashboard._data_loader.get_all_runs()
            if (label := run.get("model_name") or run.get("id", ""))
        }
        return sorted(labels)
    except Exception as e:
        logger.error(f"Error loading model names: {e}")
        return []
| |
def on_run_select(run_id):
    """
    Refresh the Evaluation Runs tab for the selected run.

    Args:
        run_id: Value of the run dropdown. It may arrive as a list, in which
            case only the first entry is used.

    Returns:
        An 8-tuple matching the ``run_dropdown.change`` outputs: radar
        figure, composite score, vulnerability index, sample count, heatmap
        figure, attack selector value, attack breakdown rows and metric
        summary rows. Placeholder values are returned when nothing is
        selected.
    """
    # Normalise list-valued input down to a single id (or None when empty).
    if isinstance(run_id, list):
        run_id = run_id[0] if run_id else None

    if not run_id:
        return (
            create_empty_radar(),
            0.0, 0.0, 0,
            create_empty_heatmap(),
            [],
            [["N/A", "0", "0.000", "0.000", "0.000", "0.000", "0.000", "0.000"]],
            [["N/A", "N/A", "N/A", "N/A", "N/A", "0"]],
        )

    log_dashboard_event("DASHBOARD_VIEW_RUN", run_id=run_id)

    loaded = dashboard.load_run_data(run_id)
    summary = loaded["run_summary"]
    radar_source = loaded["radar_data"]
    heatmap_source = loaded["heatmap_data"]
    breakdown = loaded.get("attack_breakdown")

    # A heatmap view is only logged when there is heatmap data to show.
    if heatmap_source:
        log_dashboard_event("DASHBOARD_VIEW_HEATMAP", run_id=run_id)

    radar_fig = update_radar_chart(radar_source)
    heatmap_fig = update_heatmap_chart(heatmap_source)

    stat_values = update_stat_display(summary)

    selector_value = update_attack_selector(breakdown)
    breakdown_rows = update_attack_breakdown_table(breakdown)

    # Only the first element of the metrics panel result is displayed here.
    metric_rows, _ = update_metrics_panel(summary)

    return (
        radar_fig,
        stat_values[0], stat_values[1], stat_values[2],
        heatmap_fig,
        selector_value,
        breakdown_rows,
        metric_rows,
    )
| |
def on_refresh():
    """
    Handle the run refresh button.

    This handler's only output is the run dropdown. Returning a bare list
    would be taken as the dropdown's *value* and would leave its ``choices``
    empty, so a component update is returned instead.

    Returns:
        A ``gr.update`` that fills the dropdown choices with the current run
        IDs and selects the first one (``None`` when there are no runs).
    """
    run_ids = load_runs()
    # Selecting the first run keeps the previous visible effect of a refresh:
    # the selection handler already reduced a list value to its first entry.
    return gr.update(
        choices=run_ids,
        value=run_ids[0] if run_ids else None,
    )
| |
def on_compare(model_names: List[str]):
    """
    Handle the compare button.

    Translates the selected dropdown labels into run IDs and builds the
    comparison table and delta chart for them.

    Args:
        model_names: Labels picked in the comparison dropdown. A label is a
            run's ``model_name`` or, when that is missing, the run's ``id``
            (the same rule ``load_model_names`` uses to build the list).

    Returns:
        ``(table_rows, delta_figure)``. A single "N/A" row and an empty
        delta chart are returned when fewer than two labels are given or
        fewer than two of them resolve to a run.
    """
    if not model_names or len(model_names) < 2:
        return (
            [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]],
            create_empty_delta(),
        )

    # Query the loader once and index runs by the label shown in the dropdown.
    # Labelling by model_name-or-id makes entries listed under their run id
    # resolvable; matching on model_name alone silently dropped them.
    first_run_by_label = {}
    for run in dashboard._data_loader.get_all_runs():
        label = run.get("model_name") or run.get("id", "")
        if label:
            # setdefault keeps the first run per label.
            first_run_by_label.setdefault(label, run)

    run_ids = [
        first_run_by_label[label]["id"]
        for label in model_names
        if label in first_run_by_label
    ]

    if len(run_ids) < 2:
        return (
            [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]],
            create_empty_delta(),
        )

    log_dashboard_event("DASHBOARD_COMPARE_MODELS", extra={"count": len(run_ids)})

    comparison_data, delta_data = dashboard.get_comparison_data(run_ids)

    table_data = update_comparison_table(comparison_data)
    delta_fig = update_delta_chart(delta_data)

    return table_data, delta_fig
|
|
def on_refresh_models():
    """
    Handle the refresh models button.

    Returns a component update rather than a bare list: a bare list sent to
    a dropdown is taken as its *value* and leaves its ``choices`` empty.

    NOTE(review): the click wiring for this handler is outside the visible
    part of the file; this assumes its sole output is the model comparison
    dropdown, mirroring ``on_refresh`` — confirm.

    Returns:
        A ``gr.update`` that sets the available model labels as choices and
        keeps all of them selected, which is what a refresh showed before.
    """
    labels = load_model_names()
    return gr.update(choices=labels, value=labels)
| |
def on_export(run_id, format: str, include_config: bool):
    """
    Produce the export payload for the selected run.

    Args:
        run_id: Selected run; a list is reduced to its first entry.
        format: Export format passed through to ``handle_export``.
        include_config: Passed through to ``handle_export``.

    Returns:
        An error dictionary when no run is selected or the loader returns no
        summary. When ``handle_export`` yields a tuple it is unpacked as
        ``(output, filename)`` and wrapped in a dictionary (with an extra
        ``"format"`` key for CSV). Any other result is returned unchanged.
    """
    selected = run_id
    if isinstance(selected, list):
        selected = selected[0] if selected else None

    if not selected:
        return {"error": "No run selected", "content": ""}

    summary = dashboard._data_loader.get_run_summary(selected)
    if summary is None:
        return {"error": "Run not found", "content": ""}

    exported = handle_export(summary, format, include_config)
    if not isinstance(exported, tuple):
        return exported

    payload, filename = exported
    if format != "csv":
        return {"content": payload, "filename": filename}

    # CSV content falls back to an empty string when nothing was produced.
    return {
        "format": "csv",
        "filename": filename,
        "content": payload or "",
    }
|
|
def load_benchmarks() -> List[str]:
    """
    Collect the ``benchmark_id`` of every benchmark the data loader lists.

    Returns:
        The benchmark IDs in loader order, or an empty list when anything
        goes wrong (the error is logged, not raised).
    """
    benchmark_ids: List[str] = []
    try:
        for benchmark in dashboard._data_loader.list_benchmarks():
            benchmark_ids.append(benchmark.benchmark_id)
    except Exception as e:
        logger.error(f"Error loading benchmarks: {e}")
        return []
    return benchmark_ids
|
|
def on_benchmark_select(benchmark_id):
    """
    Handle benchmark selection.

    Args:
        benchmark_id: Value of the benchmark dropdown. It may arrive as a
            list, in which case only the first entry is used.

    Returns:
        A 12-tuple: delta chart, stability chart, ranking rows, then the
        statistics in the order mean_baseline, mean_adversarial, mean_delta,
        std_baseline, best_model, most_stable, mean_rsi, most_vulnerable,
        total_models. When nothing is selected, or the loader returns
        ``None`` for the comparison or the stats, a placeholder tuple with
        the same layout is returned.
    """

    def _placeholder(message):
        # Every return path feeds the same outputs, so the placeholder must
        # follow the success tuple slot for slot. Numeric slots get numbers
        # and name slots get "N/A" (assumes the four std/mean fields and
        # total_models are numeric, as their "Mean"/"Std"/"Total" labels
        # suggest — confirm against BenchmarkStats).
        return (
            create_empty_delta_chart(message),
            create_empty_stability_chart(message),
            [["N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "N/A", "0"]],
            0.0,    # mean_baseline
            0.0,    # mean_adversarial
            0.0,    # mean_delta
            0.0,    # std_baseline
            "N/A",  # best_model
            "N/A",  # most_stable
            0.0,    # mean_rsi
            "N/A",  # most_vulnerable
            0,      # total_models
        )

    # Normalise list-valued input down to a single id (or None when empty).
    if isinstance(benchmark_id, list):
        benchmark_id = benchmark_id[0] if benchmark_id else None

    if not benchmark_id:
        return _placeholder("Select a benchmark")

    log_dashboard_event("DASHBOARD_VIEW_BENCHMARK", extra={"benchmark_id": benchmark_id})

    loader = dashboard._data_loader
    comparison = loader.get_benchmark_comparison(benchmark_id)
    stats = loader.get_benchmark_stats(benchmark_id)

    if comparison is None or stats is None:
        return _placeholder("Benchmark not found")

    delta_fig = update_delta_bar_chart(comparison)
    stability_fig = update_stability_scatter(comparison)
    ranking_data = update_ranking_table(comparison)

    return (
        delta_fig,
        stability_fig,
        ranking_data,
        stats.mean_baseline,
        stats.mean_adversarial,
        stats.mean_delta,
        stats.std_baseline,
        stats.best_model,
        stats.most_stable,
        stats.mean_rsi,
        stats.most_vulnerable,
        stats.total_models,
    )
|
|
def on_refresh_benchmarks():
    """
    Handle the refresh benchmarks button.

    Returns a component update rather than a bare list: a bare list sent to
    a dropdown is taken as its *value* and leaves its ``choices`` empty.

    NOTE(review): the click wiring for this handler is outside the visible
    part of the file; this assumes its sole output is the benchmark
    dropdown, mirroring ``on_refresh`` — confirm.

    Returns:
        A ``gr.update`` that fills the dropdown choices with the available
        benchmark IDs and selects the first one (``None`` when there are
        none).
    """
    benchmark_ids = load_benchmarks()
    return gr.update(
        choices=benchmark_ids,
        value=benchmark_ids[0] if benchmark_ids else None,
    )
|
|
def on_refresh_monitoring():
    """
    Handle the monitoring refresh button.

    Pulls trends, active alerts, drift status and monitoring configuration
    from the data loader and converts them into display values.

    Returns:
        A 20-tuple, identical in layout on the success and the error path:

        * samples processed
        * rolling hallucination, toxicity, bias, confidence, robustness
        * robustness, hallucination, toxicity and confidence trend figures
        * hallucination, toxicity, bias, confidence and robustness drift text
        * critical, high, medium and low alert counts
        * rows for the recent-alerts table

        NOTE(review): the click wiring is outside the visible part of the
        file. The layout follows the Monitoring tab components in declaration
        order, which has four severity counters (Critical/High/Medium/Low)
        and no "total" counter — confirm against the ``outputs`` list.
    """

    def _drift_text(entry, prefix):
        # "<prefix>: <magnitude>" when drift is flagged, otherwise "No drift".
        if entry.get("is_drift"):
            return f"{prefix}: {entry.get('magnitude', 0.0):.3f}"
        return "No drift"

    try:
        log_dashboard_event("DASHBOARD_REFRESH_MONITORING")

        loader = dashboard._data_loader
        trends_data = loader.get_monitoring_trends(window_size=50)
        alerts_data = loader.get_active_alerts()
        drift_data = loader.get_drift_status()
        config_data = loader.get_monitoring_config()

        timestamps = trends_data.get("timestamps", [])

        rolling_h = trends_data.get("rolling_hallucination", 0.0)
        rolling_t = trends_data.get("rolling_toxicity", 0.0)
        rolling_b = trends_data.get("rolling_bias", 0.0)
        rolling_c = trends_data.get("rolling_confidence", 0.0)
        rolling_r = trends_data.get("rolling_robustness", 0.0)

        # The sample counter is optional: the monitoring backend may be
        # missing or not running. Fall back to 0, but leave a trace in the
        # log instead of swallowing the failure silently.
        processed = 0
        try:
            from backend.monitoring.pipeline import get_monitoring_pipeline
            processed = get_monitoring_pipeline().get_sample_count()
        except Exception:
            logger.debug(
                "Monitoring pipeline unavailable; sample count defaults to 0",
                exc_info=True,
            )

        robustness_fig = create_robustness_trend_chart(
            trends_data.get("robustness", []),
            timestamps,
            baseline=config_data.get("robustness_threshold", 0.75),
        )
        hallucination_fig = create_hallucination_trend_chart(
            trends_data.get("hallucination", []),
            timestamps,
            threshold=config_data.get("hallucination_threshold", 0.08),
        )
        toxicity_fig = create_toxicity_trend_chart(
            trends_data.get("toxicity", []),
            timestamps,
            threshold=config_data.get("toxicity_threshold", 0.05),
        )
        confidence_fig = create_confidence_trend_chart(
            trends_data.get("confidence", []),
            timestamps,
            min_confidence=config_data.get("confidence_threshold", 0.15),
        )

        h_drift_status = _drift_text(drift_data.get("hallucination", {}), "Drift")
        t_drift_status = _drift_text(drift_data.get("toxicity", {}), "Drift")
        b_drift_status = _drift_text(drift_data.get("bias", {}), "Drift")
        c_drift_status = _drift_text(drift_data.get("confidence", {}), "Collapse")
        r_drift_status = _drift_text(drift_data.get("robustness", {}), "Collapse")

        alerts = alerts_data.get("alerts", [])

        # One pass over the alerts; severities outside the four known levels
        # are not counted.
        severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}
        for alert in alerts:
            severity = alert.get("severity")
            if severity in severity_counts:
                severity_counts[severity] += 1

        alert_rows = format_alert_for_table(alerts)

        # The table needs a list of rows: substitute a placeholder row when
        # there is nothing to show, and wrap a single flat row in a list.
        if not alert_rows or not isinstance(alert_rows, list):
            alert_rows = [["No alerts", "-", "-", "-", "-", "-", "-"]]
        elif not isinstance(alert_rows[0], list):
            alert_rows = [alert_rows]

        return (
            processed,
            rolling_h,
            rolling_t,
            rolling_b,
            rolling_c,
            rolling_r,
            robustness_fig,
            hallucination_fig,
            toxicity_fig,
            confidence_fig,
            h_drift_status,
            t_drift_status,
            b_drift_status,
            c_drift_status,
            r_drift_status,
            # One count per severity counter, so each lands in its own slot
            # (previously the total sat in the first slot and the low count
            # was never returned).
            severity_counts["critical"],
            severity_counts["high"],
            severity_counts["medium"],
            severity_counts["low"],
            alert_rows,
        )

    except Exception as e:
        logger.exception("Error refreshing monitoring: %s", e)

        # Same 20-slot layout as the success path (the previous fallback
        # carried one value too many).
        return (
            0, 0.0, 0.0, 0.0, 0.0, 0.0,
            create_empty_trend_chart("Robustness", "Error loading"),
            create_empty_trend_chart("Hallucination", "Error loading"),
            create_empty_trend_chart("Toxicity", "Error loading"),
            create_empty_trend_chart("Confidence", "Error loading"),
            "Error", "Error", "Error", "Error", "Error",
            0, 0, 0, 0,
            [["Error", "-", "-", "-", "-", "-", "-"]],
        )
| |
| |
| run_dropdown.change( |
| fn=on_run_select, |
| inputs=[run_dropdown], |
| outputs=[ |
| radar_plot, |
| composite_score, vulnerability_index, sample_count, |
| heatmap_plot, |
| attack_selector, |
| attack_breakdown_table, |
| metrics_table, |
| ], |
| ) |
|
|
| refresh_btn.click( |
| fn=on_refresh, |
| inputs=[], |
| outputs=[run_dropdown], |
| ) |
|
|
| compare_btn.click( |
| fn=on_compare, |
| inputs=[model_multiselect], |
| outputs=[comparison_table, delta_plot], |
| ) |
|
|
| refresh_models_btn.click( |
| fn=on_refresh_models, |
| inputs=[], |
| outputs=[model_multiselect], |
| ) |
|
|
| export_btn.click( |
| fn=on_export, |
| inputs=[run_dropdown, export_format, include_config], |
| outputs=[export_output], |
| ) |
|
|
| |
| benchmark_dropdown.change( |
| fn=on_benchmark_select, |
| inputs=[benchmark_dropdown], |
| outputs=[ |
| delta_plot, |
| stability_plot, |
| ranking_table, |
| mean_baseline, |
| mean_adversarial, |
| mean_delta, |
| std_baseline, |
| best_model, |
| most_stable, |
| mean_rsi, |
| most_vulnerable, |
| total_models, |
| ], |
| ) |
|
|
| refresh_benchmarks_btn.click( |
| fn=on_refresh_benchmarks, |
| inputs=[], |
| outputs=[benchmark_dropdown], |
| ) |
|
|
| |
| refresh_monitoring_btn.click( |
| fn=on_refresh_monitoring, |
| inputs=[], |
| outputs=[ |
| samples_processed, |
| rolling_hallucination, |
| rolling_toxicity, |
| rolling_bias, |
| rolling_confidence, |
| rolling_robustness, |
| robustness_trend_plot, |
| hallucination_trend_plot, |
| toxicity_trend_plot, |
| confidence_trend_plot, |
| hallucination_drift, |
| toxicity_drift, |
| bias_drift, |
| confidence_drift, |
| robustness_drift, |
| alert_critical, |
| alert_high, |
| alert_medium, |
| alert_low, |
| alerts_table, |
| ], |
| ) |
|
|
| |
def load_demo_data():
    """Build the initial values for every component filled at page load.

    Returns a flat tuple whose element order has to line up one-to-one with
    the ``outputs`` list handed to ``app.load``: the three selector values,
    then everything ``on_refresh_monitoring()`` returns, then the hard-coded
    sample rows and figures for the remaining tabs.
    """
    # Live monitoring values are fetched before anything else is assembled.
    monitoring_values = on_refresh_monitoring()

    # Rows for the audit_log and threshold_history outputs.
    audit_log_rows = [
        ["2024-01-15 10:30:00", "threshold_update", "admin", "Updated hallucination threshold to 0.15", "abc123"],
        ["2024-01-16 14:20:00", "policy_change", "admin", "Activated new safety policy v1.1", "def456"],
        ["2024-01-17 09:15:00", "audit_review", "auditor", "Quarterly compliance audit completed", "ghi789"],
    ]
    threshold_change_rows = [
        ["2024-01-15", "0.12", "0.15", "admin"],
        ["2024-01-20", "0.15", "0.18", "admin"],
        ["2024-02-01", "0.18", "0.16", "admin"],
    ]

    # Rows for the criteria_table and cert_history outputs.
    certification_criteria_rows = [
        ["GSS-1", "0.85", "0.15", "Basic certification"],
        ["GSS-2", "0.90", "0.10", "Advanced certification"],
        ["GSS-3", "0.95", "0.05", "Expert certification"],
    ]
    certification_history_rows = [
        ["2024-01-15", "gpt-4", "0.87", "GSS-1", "2025-01-15"],
        ["2024-01-20", "claude-3", "0.91", "GSS-2", "2025-01-20"],
        ["2024-02-01", "llama-2", "0.82", "GSS-1", "2025-02-01"],
    ]

    # Rows for the plan_breakdown output.
    pricing_plan_rows = [
        ["Free", "10", "$0/month", "Basic evaluation"],
        ["Basic", "50", "$99/month", "Advanced metrics"],
        ["Pro", "200", "$299/month", "Full governance"],
        ["Enterprise", "Unlimited", "$999/month", "Custom deployment"],
    ]

    # Rows for the control_table and eu_table outputs.
    control_rows = [
        ["CC1.1", "Access Control", "Pass", "User authentication implemented"],
        ["CC2.2", "Encryption", "Pass", "AES-256 encryption at rest"],
        ["CC3.3", "Logging", "Pass", "Structured JSON logging"],
        ["CC4.4", "Monitoring", "Pass", "Real-time drift detection"],
    ]
    eu_article_rows = [
        ["Article 10", "Data Governance", "Compliant", "Data quality monitoring"],
        ["Article 14", "Transparency", "Compliant", "Model cards provided"],
        ["Article 15", "Human Oversight", "Compliant", "Human-in-the-loop"],
    ]

    # Rows for the authority_table and plugin_table outputs.
    authority_rows = [
        ["US NIST", "North America", "45", "Active"],
        ["EU ENISA", "Europe", "32", "Active"],
        ["UK NCSC", "Europe", "28", "Active"],
    ]
    plugin_rows = [
        ["Toxicity Detector", "AegisLM", "Safety", "4.8", "1250"],
        ["Bias Analyzer", "AegisLM", "Fairness", "4.6", "980"],
        ["Hallucination Checker", "AegisLM", "Accuracy", "4.9", "2100"],
    ]

    # Rows for the checklist and deploy_history outputs.
    readiness_rows = [
        ["Database", "Pass", "PostgreSQL configured"],
        ["Redis", "Pass", "Rate limiting active"],
        ["Security", "Pass", "All controls implemented"],
        ["Monitoring", "Pass", "Dashboards operational"],
    ]
    deployment_rows = [
        ["2024-01-15", "v1.0.0", "Success", "2m 30s"],
        ["2024-01-20", "v1.0.1", "Success", "1m 45s"],
        ["2024-02-01", "v1.1.0", "Success", "3m 15s"],
    ]

    # run_dropdown, model_multiselect, benchmark_dropdown.
    selector_values = (load_runs(), load_model_names(), load_benchmarks())

    # tenant_count, active_users, mrr, arr, plan_breakdown,
    # api_calls, eval_runs, data_processed.
    business_values = (15, 120, 4500, 18000, pricing_plan_rows, 1250, 45, 2.3)

    # Concatenate the groups in exactly the order the outputs are declared.
    return (
        selector_values
        + tuple(monitoring_values)
        + (audit_log_rows, threshold_change_rows)
        + (certification_criteria_rows, certification_history_rows)
        + business_values
        + (control_rows, eu_article_rows)
        + (authority_rows, plugin_rows)
        + (readiness_rows, deployment_rows)
    )
|
|
| |
| app.load( |
| fn=load_demo_data, |
| inputs=[], |
| outputs=[ |
| run_dropdown, model_multiselect, benchmark_dropdown, |
| |
| samples_processed, rolling_hallucination, rolling_toxicity, rolling_bias, |
| rolling_confidence, rolling_robustness, robustness_trend_plot, |
| hallucination_trend_plot, toxicity_trend_plot, confidence_trend_plot, |
| hallucination_drift, toxicity_drift, bias_drift, confidence_drift, |
| robustness_drift, alert_critical, alert_high, alert_medium, alert_low, alerts_table, |
| |
| audit_log, threshold_history, |
| |
| criteria_table, cert_history, |
| |
| tenant_count, active_users, mrr, arr, plan_breakdown, api_calls, eval_runs, data_processed, |
| |
| control_table, eu_table, |
| |
| authority_table, plugin_table, |
| |
| checklist, deploy_history, |
| ], |
| ) |
| |
| return app |
|
|
|
|
| |
| |
| |
|
|
|
|
def create_empty_radar():
    """Return a trace-less polar figure used before a run is selected.

    The figure carries only layout: a prompt title, a visible radial axis
    spanning 0 to 1, and a fixed 400x400 size.
    """
    import plotly.graph_objects as go

    layout_options = {
        "title": "Select a run to view radar chart",
        "polar": {
            "radialaxis": {"visible": True, "range": [0, 1]},
        },
        "height": 400,
        "width": 400,
    }

    placeholder = go.Figure()
    placeholder.update_layout(**layout_options)
    return placeholder
|
|
|
|
def create_empty_heatmap():
    """Return a trace-less figure used before a run is selected.

    Only layout is set: a prompt title, the "Metrics" / "Attack Types" axis
    titles, and a fixed 600x400 size.
    """
    import plotly.graph_objects as go

    layout_options = {
        "title": "Select a run to view vulnerability heatmap",
        "xaxis": {"title": "Metrics"},
        "yaxis": {"title": "Attack Types"},
        "height": 400,
        "width": 600,
    }

    placeholder = go.Figure()
    placeholder.update_layout(**layout_options)
    return placeholder
|
|
|
|
def create_empty_delta():
    """Return a trace-less figure used before models are chosen to compare.

    Only layout is set: a prompt title, the "Model" / "Delta Robustness"
    axis titles, and a fixed 600x400 size.
    """
    import plotly.graph_objects as go

    layout_options = {
        "title": "Select models to compare",
        "xaxis": {"title": "Model"},
        "yaxis": {"title": "Delta Robustness"},
        "height": 400,
        "width": 600,
    }

    placeholder = go.Figure()
    placeholder.update_layout(**layout_options)
    return placeholder
|
|
|
|
| |
| |
| |
|
|
|
|
def main():
    """Parse command-line options and launch the dashboard server.

    Command-line options:
        --host:  interface to bind to (default ``0.0.0.0``).
        --port:  TCP port to bind to (default ``7860``).
        --demo:  forwarded to ``create_dashboard`` as ``demo_mode``.
        --debug: switches logging to DEBUG and enables Gradio's debug mode.
        --share: opt in to a public Gradio share link (off by default).
    """
    import argparse

    parser = argparse.ArgumentParser(description="AegisLM Dashboard")
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind to",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port to bind to",
    )
    parser.add_argument(
        "--demo",
        action="store_true",
        help="Enable demo mode with sample data",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode",
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public Gradio share link",
    )

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    app = create_dashboard(demo_mode=args.demo)

    # Previously --debug was passed as share=, so asking for verbose logging
    # also published the dashboard through a public share link. Sharing is
    # now a separate, explicit opt-in and --debug only affects debugging.
    app.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        debug=args.debug,
    )


if __name__ == "__main__":
    main()
|
|