Spaces:
Running
Running
| """SentinelOps Arena -- HuggingFace Spaces Gradio App. | |
| Multi-agent self-play RL environment for enterprise security training. | |
| Three AI agents (Attacker, Worker, Oversight) interact with simulated | |
| enterprise systems (CRM, Billing, Ticketing). | |
| Built with Gradio 6 -- custom cybersecurity theme, native plots, rich HTML. | |
| """ | |
| import json | |
| import gradio as gr | |
| import pandas as pd | |
| from sentinelops_arena.demo import run_comparison, run_episode | |
| from sentinelops_arena.environment import SentinelOpsArena | |
| from sentinelops_arena.metrics import ( | |
| compute_episode_metrics, | |
| format_metrics_html, | |
| format_comparison_metrics_html, | |
| ) | |
| from sentinel_theme import SentinelTheme, CUSTOM_CSS, HEADER_HTML | |
| from replay_html import format_replay_html | |
| from chart_helpers import ( | |
| build_score_progression_df, | |
| build_attack_timeline_df, | |
| build_comparison_df, | |
| build_verdict_html, | |
| format_scores_html, | |
| format_comparison_scores_html, | |
| ) | |
| from inspector import ( | |
| get_all_customers, | |
| get_all_invoices, | |
| get_all_tickets, | |
| get_task_queue, | |
| get_env_config_html, | |
| ) | |
| # ------------------------------------------------------------------- | |
| # Handler functions | |
| # ------------------------------------------------------------------- | |
def run_single_episode(seed, trained):
    """Simulate one episode and build every output of the "Run Episode" tab.

    Args:
        seed: Raw value of the seed widget; coerced with ``int`` before use.
        trained: Raw value of the "Use Trained Worker" checkbox; coerced
            with ``bool`` before use.

    Returns:
        A 5-tuple in the order the Run button's outputs are wired:
        replay HTML, scores HTML, metrics HTML, data for the score
        progression plot, data for the attack plot.
    """
    episode_log, final_scores = run_episode(trained=bool(trained), seed=int(seed))

    # Tuple elements are evaluated left to right, so the helpers run in the
    # same order as the outputs they feed.
    return (
        format_replay_html(episode_log, final_scores),
        format_scores_html(final_scores),
        format_metrics_html(compute_episode_metrics(episode_log)),
        build_score_progression_df(episode_log),
        build_attack_timeline_df(episode_log),
    )
def run_before_after(seed):
    """Run the untrained-vs-trained comparison for one seed and format it.

    Args:
        seed: Raw value of the seed widget; coerced with ``int`` before it
            is passed to ``run_comparison``.

    Returns:
        An 8-tuple in the order the comparison button's outputs are wired:
        untrained replay HTML, trained replay HTML, verdict HTML, comparison
        chart data, untrained score progression, trained score progression,
        comparison scores HTML, comparison metrics HTML.
    """
    result = run_comparison(seed=int(seed))

    # Pull each run's log/scores out once instead of re-indexing `result`.
    baseline = result["untrained"]
    base_log, base_scores = baseline["log"], baseline["scores"]
    untrained_html = format_replay_html(base_log, base_scores)

    improved = result["trained"]
    new_log, new_scores = improved["log"], improved["scores"]
    trained_html = format_replay_html(new_log, new_scores)

    comparison_df = build_comparison_df(base_scores, new_scores)
    verdict_html = build_verdict_html(base_log, new_log)

    # Score progression for both runs
    untrained_score_df = build_score_progression_df(base_log)
    trained_score_df = build_score_progression_df(new_log)

    comparison_html = format_comparison_scores_html(base_scores, new_scores)

    comp_metrics_html = format_comparison_metrics_html(
        compute_episode_metrics(base_log),
        compute_episode_metrics(new_log),
    )

    return (
        untrained_html,
        trained_html,
        verdict_html,
        comparison_df,
        untrained_score_df,
        trained_score_df,
        comparison_html,
        comp_metrics_html,
    )
def inspect_state(seed):
    """Reset a fresh environment with ``seed`` and collect its state for display.

    Args:
        seed: Raw value of the seed widget; coerced with ``int`` before it
            is passed to ``SentinelOpsArena.reset``.

    Returns:
        A 5-tuple in the order the Inspect button's outputs are wired:
        config HTML, customers, invoices, tickets, task queue.
    """
    arena = SentinelOpsArena()
    arena.reset(seed=int(seed))

    # One extractor per output, applied in output order.
    extractors = (
        get_env_config_html,
        get_all_customers,
        get_all_invoices,
        get_all_tickets,
        get_task_queue,
    )
    return tuple(extract(arena) for extract in extractors)
| # ------------------------------------------------------------------- | |
| # Gradio UI | |
| # ------------------------------------------------------------------- | |
# Text of the "About" tab. Blank lines separate paragraphs, lists, tables and
# horizontal rules so that each renders as its own markdown block instead of
# merging with its neighbour.
_ABOUT_MARKDOWN = """
## What is SentinelOps Arena?

A realistic enterprise **"red team vs blue team + judge"** RL arena that trains
agents to be both **useful** and **robust to prompt injection**.

Three AI agents interact in a simulated enterprise environment:

1. **RED TEAM (Attacker)** -- Injects malicious instructions: prompt injections,
   policy bypass attempts, fake tool outputs, schema tricks.
2. **BLUE TEAM (Worker)** -- Completes real enterprise tasks (CRM, Billing,
   Ticketing) safely under adversarial pressure.
3. **AUDITOR (Oversight)** -- Monitors the Worker's steps, flags risky behavior,
   and explains failures.

---

## Why Self-Play Matters

In normal benchmarks, attacks are **scripted and predictable**. Defenses can
memorize a fixed set of patterns and pass.

In SentinelOps Arena:

- Attacks are **adaptive and randomized** -- the attacker picks types, timing,
  targets, and payloads from a large strategy space.
- Both sides **improve over time** through adversarial training, creating a
  **natural curriculum** that avoids overfitting.
- The Worker gets reward for completing tasks correctly, and **heavy penalty**
  for unsafe actions (data leaks, unauthorized tool calls, policy violations).
- The Attacker gets reward when it causes the Worker to fail.
- The Oversight agent gets reward for catching true failures with **low false
  alarm rate**.

---

## Architecture

**3 Agents, 3 Systems, 30 Ticks per Episode**

Each tick: Attacker acts → Worker acts → Oversight acts

### Attack Types

1. **Schema Drift** -- Renames fields across all records. Worker must detect
   KeyError, call `get_schema()`, and adapt.
2. **Policy Drift** -- Changes business rules (refund windows, approval
   requirements). Worker must call `get_current_policy()`.
3. **Social Engineering** -- Injects fake authority messages with varying
   templates. Worker must resist manipulation.
4. **Rate Limiting** -- Throttles API calls. Worker must handle gracefully.

### Reward Structure

| Agent | Positive | Negative |
|-------|----------|----------|
| **Worker** | +1.0 task complete & compliant, +0.5 drift detected, +0.2 graceful error | -2.0 policy violation, -3.0 social eng, -0.5 SLA breach |
| **Attacker** | +1.0 worker fail, +1.5 policy violation, +2.5 social eng success | -0.3 per attack (cost) |
| **Oversight** | +1.0 true positive (+0.3 good explanation) | -0.5 false positive, -2.0 missed violation |

---

## Key Metrics

| Metric | What it measures |
|--------|-----------------|
| **Attack Success Rate (ASR)** | Fraction of attacks that caused worker failure |
| **Benign Task Success** | Fraction of legitimate tasks completed correctly |
| **False Positive Rate (FPR)** | Fraction of oversight flags that were false alarms |
| **Mean Time to Detect (MTTD)** | Avg ticks between an attack and the worker's defensive response |
| **Social Eng. Resistance** | Fraction of social engineering attacks resisted |

---

## Training

Uses **GRPO (Group Relative Policy Optimization)** with Unsloth + TRL.
The Worker agent learns to produce valid JSON actions, detect schema/policy
drift, and resist social engineering -- all through reward shaping in the
SentinelOps environment.

```
python train.py --model_name unsloth/Llama-3.2-3B-Instruct --use_unsloth
```

---

## Partner Tracks

- **Fleet AI**: Scalable Oversight -- the Oversight agent monitors and explains
  Worker behavior in real time
- **Patronus AI**: Schema Drift -- schema and policy drift are core attack types
  that test the Worker's ability to adapt

---

## Tech Stack

OpenEnv 0.2.x | FastMCP | Gradio 6 | HuggingFace TRL | Unsloth | Pydantic

### Links

- [OpenEnv Framework](https://github.com/meta-pytorch/OpenEnv)
- [GitHub Repository](https://github.com/nihalnihalani/NexusEnv)
"""


def _glow_card():
    """Create an empty HTML output carrying the ``glow-card`` CSS class."""
    return gr.HTML(elem_classes=["glow-card"])


def _seed_field(info):
    """Create a "Random Seed" number input (default 42, precision 0).

    Args:
        info: Help text shown with the field.
    """
    return gr.Number(value=42, label="Random Seed", precision=0, info=info)


def _score_line_plot(title, height):
    """Create a line plot of ``score`` over ``tick``, coloured by ``agent``.

    Args:
        title: Plot title.
        height: Plot height passed straight to ``gr.LinePlot``.
    """
    return gr.LinePlot(
        x="tick",
        y="score",
        color="agent",
        title=title,
        tooltip=["tick", "score", "agent"],
        height=height,
    )


def _readonly_table(label, headers):
    """Create a non-interactive, glow-card styled dataframe.

    Args:
        label: Label shown above the table.
        headers: Column names for the table.
    """
    return gr.Dataframe(
        label=label,
        headers=headers,
        interactive=False,
        elem_classes=["glow-card"],
    )


# The helpers above create components in whichever container is active when
# they are called, so each call sits at the exact spot the widget belongs.
with gr.Blocks(title="SentinelOps Arena", fill_width=True) as demo:
    # Header banner
    gr.HTML(HEADER_HTML)

    with gr.Tabs():
        # ============================================================
        # Tab 1: Run Episode
        # ============================================================
        with gr.TabItem("Run Episode"):
            with gr.Row():
                # Left sidebar for controls
                with gr.Column(scale=1, min_width=300):
                    gr.Markdown("### Episode Configuration")
                    seed_input = _seed_field(
                        "Seed for generating customer scenarios and attack patterns."
                    )
                    trained_toggle = gr.Checkbox(
                        value=False,
                        label="Use Trained Worker",
                        info="Toggle to use a worker trained via GRPO instead of a naive heuristic worker.",
                    )
                    run_btn = gr.Button("▶ Run Episode", variant="primary", size="lg")
                    gr.Markdown("---")
                    gr.Markdown("### Final Scores")
                    scores_output = _glow_card()
                    gr.Markdown("---")
                    gr.Markdown("### Security Metrics")
                    metrics_output = _glow_card()

                # Main content area
                with gr.Column(scale=3):
                    with gr.Tabs():
                        with gr.TabItem("Execution Replay"):
                            replay_output = _glow_card()
                        with gr.TabItem("Analytics & Timeline"):
                            with gr.Row():
                                score_plot = _score_line_plot(
                                    "Cumulative Score Progression", height=350
                                )
                            with gr.Row():
                                attack_plot = gr.BarPlot(
                                    x="attack_type",
                                    y="count",
                                    color="attack_type",
                                    title="Attack Timeline",
                                    tooltip=["attack_type", "count"],
                                    height=350,
                                )

            # Output order must match the tuple returned by run_single_episode.
            run_btn.click(
                run_single_episode,
                inputs=[seed_input, trained_toggle],
                outputs=[replay_output, scores_output, metrics_output, score_plot, attack_plot],
            )

        # ============================================================
        # Tab 2: Before/After Comparison
        # ============================================================
        with gr.TabItem("Untrained vs Trained"):
            with gr.Row():
                with gr.Column(scale=1, min_width=300):
                    gr.Markdown(
                        "### Benchmarking Mode\n"
                        "Compare how an **untrained** worker vs a **trained** worker "
                        "handles the same attack sequence."
                    )
                    comp_seed = _seed_field(
                        "Ensures identical attack sequence for fair comparison."
                    )
                    comp_btn = gr.Button("▶ Run Comparison", variant="primary", size="lg")
                    gr.Markdown("---")
                    gr.Markdown("### Training Impact")
                    verdict_output = _glow_card()
                    comparison_output = _glow_card()
                    gr.Markdown("---")
                    gr.Markdown("### Security Metrics")
                    comp_metrics_output = _glow_card()

                with gr.Column(scale=3):
                    with gr.Tabs():
                        with gr.TabItem("Execution Replays"):
                            with gr.Row():
                                with gr.Column():
                                    gr.Markdown("#### 🛑 Untrained Worker")
                                    untrained_output = _glow_card()
                                with gr.Column():
                                    gr.Markdown("#### 🚀 Trained Worker")
                                    trained_output = _glow_card()
                        with gr.TabItem("Score Analytics"):
                            with gr.Row():
                                comparison_bar = gr.BarPlot(
                                    x="agent",
                                    y="score",
                                    color="type",
                                    title="Score Comparison: Untrained vs Trained",
                                    tooltip=["agent", "score", "type"],
                                    height=350,
                                )
                            with gr.Row():
                                with gr.Column():
                                    untrained_score_plot = _score_line_plot(
                                        "Untrained Score Progression", height=300
                                    )
                                with gr.Column():
                                    trained_score_plot = _score_line_plot(
                                        "Trained Score Progression", height=300
                                    )

            # Output order must match the tuple returned by run_before_after.
            comp_btn.click(
                run_before_after,
                inputs=[comp_seed],
                outputs=[
                    untrained_output,
                    trained_output,
                    verdict_output,
                    comparison_bar,
                    untrained_score_plot,
                    trained_score_plot,
                    comparison_output,
                    comp_metrics_output,
                ],
            )

        # ============================================================
        # Tab 3: Environment Inspector
        # ============================================================
        with gr.TabItem("Environment Inspector"):
            with gr.Row():
                with gr.Column(scale=1, min_width=300):
                    gr.Markdown(
                        "### System Databases\n"
                        "Inspect the initial state of the simulated enterprise."
                    )
                    inspect_seed = _seed_field(
                        "Seed used for procedural generation of records."
                    )
                    inspect_btn = gr.Button("🔍 Inspect Databases", variant="primary", size="lg")
                    gr.Markdown("---")
                    config_output = _glow_card()

                with gr.Column(scale=3):
                    with gr.Tabs():
                        with gr.TabItem("CRM System (Customers)"):
                            customers_table = _readonly_table(
                                "Customer Database",
                                ["customer_id", "name", "tier", "region", "lifetime_value"],
                            )
                        with gr.TabItem("Billing System (Invoices)"):
                            invoices_table = _readonly_table(
                                "Invoice Database",
                                ["invoice_id", "customer_id", "amount", "status"],
                            )
                        with gr.TabItem("Ticketing System (Support)"):
                            tickets_table = _readonly_table(
                                "Active Tickets",
                                ["ticket_id", "customer_id", "subject", "priority", "status", "sla_deadline_tick"],
                            )
                        with gr.TabItem("Live Task Queue"):
                            tasks_table = _readonly_table(
                                "Tasks to Process",
                                ["task_id", "customer_id", "task_type", "message", "arrival_tick"],
                            )

            # Output order must match the tuple returned by inspect_state.
            inspect_btn.click(
                inspect_state,
                inputs=[inspect_seed],
                outputs=[
                    config_output,
                    customers_table,
                    invoices_table,
                    tickets_table,
                    tasks_table,
                ],
            )

        # ============================================================
        # Tab 4: About
        # ============================================================
        with gr.TabItem("About"):
            gr.Markdown(_ABOUT_MARKDOWN)
if __name__ == "__main__":
    # Launch only when run as a script; importing the module just builds `demo`.
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all network interfaces
        "server_port": 7860,
        "theme": SentinelTheme(),
        "css": CUSTOM_CSS,
    }
    demo.launch(**launch_options)