"""FastAPI entry-point for the Incident Command Center environment.

Besides the OpenEnv contract endpoints (`/reset`, `/step`, `/state`, `/close`)
registered by `create_fastapi_app`, this module exposes:

- `GET /` and `GET /web` — interactive HTML dashboard.
- `GET /healthz` — liveness / readiness probe for orchestrators.
- `GET /version` — build metadata.
- `GET /env-info` — static environment metadata (action space, reward model).
- `GET /metrics` — lightweight in-process counters (best-effort).
- `/artifacts/...` — static training-evidence files, mounted only when the
  `artifacts` directory exists next to the package.

The dashboard is written inline so the environment ships as a single
directory and can be embedded in Hugging Face Spaces without extra assets.
"""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Any, Dict

import uvicorn
from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse
from fastapi.staticfiles import StaticFiles
from openenv.core.env_server import create_fastapi_app

from models import IncidentAction, IncidentObservation
from server.config import EnvConfig
from server.domain import ALL_ACTIONS, ALL_ROLES, build_incident_library
from server.domain.reward import (
    CLOSURE_CORRECT_BASE,
    CLOSURE_WRONG_PENALTY,
    CLUE_REWARD,
    HANDOFF_CORRECT_REWARD,
    MITIGATION_CORRECT_REWARD,
    STEP_COST_INVESTIGATION,
    TIER_MULTIPLIER,
)
from server.environment import IncidentCommandCenterEnvironment
from server.logging_utils import configure_logging

_LOG = logging.getLogger("icc.app")
# Configuration is read once at import time; logging is configured from it
# before the FastAPI app object is created further down the module.
_CONFIG = EnvConfig.from_env()
configure_logging(level=_CONFIG.log_level, structured=_CONFIG.structured_logging)

# External URLs surfaced on the dashboard so judges can jump straight from
# the HF Space to the GitHub / Colab / docs / training artifacts.
GITHUB_URL = "https://github.com/SwapnilPatil28/Multi-Agent-Incident-Command-Center"
SPACE_PAGE_URL = "https://huggingface.co/spaces/SwapnilPatil28/Multi-Agent-Incident-Command-Center"
SPACE_APP_URL = "https://swapnilpatil28-multi-agent-incident-command-center.hf.space"
COLAB_URL = "https://colab.research.google.com/drive/1vx9E5FrZZrHoRwXs2cvtom3DaI6kZ3LP?usp=sharing"

# Dashboard doc links point at the Hugging Face Space copies of the docs (not
# GitHub) so a judge who opens the Space stays inside the HF ecosystem. All
# three are `blob/main/...` paths under the Space page URL.
README_URL = f"{SPACE_PAGE_URL}/blob/main/README.md"
BLOG_POST_URL = f"{SPACE_PAGE_URL}/blob/main/docs/BLOG_POST.md"
SUBMISSION_CHECKLIST_URL = f"{SPACE_PAGE_URL}/blob/main/docs/SUBMISSION_CHECKLIST.md"

app = create_fastapi_app(
    IncidentCommandCenterEnvironment,
    IncidentAction,
    IncidentObservation,
)

# Serve the committed training-evidence artifacts (reward_curve.png,
# training_curve.png, reward_components.png, summary_metrics.json, ...)
# so the dashboard can embed them without depending on external hosts.
_ARTIFACTS_DIR = Path(__file__).resolve().parent.parent / "artifacts"
if _ARTIFACTS_DIR.exists():
    app.mount(
        "/artifacts",
        StaticFiles(directory=str(_ARTIFACTS_DIR)),
        name="artifacts",
    )


def _load_summary_metrics() -> Dict[str, Any]:
    """Best-effort load of the committed training results for the dashboard.

    Returns:
        The parsed contents of ``artifacts/summary_metrics.json`` when the
        file is readable and holds a JSON object; otherwise an empty dict.
        Callers use ``.get`` on the result, so any other JSON shape is
        treated the same as a missing file.
    """
    path = _ARTIFACTS_DIR / "summary_metrics.json"
    try:
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (both are subclasses).
        return {}
    return payload if isinstance(payload, dict) else {}


# ---------------------------------------------------------------------------
# Introspection helpers
# ---------------------------------------------------------------------------


def _resolve_environment() -> IncidentCommandCenterEnvironment | None:
    """Best-effort retrieval of the running environment instance.

    OpenEnv versions differ in where they stash the environment, so we try a
    few well-known attribute names on ``app.state`` before giving up.

    Returns:
        The first non-``None`` attribute found, or ``None`` when none of the
        candidate attributes is set.
    """
    for attr in ("environment", "env", "_environment"):
        env = getattr(app.state, attr, None)
        if env is not None:
            return env  # type: ignore[return-value]
    return None


def _metadata_payload() -> Dict[str, Any]:
    """Build the static metadata document served by ``/env-info``.

    The same payload is embedded (as pretty-printed JSON) in the dashboard.
    It is assembled from the incident library, the action/role taxonomies,
    the reward constants and the per-tier budget / SLA settings in ``_CONFIG``.
    """
    library = build_incident_library()
    tasks = library.tasks()
    return {
        "name": _CONFIG.name,
        "version": _CONFIG.version,
        "tasks": tasks,
        "incidents_per_task": {
            task: len(library.templates_for(task)) for task in tasks
        },
        "actions": list(ALL_ACTIONS),
        "roles": list(ALL_ROLES),
        "reward_model": {
            "step_cost_investigation": STEP_COST_INVESTIGATION,
            "clue_reward": CLUE_REWARD,
            "handoff_correct": HANDOFF_CORRECT_REWARD,
            "mitigation_correct": MITIGATION_CORRECT_REWARD,
            "closure_correct_base": CLOSURE_CORRECT_BASE,
            "closure_wrong": CLOSURE_WRONG_PENALTY,
            "tier_multiplier": TIER_MULTIPLIER,
        },
        "budgets": {
            "easy": _CONFIG.easy_budget,
            "medium": _CONFIG.medium_budget,
            "hard": _CONFIG.hard_budget,
        },
        "sla_minutes": {
            "easy": _CONFIG.easy_sla_minutes,
            "medium": _CONFIG.medium_sla_minutes,
            "hard": _CONFIG.hard_sla_minutes,
        },
    }


def _escape_label(value: Any) -> str:
    """Escape a value for use inside a double-quoted metric label.

    Backslash, double quote and newline are the characters that would
    otherwise break the ``name="value"`` syntax emitted by ``/metrics``.
    """
    return (
        f"{value}"
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _fmt_signed(val: Any) -> str:
    """Format a number with an explicit sign and two decimals.

    Returns an em dash when ``val`` cannot be converted to ``float``.
    """
    try:
        return f"{float(val):+.2f}"
    except (TypeError, ValueError):
        return "—"


def _reward_series(summary: Dict[str, Any], key: str) -> list[Any]:
    """Return ``summary[key]`` as a list, or three zeros when unusable.

    A missing key, an empty value, or anything that is not a list/tuple all
    fall back to ``[0.0, 0.0, 0.0]`` so the dashboard still renders.
    """
    values = summary.get(key)
    if isinstance(values, (list, tuple)) and values:
        return list(values)
    return [0.0, 0.0, 0.0]


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------


@app.get("/healthz", response_class=JSONResponse)
async def healthz() -> JSONResponse:
    """Liveness / readiness probe: always reports ``status: ok``."""
    return JSONResponse(
        {
            "status": "ok",
            "name": _CONFIG.name,
            "version": _CONFIG.version,
        }
    )


@app.get("/version", response_class=JSONResponse)
async def version() -> JSONResponse:
    """Return the configured name, version and default seed."""
    return JSONResponse(
        {
            "name": _CONFIG.name,
            "version": _CONFIG.version,
            "default_seed": _CONFIG.default_seed,
        }
    )


@app.get("/env-info", response_class=JSONResponse)
async def env_info() -> JSONResponse:
    """Rich metadata about the environment (rubric, budgets, taxonomy)."""
    return JSONResponse(_metadata_payload())


@app.get("/metrics", response_class=PlainTextResponse)
async def metrics() -> PlainTextResponse:
    """Expose best-effort counters as plain-text ``name value`` lines.

    An ``icc_info`` line is always emitted. Episode counters are appended
    only when an environment instance can be resolved and it exposes a
    non-``None`` ``state``.
    """
    env = _resolve_environment()
    name = _escape_label(_CONFIG.name)
    release = _escape_label(_CONFIG.version)
    lines = [
        f'icc_info{{name="{name}",version="{release}"}} 1',
    ]
    # getattr keeps the endpoint alive if the resolved object has no `state`
    # attribute (or if no environment was found at all).
    s = getattr(env, "state", None)
    if s is not None:
        lines += [
            f'icc_episode_step_total {s.step_count}',
            f'icc_cumulative_reward {s.cumulative_reward}',
            f'icc_incidents_resolved_total {s.incidents_resolved}',
            f'icc_incidents_failed_total {s.incidents_failed}',
            f'icc_budget_remaining {s.budget_remaining}',
            f'icc_sla_minutes_remaining {s.sla_minutes_remaining}',
            f'icc_current_incident_index {s.current_incident_index}',
        ]
    return PlainTextResponse("\n".join(lines) + "\n")


@app.get("/", response_class=HTMLResponse)
@app.get("/web", response_class=HTMLResponse)
async def root() -> HTMLResponse:
    """Serve the inline dashboard on both ``/`` and ``/web``."""
    return HTMLResponse(_dashboard_html())


def _dashboard_html() -> str:
    """Render the dashboard page as a single string.

    The page is rebuilt on every call from the metadata payload, the
    committed ``summary_metrics.json`` (when readable) and the reward
    constants. Missing or malformed training metrics degrade to zeros or
    fewer table rows rather than raising.
    """
    metadata_json = json.dumps(_metadata_payload(), indent=2)
    # Named `summary` rather than `metrics` to avoid shadowing the /metrics
    # route coroutine defined above.
    summary = _load_summary_metrics()
    artifacts_available = (_ARTIFACTS_DIR / "reward_curve.png").exists()

    # --- Headline training numbers (1.5B SFT vs base, hard task) -------------
    base_rewards = _reward_series(summary, "base_model_rewards")
    sft_rewards = _reward_series(summary, "sft_model_rewards")
    improvement = _reward_series(summary, "improvement_sft_over_base")
    headline_delta = improvement[2] if len(improvement) >= 3 else 0.0
    headline_text = _fmt_signed(headline_delta)

    # zip() stops at the shortest series, so a ragged metrics file yields
    # fewer rows instead of an IndexError.
    training_rows = "".join(
        f"{tier}{_fmt_signed(base)}"
        f"{_fmt_signed(sft)}"
        f"{_fmt_signed(delta)}"
        for tier, base, sft, delta in zip(
            ("easy", "medium", "hard"), base_rewards, sft_rewards, improvement
        )
    )

    # --- Training-evidence block (plots + caption) ---------------------------
    if artifacts_available:
        # `{hard}` already carries its own sign (see _fmt_signed), so the
        # template must not prepend another "+".
        plots_html = """

Training evidence

Committed artifacts from the reference training run (Qwen2.5-1.5B-Instruct, 8 episodes/task, 3 epochs) plus the Qwen2.5-0.5B-Instruct ablation. Click any plot to open it full-size.

Reward curve by policy (1.5B)
1.5B reward curve. Mean episodic reward per task tier across Random / Heuristic / Base-LLM / SFT-LLM. SFT matches the heuristic demonstrator across every tier and outperforms the untuned base by {hard} on hard incidents.
SFT training loss and token accuracy (1.5B)
1.5B training curve. Supervised loss collapses from ~2.84 → ~0.02 and next-token accuracy climbs from ~0.49 → ~0.99 over three epochs on 680 rollout tokens.
Reward component decomposition (1.5B)
1.5B reward-component breakdown. SFT reproduces the heuristic's positive components (clue_bonus, mitigation_correct, closure_correct, speed_bonus) while the base model stalls on step_cost and SLA penalties.
Reward curve by policy (0.5B ablation)
0.5B ablation reward curve. Same pipeline, smaller backbone. SFT improves by only +0.43 / +0.14 / +0.00 over base — the 0.5B model is too small to absorb the multi-step, role-gated policy. Scale is the story.

Raw files: summary_metrics.json · training_log.json · summary_metrics_qwen0p5b.json

""".format(hard=headline_text)
    else:
        plots_html = (
            "\n\nTraining evidence\n\n"
            "\n\nPlots not bundled in this image. "
            "See the GitHub artifacts folder.\n\n"
        )

    # --- 0.5B ablation summary ----------------------------------------------
    ablation_html = """

Ablation: model scale matters for imitation learning

Same pipeline, same data schema — only the base-model size differs. The 0.5B model cannot absorb the expert policy; 1.5B matches it exactly.

ModelEasy ΔMedium ΔHard Δ Heuristic match?
Qwen2.5-0.5B-Instruct +0.43+0.14+0.00 No (stuck on step-cost)
Qwen2.5-1.5B-Instruct -1.80+3.13+10.17 Yes (exact match)
"""

    # Theme mapping now lives in the top story block — keep this var empty
    # so the existing `{themes_html}` slot renders to nothing (no duplication).
    themes_html = ""

    # --- Reward-rubric details ----------------------------------------------
    reward_rubric_rows = "".join(
        f"{name}{value}"
        for name, value in (
            ("step_cost", f"{STEP_COST_INVESTIGATION} per investigation step"),
            ("clue_reward", f"+{CLUE_REWARD} per new fact"),
            ("handoff_correct", f"+{HANDOFF_CORRECT_REWARD}"),
            ("mitigation_correct", f"+{MITIGATION_CORRECT_REWARD}"),
            ("closure_correct_base", f"+{CLOSURE_CORRECT_BASE} × tier multiplier"),
            ("closure_wrong", f"{CLOSURE_WRONG_PENALTY} × tier multiplier"),
        )
    )

    # Pre-render the small repeated fragments so the page template below
    # contains only plain name substitutions.
    actions_html = "".join(f"{a}" for a in ALL_ACTIONS)
    tier_multipliers_html = "".join(
        f"{tier}: x{mult}" for tier, mult in TIER_MULTIPLIER.items()
    )

    return f""" Incident Command Center | OpenEnv Dashboard

Incident Command Center

OpenEnv · Multi-Agent · Long-Horizon · Professional-Task Simulation

🚨 The story in 2 minutes

When a real tech company has an outage, three people's phones buzz at once — a Triage engineer, an Investigator, and an Ops Manager. They have to cooperate under a ticking SLA clock, every action costs budget, and every wrong call costs real money (enterprise outages hurt ~3× more than free-tier).

We built a simulator of that war room — and we fine-tuned an LLM to run it as well as the human expert.

What is the environment?

Three specialist agents with different permissions resolve a live queue of 13 realistic tech incidents across 3 difficulty tiers.

RoleCan doCannot do
🔍 Triage Pull logs · check metrics · consult KB Close a ticket
🧪 Investigator Apply a fix · roll back a deploy Escalate or file a post-mortem
👷 Ops Manager Escalate · file post-mortem · close the ticket Apply a code fix

What did the agent learn?

Not "pick the right label." It learned a whole workflow — dig up clues, hand off to the right specialist, apply the correct fix, respect the SLA, file the post-mortem, close the ticket. The rubric makes every piece of that workflow visible as a named reward component, so you can see why the agent earned (or lost) points at every step.

Why it matters for the 3 hackathon themes

🤝 Theme #1 — Multi-Agent

Three distinct roles with non-overlapping permissions. Wrong-actor calls → -0.08. Correct handoff → +0.15. Cooperation is trained, not hard-coded.

⏱️ Theme #2 — Long-Horizon

Each episode runs 3–5 sequential incidents over 20–60 steps with a single ticking SLA clock. Big rewards (+0.80 × tier) only fire after clues → fix → post-mortem. Sparse and delayed by design.

🏢 Theme #3 — Professional World-Model

Real logs, metrics, KB articles, red-herring signals, customer tiers, SLA timers, revenue impact. Close an enterprise ticket wrong and it hurts ~3× what a free-tier one does.

↓ Keep scrolling for the headline numbers, training plots, ablation, and the full rubric. Or jump straight to the README or the blog post.

Resources & documentation

💻
GitHub repository
Full source, tests, Dockerfile, CI-ready
🤗
Hugging Face Space page
Repo view, build logs, discussions
🟢
Live environment
You are here — OpenEnv endpoints live
🎓
Reproduce training (Colab T4)
One-click notebook, ~1 h wall clock
📖
README (Part 1 + Part 2)
Story overview + full technical deep-dive
📝
Mini blog post
The short writeup — MD file on the HF Space + GitHub
Submission checklist
Every judging rule → where to find the evidence

Headline results

SFT reward lift on hard tasks {headline_text} vs Qwen2.5-1.5B-Instruct base
Heuristic-policy match Exact SFT clones the demonstrator across every tier
Scale ablation (hard Δ) 0.5B → 1.5B +0.00 → +10.17: capacity matters
Training data 680 rows 24 heuristic rollouts · 3 epochs

Environment at a glance

Incidents in library
Specialist roles 3 triage · investigator · ops manager
Reward components 14+ rubric-based, transparent
Seeded reproducibility Yes default seed {_CONFIG.default_seed}

1.5B SFT vs base (reference run)

{training_rows}
Task tierBase rewardSFT rewardΔ

Numbers loaded live from summary_metrics.json committed alongside this Space.

{plots_html} {ablation_html} {themes_html}

Endpoints

Standard OpenEnv contract plus operational endpoints.

Action space

{actions_html}

Each action is gated by the acting role; wrong-actor calls are penalised.

Reward model

Composable rubric with anti-gaming safeguards. Every step returns a reward_components dictionary so training curves are interpretable. Closure rewards and SLA penalties are scaled by customer-tier multipliers:

{tier_multipliers_html}

{reward_rubric_rows}
ComponentSignal

Full rubric (invalid-action, repeated-lookup, rollback-effective, post-mortem-logged, etc.) is documented in the README.

Metadata

{metadata_json}
"""


def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Run the app under uvicorn.

    Args:
        host: Interface to bind; defaults to all interfaces.
        port: TCP port to listen on; defaults to 8000.
    """
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()