"""FastAPI entry-point for the Incident Command Center environment. Besides the OpenEnv contract endpoints (`/reset`, `/step`, `/state`, `/close`) registered by `create_fastapi_app`, this module exposes: - `GET /` and `GET /web` — interactive HTML dashboard. - `GET /healthz` — liveness / readiness probe for orchestrators. - `GET /version` — build metadata. - `GET /metadata` — static environment metadata (action space, reward model). - `GET /metrics` — lightweight in-process counters (best-effort). The dashboard is written inline so the environment ships as a single directory and can be embedded in Hugging Face Spaces without extra assets. """ from __future__ import annotations import json import logging from pathlib import Path from typing import Any, Dict import uvicorn from fastapi.responses import HTMLResponse, JSONResponse, PlainTextResponse from fastapi.staticfiles import StaticFiles from openenv.core.env_server import create_fastapi_app from models import IncidentAction, IncidentObservation from server.config import EnvConfig from server.domain import ALL_ACTIONS, ALL_ROLES, build_incident_library from server.domain.reward import ( CLOSURE_CORRECT_BASE, CLOSURE_WRONG_PENALTY, CLUE_REWARD, HANDOFF_CORRECT_REWARD, MITIGATION_CORRECT_REWARD, STEP_COST_INVESTIGATION, TIER_MULTIPLIER, ) from server.environment import IncidentCommandCenterEnvironment from server.logging_utils import configure_logging _LOG = logging.getLogger("icc.app") _CONFIG = EnvConfig.from_env() configure_logging(level=_CONFIG.log_level, structured=_CONFIG.structured_logging) # External URLs surfaced on the dashboard so judges can jump straight from # the HF Space to the GitHub / Colab / docs / training artifacts. 
GITHUB_URL = "https://github.com/SwapnilPatil28/Multi-Agent-Incident-Command-Center"
SPACE_PAGE_URL = "https://huggingface.co/spaces/SwapnilPatil28/Multi-Agent-Incident-Command-Center"
SPACE_APP_URL = "https://swapnilpatil28-multi-agent-incident-command-center.hf.space"
COLAB_URL = "https://colab.research.google.com/drive/1vx9E5FrZZrHoRwXs2cvtom3DaI6kZ3LP?usp=sharing"

# Dashboard doc links point at the Hugging Face Space copies of the docs (not
# GitHub) so a judge who opens the Space stays inside the HF ecosystem. All
# three links below are `/blob/main/...` paths under the Space page URL.
README_URL = f"{SPACE_PAGE_URL}/blob/main/README.md"
BLOG_POST_URL = f"{SPACE_PAGE_URL}/blob/main/docs/BLOG_POST.md"
SUBMISSION_CHECKLIST_URL = f"{SPACE_PAGE_URL}/blob/main/docs/SUBMISSION_CHECKLIST.md"

app = create_fastapi_app(
    IncidentCommandCenterEnvironment,
    IncidentAction,
    IncidentObservation,
)

# Serve the committed training-evidence artifacts (reward_curve.png,
# training_curve.png, reward_components.png, summary_metrics.json, ...)
# so the dashboard can embed them without depending on external hosts.
_ARTIFACTS_DIR = Path(__file__).resolve().parent.parent / "artifacts"
if _ARTIFACTS_DIR.exists():
    app.mount(
        "/artifacts",
        StaticFiles(directory=str(_ARTIFACTS_DIR)),
        name="artifacts",
    )


def _load_summary_metrics() -> Dict[str, Any]:
    """Best-effort load of the committed training results for the dashboard.

    Returns:
        The parsed contents of ``artifacts/summary_metrics.json`` when the
        file exists, is readable UTF-8, and holds a JSON object; otherwise an
        empty dict. This function never raises for a missing, unreadable, or
        malformed file, so the dashboard can always render.
    """
    path = _ARTIFACTS_DIR / "summary_metrics.json"
    try:
        with path.open("r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except FileNotFoundError:
        # The artifacts are optional; a missing file is the expected case.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a badly-encoded file cannot take the dashboard down.
        _LOG.warning("Ignoring unreadable summary metrics at %s: %s", path, exc)
        return {}
    if not isinstance(payload, dict):
        # Callers use ``.get(...)`` on the result; anything but an object
        # (e.g. a top-level list) would crash them.
        _LOG.warning("Ignoring summary metrics at %s: top-level JSON is not an object", path)
        return {}
    return payload


# ---------------------------------------------------------------------------
# Introspection helpers
# ---------------------------------------------------------------------------


def _resolve_environment() -> IncidentCommandCenterEnvironment | None:
    """Best-effort retrieval of the running environment instance.

    OpenEnv versions differ in where they stash the environment, so we try
    a few well-known attribute names before giving up.

    Returns:
        The first non-``None`` value found on ``app.state`` under one of the
        probed attribute names, or ``None`` when none of them is set.
    """
    for attr in ("environment", "env", "_environment"):
        env = getattr(app.state, attr, None)
        if env is not None:
            return env  # type: ignore[return-value]
    return None


def _metadata_payload() -> Dict[str, Any]:
    """Assemble the static environment description served by `/env-info`.

    The payload combines the incident library (task names and template
    counts), the action/role taxonomy, the reward constants, and the
    per-tier budgets and SLA minutes taken from the loaded configuration.
    """
    library = build_incident_library()
    # Query the task list once and reuse it for both fields.
    tasks = library.tasks()
    return {
        "name": _CONFIG.name,
        "version": _CONFIG.version,
        "tasks": tasks,
        "incidents_per_task": {
            task: len(library.templates_for(task)) for task in tasks
        },
        "actions": list(ALL_ACTIONS),
        "roles": list(ALL_ROLES),
        "reward_model": {
            "step_cost_investigation": STEP_COST_INVESTIGATION,
            "clue_reward": CLUE_REWARD,
            "handoff_correct": HANDOFF_CORRECT_REWARD,
            "mitigation_correct": MITIGATION_CORRECT_REWARD,
            "closure_correct_base": CLOSURE_CORRECT_BASE,
            "closure_wrong": CLOSURE_WRONG_PENALTY,
            "tier_multiplier": TIER_MULTIPLIER,
        },
        "budgets": {
            "easy": _CONFIG.easy_budget,
            "medium": _CONFIG.medium_budget,
            "hard": _CONFIG.hard_budget,
        },
        "sla_minutes": {
            "easy": _CONFIG.easy_sla_minutes,
            "medium": _CONFIG.medium_sla_minutes,
            "hard": _CONFIG.hard_sla_minutes,
        },
    }


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------


@app.get("/healthz", response_class=JSONResponse)
async def healthz() -> JSONResponse:
    """Liveness probe: always reports ``status: ok`` plus name and version."""
    return JSONResponse(
        {
            "status": "ok",
            "name": _CONFIG.name,
            "version": _CONFIG.version,
        }
    )


@app.get("/version", response_class=JSONResponse)
async def version() -> JSONResponse:
    """Return build information: name, version, and the default seed."""
    return JSONResponse(
        {
            "name": _CONFIG.name,
            "version": _CONFIG.version,
            "default_seed": _CONFIG.default_seed,
        }
    )


@app.get("/env-info", response_class=JSONResponse)
async def env_info() -> JSONResponse:
    """Rich metadata about the environment (rubric, budgets, taxonomy)."""
    return JSONResponse(_metadata_payload())


@app.get("/metrics", response_class=PlainTextResponse)
async def metrics() -> PlainTextResponse:
    """Return plain-text counters describing the current episode state."""
    # NOTE: the statement below is completed on the following source line.
    env
= _resolve_environment()
    # The info line is always emitted, even when no environment is resolved.
    lines = [
        f'icc_info{{name="{_CONFIG.name}",version="{_CONFIG.version}"}} 1',
    ]
    # Episode counters are only available once an environment with a state
    # object has been found; otherwise only the info line is returned.
    if env is not None and env.state is not None:
        s = env.state
        lines += [
            f'icc_episode_step_total {s.step_count}',
            f'icc_cumulative_reward {s.cumulative_reward}',
            f'icc_incidents_resolved_total {s.incidents_resolved}',
            f'icc_incidents_failed_total {s.incidents_failed}',
            f'icc_budget_remaining {s.budget_remaining}',
            f'icc_sla_minutes_remaining {s.sla_minutes_remaining}',
            f'icc_current_incident_index {s.current_incident_index}',
        ]
    # One metric per line, with a trailing newline after the last one.
    return PlainTextResponse("\n".join(lines) + "\n")


@app.get("/", response_class=HTMLResponse)
@app.get("/web", response_class=HTMLResponse)
async def root() -> HTMLResponse:
    """Serve the inline HTML dashboard at both `/` and `/web`."""
    return HTMLResponse(_dashboard_html())


def _dashboard_html() -> str:
    """Build the dashboard page as a single HTML string.

    The page is rebuilt on every call from the metadata payload and the
    committed summary metrics.
    """
    # NOTE(review): only the head of this function is visible here; the HTML
    # template that follows is truncated in this view.
    metadata_json = json.dumps(_metadata_payload(), indent=2)
    metrics = _load_summary_metrics()
    # The plots section is gated on the presence of reward_curve.png.
    artifacts_available = _ARTIFACTS_DIR.exists() and (
        _ARTIFACTS_DIR / "reward_curve.png"
    ).exists()

    # --- Headline training numbers (1.5B SFT vs base, hard task) -------------
    # Each list falls back to three zeros when the key is missing or empty.
    base_rewards = metrics.get("base_model_rewards") or [0.0, 0.0, 0.0]
    sft_rewards = metrics.get("sft_model_rewards") or [0.0, 0.0, 0.0]
    improvement = metrics.get("improvement_sft_over_base") or [0.0, 0.0, 0.0]
    # Index 2 is read as the headline figure; guard against shorter lists.
    # presumably the order is easy / medium / hard — TODO confirm.
    headline_delta = improvement[2] if len(improvement) >= 3 else 0.0

    def _fmt(val: Any) -> str:
        # Signed two-decimal rendering; an em dash for non-numeric values.
        try:
            return f"{float(val):+.2f}"
        except (TypeError, ValueError):
            return "—"

    training_rows = "".join(
        f"
Committed artifacts from the reference training run (Qwen2.5-1.5B-Instruct, 8 episodes/task, 3 epochs) plus the Qwen2.5-0.5B-Instruct ablation. Click any plot to open it full-size.
~2.84 → ~0.02 and next-token accuracy climbs from
~0.49 → ~0.99 over three epochs on 680 rollout tokens.
clue_bonus,
mitigation_correct, closure_correct,
speed_bonus) while the base model stalls on
step_cost and SLA penalties.
Raw files: summary_metrics.json · training_log.json · summary_metrics_qwen0p5b.json
""".format(hard=_fmt(headline_delta)) else: plots_html = ( "Plots not bundled in this image. " "See the GitHub artifacts folder.
Same pipeline, same data schema — only the base-model size differs. The 0.5B model cannot absorb the expert policy; 1.5B matches it exactly.
| Model | Easy Δ | Medium Δ | Hard Δ | Heuristic match? |
|---|---|---|---|---|
| Qwen2.5-0.5B-Instruct | +0.43 | +0.14 | +0.00 | No (stuck on step-cost) |
| Qwen2.5-1.5B-Instruct | -1.80 | +3.13 | +10.17 | Yes (exact match) |
{name}
When a real tech company has an outage, three people's phones
buzz at once — a Triage engineer, an Investigator, and an Ops
Manager. They have to cooperate under a ticking SLA clock,
every action costs budget, and every wrong call costs
real money (enterprise outages hurt ~3× more than free-tier).
We built a simulator of that war room — and we fine-tuned an LLM to run it
as well as the human expert.
Three specialist agents with different permissions resolve a live queue of 13 realistic tech incidents across 3 difficulty tiers.
| Role | Can do | Cannot do |
|---|---|---|
| 🔍 Triage | Pull logs · check metrics · consult KB | Close a ticket |
| 🧪 Investigator | Apply a fix · roll back a deploy | Escalate or file a post-mortem |
| 👷 Ops Manager | Escalate · file post-mortem · close the ticket | Apply a code fix |
Not "pick the right label." It learned a whole workflow — dig up clues, hand off to the right specialist, apply the correct fix, respect the SLA, file the post-mortem, close the ticket. The rubric makes every piece of that workflow visible as a named reward component, so you can see why the agent earned (or lost) points at every step.
Three distinct roles with non-overlapping permissions.
Wrong-actor calls → -0.08. Correct handoff → +0.15.
Cooperation is trained, not hard-coded.
Each episode runs 3–5 sequential incidents over 20–60 steps with a single ticking SLA clock. Big rewards (+0.80 × tier) only fire after clues → fix → post-mortem. Sparse and delayed by design.
Real logs, metrics, KB articles, red-herring signals, customer tiers, SLA timers, revenue impact. Close an enterprise ticket wrong and it hurts ~3× what a free-tier one does.
↓ Keep scrolling for the headline numbers, training plots, ablation, and the full rubric. Or jump straight to the README or the blog post.
| Task tier | Base reward | SFT reward | Δ |
|---|
Numbers loaded live from summary_metrics.json committed alongside this Space.
Standard OpenEnv contract plus operational endpoints.
POST /reset — start a new episode (task_name, seed).POST /step — submit an IncidentAction.GET /state — full environment state.GET /healthz — liveness probe.GET /version — build information.GET /env-info — action space, reward model, budgets.GET /metrics — Prometheus-style counters.GET /docs — interactive OpenAPI documentation.GET /artifacts/… — committed training plots & metrics.Each action is gated by the acting role; wrong-actor calls are penalised.
Composable rubric with anti-gaming safeguards. Every step returns a
reward_components dictionary so training curves are
interpretable. Closure rewards and SLA penalties are scaled by
customer-tier multipliers:
{"".join(f"{tier}: x{mult}" for tier, mult in TIER_MULTIPLIER.items())}
| Component | Signal |
|---|
Full rubric (invalid-action, repeated-lookup, rollback-effective, post-mortem-logged, etc.) is documented in the README.
{metadata_json}