"""FastAPI server for the ARIA DevOps Incident Response OpenEnv environment.

Exposes episode control (/reset, /step, /state), task metadata, self-validation,
metrics/leaderboard aggregation, a websocket driver, dual-agent and curriculum
routes, an episode replay store, and several HTML dashboard pages.

NOTE(review): this module was recovered from a whitespace-mangled source; the
HTML template strings below appear to have lost their markup (tags stripped),
leaving only text content and f-string placeholders. They are preserved as
found — confirm against the original deployment before editing them.
"""
from __future__ import annotations
from fastapi import FastAPI, HTTPException, Request, WebSocket, WebSocketDisconnect  # NOTE(review): Request appears unused in this file
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from models import Action, ActionType, Observation, StepResult, State
from server.devops_environment import DevOpsEnvironment
from collections import deque
from datetime import datetime
import uuid
import statistics  # NOTE(review): unused in this file — candidate for removal
from generator.incident_factory import IncidentFactory
from curriculum import CurriculumEngine
from multi_agent import DualAgentSession

# ─── Module-level singletons and in-memory stores (reset on server restart) ──
_factory = IncidentFactory()                 # procedural incident generator
curriculum_engine = CurriculumEngine()       # adaptive-difficulty tracker
multi_agent_sessions: dict = {}              # session_id -> DualAgentSession
episode_history = deque(maxlen=1000)         # bounded record of finished episodes
replay_store: dict = {}                      # episode_id (str int) -> replay record
replay_counter: int = 0                      # monotonically increasing replay id
current_episode_steps: list = []             # per-step data for the in-flight episode


def track_episode(state_obj: State):
    """Grade a finished episode and append a summary record to episode_history.

    Computes the final score via graders.grader.grade_episode, then derives
    two behavioral statistics from the action history: the step index of the
    first "diagnose" action and the fraction of actions that were
    information-gathering.

    Args:
        state_obj: Final environment State after the episode ended.
    """
    # Imported lazily so the grader is only loaded when an episode finishes.
    from graders.grader import grade_episode
    score = grade_episode(
        task_id=state_obj.task_id,
        action_history=state_obj.action_history,
        ground_truth_root_cause=state_obj.ground_truth_root_cause,
        ground_truth_fix=state_obj.ground_truth_fix,
        incident_resolved=state_obj.incident_resolved,
        total_reward=state_obj.total_reward
    )
    # Actions that count as "information gathering" for the ratio metric.
    info_actions = {"read_logs", "read_metrics", "read_runbook", "search_logs"}
    info_count = 0
    diag_step = None
    for act in state_obj.action_history:
        at = act["action"].get("action_type")
        if at in info_actions:
            info_count += 1
        # Record only the FIRST diagnose step.
        if at == "diagnose" and diag_step is None:
            diag_step = act["step"]
    # Guard against empty histories (e.g. episode ended on step 0).
    info_ratio = info_count / len(state_obj.action_history) if state_obj.action_history else 0.0
    # Try to extract seed from info or fallback to 42 since seed is lost in State model
    seed = state_obj.info.get("seed", 42)
    record = {
        "episode_id": state_obj.episode_id or str(uuid.uuid4()),
        "task_id": state_obj.task_id,
        "seed": seed,
        "steps_taken": state_obj.step,
        "incident_resolved": state_obj.incident_resolved,
        "final_score": float(score),
        "steps_to_diagnosis": diag_step,
        "info_gathering_ratio": float(info_ratio),
        # NOTE(review): datetime.utcnow() is naive and deprecated in 3.12+;
        # consider datetime.now(timezone.utc) — would change the string slightly.
        "timestamp": datetime.utcnow().isoformat() + "Z"
    }
    episode_history.append(record)


# Attempt to load create_web_interface_app, fallback to ordinary FastAPI app
try:
    from openenv.core.env_server import create_web_interface_app
    HAS_WEB_INTERFACE = True
except ImportError:
    HAS_WEB_INTERFACE = False

# Task ids accepted by /reset (plus the special "generated" procedural task).
VALID_TASKS = ("easy", "medium", "hard", "bonus", "security", "database", "failover")
# Shared environment instance used by the HTTP routes (websocket gets its own).
_env = DevOpsEnvironment()

app = FastAPI(
    title="DevOps Incident Response — OpenEnv",
    description="An OpenEnv-compliant RL environment",
    version="1.0.0",
)
# Wide-open CORS so the hosted dashboards and third-party clients can call the API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


class ResetRequest(BaseModel):
    """Body for POST /reset."""
    task_id: str = "easy"
    seed: Optional[int] = None


class CurriculumRecordRequest(BaseModel):
    """Body for POST /curriculum/record."""
    task_id: str
    score: float


class MultiAgentResetRequest(BaseModel):
    """Body for POST /multi-agent/reset."""
    task_id: str
    seed: int = 42


class AgentAStepRequest(BaseModel):
    """Body for POST /multi-agent/step/a/{session_id}."""
    finding: str


@app.get("/about")
async def about():
    """Static metadata about the environment, training run, and reward design."""
    return {
        "name": "ARIA — DevOps Incident Response",
        "version": "2.0.0",
        "description": (
            "OpenEnv-compliant RL environment for production incident "
            "response. AI agents diagnose and remediate software incidents "
            "across 7 task types using 14 actions with dense reward shaping."
        ),
        "tasks": 8,  # 7 curated + 1 procedural ("generated")
        "action_types": 14,
        "themes": [
            "World Modeling: Professional Tasks",
            "Self-Improvement: Curriculum Engine",
            "Multi-Agent Interactions: Dual-Agent Mode"
        ],
        "features": {
            "curriculum_engine": "Adaptive difficulty based on agent performance. Promotes when avg > 0.75, scaffolds when avg < 0.30.",
            "incident_generator": "Procedural incidents from seeds 0-99999. 6 failure modes x 8 services x 3 severities.",
            "dual_agent_mode": "Split observability — Observer sees logs/alerts, Responder sees metrics/deps.",
            "reward_shaping": "Dense rewards with collateral damage penalties (-0.15), blind remediation penalties (-0.10), semantic diagnosis matching."
        },
        "training": {
            "model": "Llama-3.1-8B-Instruct",
            "algorithm": "GRPO (Group Relative Policy Optimization)",
            "framework": "HuggingFace TRL + Unsloth",
            "lora_rank": 32,
            "episodes": 160,
            "adapter_3b": "https://huggingface.co/Arijit-07/aria-devops-llama3b",
            "adapter_8b": "https://huggingface.co/Arijit-07/aria-devops-llama8b"
        },
        "reward_design": {
            "type": "dense",
            "range": [0.001, 0.999],
            "gates": {
                "read_logs_correct": 0.15,
                "read_metrics": 0.10,
                "diagnose_full": 0.35,
                "correct_fix": 0.45,
                "alert_oncall": 0.15
            },
            "penalties": {
                "collateral_damage": -0.15,
                "blind_remediation": -0.10,
                "wrong_failover": -0.25,
                "excessive_noop": -0.04
            }
        },
        "links": {
            "space": "https://arijit-07-devops-incident-response.hf.space",
            "docs": "https://arijit-07-devops-incident-response.hf.space/docs",
            "validate": "https://arijit-07-devops-incident-response.hf.space/validate",
            "github": "https://github.com/Twilight-13/devops-incident-response",
            "model_3b": "https://huggingface.co/Arijit-07/aria-devops-llama3b",
            "model_8b": "https://huggingface.co/Arijit-07/aria-devops-llama8b",
            "blog": "https://huggingface.co/blog/Arijit-07/aria-devops-incident-response"
        }
    }


@app.get("/live", response_class=HTMLResponse)
async def live_dashboard():
    """Serve the live NOC-style dashboard page.

    NOTE(review): the HTML markup of this template was lost in extraction;
    only the text content survives. Restore from the original source.
    """
    html = f""" ARIA NOC LIVE
Incident Response System
OFFLINE
Active Scenario
Seed:
00 / 15
0.000
00:00:00
◈ Infrastructure Status 0
◈ Active Alerts 0
◎ ALL SYSTEMS NOMINAL
◈ Episode Metrics
0.000
STEP 0/15 TASK: -- SEED: --
◈ Agent Reasoning
○ API DISCONNECTED
ⓘ Agents must read_logs before acting — blind remediation triggers -0.10 penalty
"""
    return HTMLResponse(html)


@app.get("/", response_class=HTMLResponse)
def dashboard():
    """Serve the main landing/dashboard page.

    NOTE(review): markup lost in extraction (see /live). The {{...}} pairs in
    the curl/python snippets are deliberate f-string escapes for literal braces.
    """
    html = f""" ARIA - DevOps Incident Response
⚡ OpenEnv Compliant · Meta × PyTorch × HuggingFace

ARIA

Adaptive Reward & Incident Architecture

The first OpenEnv RL environment for production incident response.
7 tasks · 14 actions · Curriculum · Dual-agent · Trained Llama-3.1-8B

7
Tasks
14
Actions
Scenarios
0.99
Max Score

Environment Tasks

Eight scenarios of escalating operational complexity

Loading tasks...

ARIA Features

What makes this environment unique

🎓

Curriculum Engine

Tracks agent performance per task with rolling averages. Promotes when mastered (avg > 0.75). Scaffolds with hints when struggling (avg < 0.30). Agents always train at the edge of their capability.

View Status →

Incident Generator

Procedural incidents from seeds 0–99,999. Six failure modes × eight services × variable noise = infinite unique training scenarios. Same seed always produces the same incident.

Try Generator →
🤝

Dual-Agent Mode

Split observability between two agents. Observer sees logs and alerts. Responder sees metrics and dependencies. Neither can solve the incident alone — they must coordinate via share_finding.

AGENT A: Observer
• alerts, logs
share_finding
AGENT B: Responder
• metrics, deps
View Sessions →
--
Total Episodes
--
Avg Score
--
Resolution Rate
--
Best Score

🏆 Leaderboard

RankTaskScoreStepsStatus
Loading leaderboard...

Quick Start

# 1. Start an incident curl -X POST https://arijit-07-devops-incident-response.hf.space/reset \ -H "Content-Type: application/json" \ -d '{{"task_id": "easy", "seed": 42}}' # 2. Read logs (reward: +0.15) curl -X POST https://arijit-07-devops-incident-response.hf.space/step \ -H "Content-Type: application/json" \ -d '{{"action_type": "read_logs", "service": "payment-service"}}' # 3. Diagnose (reward: +0.30) curl -X POST https://arijit-07-devops-incident-response.hf.space/step \ -H "Content-Type: application/json" \ -d '{{"action_type": "diagnose", "root_cause": "memory leak in payment-service"}}' # 4. Fix it (reward: +0.40) curl -X POST https://arijit-07-devops-incident-response.hf.space/step \ -H "Content-Type: application/json" \ -d '{{"action_type": "restart_service", "service": "payment-service"}}' # Score: ~0.94 ✅
import requests BASE = "https://arijit-07-devops-incident-response.hf.space" # Start episode obs = requests.post(f"{{BASE}}/reset", json={{"task_id": "easy", "seed": 42}}).json() # Take action result = requests.post(f"{{BASE}}/step", json={{"action_type": "read_logs", "service": "payment-service"}}).json() print(f"Reward: {{result['reward']}}") # 0.15

🧠 Training Evidence

Before vs After

Base Llama-3.1-8B0.000
jumps to diagnose, gets penalized
ARIA Fine-tuned0.150
reads logs first, every time
Model weights →

Training Details

Algorithm
GRPO
Base Model
Llama-3.1-8B-Instruct
Framework
Unsloth + HuggingFace TRL
LoRA Rank
32 (alpha 64)
Episodes
160
GPU
NVIDIA L4
"""
    return html


@app.get("/health")
def health():
    """Health check endpoint.

    Returns:
        {"status": "ok", "env": "devops-incident-response", "version": "2.0.0"}
    """
    return {"status": "ok", "env": "devops-incident-response", "version": "2.0.0"}


@app.get("/generate/preview")
def preview_incident(seed: int = 42):
    """Preview a procedurally generated incident without starting an episode.

    Deterministic: the same seed always produces the same incident.

    Args:
        seed: Integer seed in range 0–99999 (default: 42).

    Returns:
        Incident object with failure_mode, severity, affected_service,
        description, noise_alerts, difficulty_score.
    """
    return _factory.generate(seed)


@app.post("/reset", response_model=Observation)
async def reset(req: Optional[ResetRequest] = None):
    """Start a new episode on the shared environment.

    Same seed always produces the same episode (deterministic).

    Args:
        req: task_id (easy/medium/hard/bonus/security/database/failover/generated)
            and optional seed. Defaults to task "easy" with a random seed.

    Returns:
        Initial Observation (services, active_alerts, recent_logs,
        service_dependencies, evidence_log, sla_status, available_runbooks).

    Raises:
        HTTPException(400): if task_id is not a valid task.
    """
    if req is None:
        req = ResetRequest()
    if req.task_id not in VALID_TASKS and req.task_id != "generated":
        raise HTTPException(
            status_code=400,
            detail=f"task_id must be one of {VALID_TASKS} or 'generated'. Got: {req.task_id}",
        )
    # Discard per-step replay data from any previous episode.
    current_episode_steps.clear()
    return await _env.reset(seed=req.seed, task_id=req.task_id)


@app.post("/step", response_model=StepResult)
async def step(action: Action):
    """Take one action in the current episode. Must call /reset first.

    Accepts any of the 14 action types with their corresponding parameters
    (service, root_cause, runbook, version, reason, ip_range, table, column,
    target_region — depending on action_type).

    Returns:
        StepResult with observation, reward (float), done (bool), info (dict).

    Raises:
        HTTPException(400): if no episode is in progress.

    Side effects:
        Appends per-step data to the replay buffer; on done=True, records the
        episode in the leaderboard/metrics history and snapshots a replay
        (bounded to the 20 most recent).
    """
    global replay_counter
    if _env._logic is None:
        raise HTTPException(status_code=400, detail="Call /reset before /step")
    res = await _env.step(action)
    # Per-step record used by the replay viewer.
    step_data = {
        "step": len(current_episode_steps),
        "action": action.dict(),
        "reward": res.reward,
        "observation_summary": {
            "failing_services": [s.name for s in res.observation.services if s.status in ("down", "degraded")],
            "alert_count": len(res.observation.active_alerts),
            "evidence_count": len(res.observation.evidence_log),
        },
    }
    current_episode_steps.append(step_data)
    if res.done:
        track_episode(_env.state)
        state = _env.state
        replay_store[str(replay_counter)] = {
            "episode_id": str(replay_counter),
            "task_id": state.task_id,
            "seed": state.info.get("seed", 0),
            "final_score": round(state.total_reward, 3),
            "resolved": state.incident_resolved,
            "total_steps": state.step,
            "timestamp": datetime.utcnow().isoformat(),
            "steps": list(current_episode_steps),
        }
        replay_counter += 1
        # Evict the oldest replay to keep the store bounded at 20 entries.
        if len(replay_store) > 20:
            oldest = min(replay_store.keys(), key=int)
            del replay_store[oldest]
    return res


@app.get("/state", response_model=State)
def state():
    """Return the full current environment state including ground truth.

    Unlike /step which returns partial observations, /state reveals the
    ground truth root cause, fix, and full action history. Useful for
    evaluation and debugging.

    Raises:
        HTTPException(400): if no episode is in progress.
    """
    if _env._logic is None:
        raise HTTPException(status_code=400, detail="Call /reset before /state")
    return _env.state


@app.get("/tasks")
def list_tasks():
    """List all 8 tasks (7 curated + 1 procedural) with metadata.

    Returns:
        {"tasks": [...]} — each entry has id, name, difficulty, max_steps,
        description. Use the id values in POST /reset.
    """
    return {
        "tasks": [
            {
                "id": "easy",
                "name": "Single Service OOM",
                "difficulty": "easy",
                "max_steps": 15,
                "description": "One service crash-loops from a memory leak. Which service varies by seed.",
            },
            {
                "id": "medium",
                "name": "Cascading Multi-Service Failure",
                "difficulty": "medium",
                "max_steps": 20,
                "description": (
                    "Bad deployment causes connection pool exhaustion cascading through 3 services. "
                    "One red-herring alert included."
                ),
            },
            {
                "id": "hard",
                "name": "Silent Data Corruption",
                "difficulty": "hard",
                "max_steps": 25,
                "description": (
                    "No error-rate alerts fire. Signals are WARN-level logs and a business metric anomaly. "
                    "Requires rollback + on-call alert for full credit."
                ),
            },
            {
                "id": "bonus",
                "name": "Simultaneous Dual Failure",
                "difficulty": "hard",
                "max_steps": 25,
                "description": (
                    "Two independent failures at once: disk full on log aggregator + "
                    "model reload CPU loop on ml-inference. Both must be fixed for full credit."
                ),
            },
            {
                "id": "security",
                "name": "Security Incident (DDoS)",
                "difficulty": "hard",
                "max_steps": 20,
                "description": (
                    "A botnet is performing a DDoS and credential stuffing attack against the login endpoint. "
                    "The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
                ),
            },
            {
                "id": "database",
                "name": "Database Performance Degradation",
                "difficulty": "hard",
                "max_steps": 20,
                "description": (
                    "A recent migration added a user_segment column to the orders table without an index. "
                    "Sequential table scans are spiking DB CPU. Discovered via read_metrics and the slow query log."
                ),
            },
            {
                "id": "failover",
                "name": "Multi-Region Failover",
                "difficulty": "hard",
                "max_steps": 25,
                "description": (
                    "A primary datacenter region (us-east-1) is degraded due to a network partition. "
                    "The agent must correctly identify which services support automatic multi-region failover "
                    "and which do not. Failing over the wrong services causes severe data inconsistency penalties."
                ),
            },
            {
                "id": "generated",
                "name": "Procedural Incident",
                "difficulty": "variable",
                "max_steps": 20,
                "description": "A seed-based procedural incident generated by ARIA. Deterministic and reproducible.",
            },
        ]
    }


@app.get("/validate")
def validate():
    """Self-validation: run all 7 curated tasks with a random agent.

    For each task, instantiates a fresh logic environment with seed=42 and
    runs uniformly random actions for up to 30 steps, then grades the result.
    Verifies the environment runs without errors and scores stay in [0, 1].

    Safe to call at any time: uses fresh per-task environments and restores
    the active _env._logic afterwards.

    Returns:
        {"validation": "passed"|"failed", "summary": ..., "tasks_checked": N,
         "tasks_passed": N, "details": {task_id: {...}}, ...}
    """
    import random  # NOTE(review): shadowed by the _random alias below; unused as-is
    from graders.grader import grade_episode
    results = []
    # Preserve the live episode's logic so validation does not clobber it.
    old_logic = _env._logic
    for task_id in VALID_TASKS:
        try:
            import asyncio  # NOTE(review): imported but not used in this loop
            from env import DevOpsIncidentEnv as LogicClass
            env_logic = LogicClass(task_id=task_id, seed=42)
            env_logic.reset()
            done = False
            steps = 0
            import random as _random
            # Random-agent rollout, capped at 30 steps.
            while not done and steps < 30:
                action = Action(action_type=_random.choice(list(ActionType)))
                result = env_logic.step(action)
                done = result.done
                steps += 1
            s = env_logic.state()
            score = grade_episode(
                task_id,
                s.action_history,
                s.ground_truth_root_cause,
                s.ground_truth_fix,
                s.incident_resolved,
                s.total_reward,
            )
            results.append({
                "task_id": task_id,
                "score": round(float(score), 4),
                "in_range": 0.0 <= score <= 1.0,
                "resolved": s.incident_resolved,
                "steps": steps,
                "status": "ok",
            })
        except Exception as e:
            # Record the failure rather than aborting the whole validation run.
            results.append({"task_id": task_id, "status": "error", "error": str(e)})
    _env._logic = old_logic
    passed_count = sum(1 for r in results if r.get("status") == "ok" and r.get("in_range"))
    total_count = len(results)
    all_ok = passed_count == total_count
    details = {}
    for r in results:
        details[r["task_id"]] = {
            "status": "passed" if r.get("status") == "ok" and r.get("in_range") else r.get("status", "failed"),
            "score": r.get("score"),
            "resolved": r.get("resolved")
        }
    return {
        "validation": "passed" if all_ok else "failed",
        "summary": f"{passed_count}/{total_count} tasks passed validation",
        "tasks_checked": total_count,
        "tasks_passed": passed_count,
        "details": details,
        "environment": "devops-incident-response",
        "version": "2.0.0",
        "note": "Generated task excluded — procedural tasks require fixed parameters"
    }


@app.get("/metrics")
def get_metrics():
    """Aggregate statistics across all completed episodes (in-memory).

    Statistics reset when the server restarts.

    Returns:
        {"total_episodes": N, "overall_avg_score": float,
         "by_task": {task_id: {count, avg_score, max_score, min_score,
                               resolution_rate, avg_steps_to_diagnosis,
                               avg_info_gathering_ratio}},
         "last_updated": ISO timestamp}
    """
    total_episodes = len(episode_history)
    by_task = {}
    total_score = 0.0
    # Early-out keeps the division below safe.
    if total_episodes == 0:
        return {
            "total_episodes": 0,
            "by_task": {},
            "overall_avg_score": 0.0,
            "last_updated": datetime.utcnow().isoformat() + "Z"
        }
    # First pass: bucket raw values per task.
    for rec in episode_history:
        tid = rec["task_id"]
        if tid not in by_task:
            by_task[tid] = {"scores": [], "resolved": 0, "steps_to_diag": [], "info_ratios": []}
        by_task[tid]["scores"].append(rec["final_score"])
        if rec["incident_resolved"]:
            by_task[tid]["resolved"] += 1
        # steps_to_diagnosis is None when the agent never diagnosed.
        if rec["steps_to_diagnosis"] is not None:
            by_task[tid]["steps_to_diag"].append(rec["steps_to_diagnosis"])
        by_task[tid]["info_ratios"].append(rec["info_gathering_ratio"])
        total_score += rec["final_score"]
    # Second pass: reduce buckets to summary statistics.
    out_by_task = {}
    for tid, agg in by_task.items():
        cnt = len(agg["scores"])
        out_by_task[tid] = {
            "count": cnt,
            "avg_score": round(sum(agg["scores"]) / cnt, 3),
            "max_score": round(max(agg["scores"]), 3),
            "min_score": round(min(agg["scores"]), 3),
            "resolution_rate": round(agg["resolved"] / cnt, 3),
            "avg_steps_to_diagnosis": round(sum(agg["steps_to_diag"]) / len(agg["steps_to_diag"]), 1) if agg["steps_to_diag"] else None,
            "avg_info_gathering_ratio": round(sum(agg["info_ratios"]) / len(agg["info_ratios"]), 2) if agg["info_ratios"] else 0.0
        }
    return {
        "total_episodes": total_episodes,
        "by_task": out_by_task,
        "overall_avg_score": round(total_score / total_episodes, 3),
        "last_updated": datetime.utcnow().isoformat() + "Z"
    }


@app.get("/leaderboard")
def get_leaderboard():
    """Top-10 episodes ranked by score (ties broken by fewer steps).

    Returns:
        {"leaderboard": [{"rank", "task_id", "score", "steps", "timestamp"}, ...]}
    """
    # Negating steps_taken inside the key makes "fewer steps" win under reverse=True.
    sorted_eps = sorted(episode_history, key=lambda x: (x["final_score"], -x["steps_taken"]), reverse=True)
    top_10 = []
    for i, rec in enumerate(sorted_eps[:10]):
        top_10.append({
            "rank": i + 1,
            "task_id": rec["task_id"],
            "score": rec["final_score"],
            "steps": rec["steps_taken"],
            "timestamp": rec["timestamp"]
        })
    return {"leaderboard": top_10}


@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Websocket driver: accepts {"command": "reset"|"step"|"state", ...} messages.

    Each connection gets its own DevOpsEnvironment, so websocket clients do
    not interfere with the shared HTTP environment. Responses are JSON objects
    tagged with "type": "observation" | "step_result" | "state" | "error".
    """
    await websocket.accept()
    # Independent environment instance for this connection
    ws_env = DevOpsEnvironment()
    try:
        while True:
            data = await websocket.receive_json()
            command = data.get("command")
            print(f"WebSocket received: {data}")
            if command == "reset":
                task_id = data.get("task_id", "easy")
                seed = data.get("seed")
                obs = await ws_env.reset(seed=seed, task_id=task_id)
                # model_dump is pydantic v2; .dict() is the v1 fallback.
                await websocket.send_json({
                    "type": "observation",
                    "data": obs.model_dump() if hasattr(obs, "model_dump") else obs.dict()
                })
            elif command == "step":
                if ws_env._logic is None:
                    await websocket.send_json({
                        "type": "error",
                        "message": "Call reset before step"
                    })
                    continue
                action_data = data.get("action", {})
                try:
                    action = Action(**action_data)
                    step_result = await ws_env.step(action)
                    if step_result.done:
                        track_episode(ws_env.state)
                    await websocket.send_json({
                        "type": "step_result",
                        "data": {
                            "observation": step_result.observation.model_dump() if hasattr(step_result.observation, "model_dump") else step_result.observation.dict(),
                            "reward": step_result.reward,
                            "done": step_result.done,
                            "info": step_result.info
                        }
                    })
                except Exception as e:
                    # Invalid action payloads (or step failures) are reported,
                    # not fatal to the connection.
                    await websocket.send_json({
                        "type": "error",
                        "message": str(e)
                    })
            elif command == "state":
                if ws_env._logic is None:
                    await websocket.send_json({
                        "type": "error",
                        "message": "Call reset before state"
                    })
                    continue
                state = ws_env.state
                await websocket.send_json({
                    "type": "state",
                    "data": state.model_dump() if hasattr(state, "model_dump") else state.dict()
                })
            else:
                await websocket.send_json({
                    "type": "error",
                    "message": f"Unrecognized command: {command}"
                })
    except WebSocketDisconnect:
        print("WebSocket client disconnected")
    except Exception as e:
        print(f"WebSocket error: {e}")
        try:
            await websocket.send_json({
                "type": "error",
                "message": str(e)
            })
        except:  # NOTE(review): bare except — narrow to Exception; socket may already be closed
            pass
        await websocket.close()


# ─── Multi-Agent Routes ────────────────────────────────────────────────────────
@app.post("/multi-agent/reset")
def multi_agent_reset(body: MultiAgentResetRequest):
    """Start a new dual-agent session with split observability.

    Creates two views of the same incident:
      - Agent A (Observer): sees logs and active alerts only
      - Agent B (Responder): sees metrics and service dependencies only

    Returns:
        session_id, agent roles, step instructions, and initial observations
        for both agents.
    """
    session = DualAgentSession(task_id=body.task_id, seed=body.seed)
    multi_agent_sessions[session.session_id] = session
    return {
        "session_id": session.session_id,
        "task_id": body.task_id,
        "seed": body.seed,
        "agent_a_role": "observer — sees logs and alerts only",
        "agent_b_role": "responder — sees metrics and dependencies only",
        "instructions": {
            "agent_a": "POST /multi-agent/step/a/{session_id} body: {\"finding\": \"your observation\"}",
            "agent_b": "POST /multi-agent/step/b/{session_id} body: Action JSON (same schema as POST /step)",
        },
        "observation_a": session.get_observation_a(),
        "observation_b": session.get_observation_b(),
    }


@app.post("/multi-agent/step/a/{session_id}")
def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
    """Agent A (Observer) shares a finding with Agent B.

    Findings are appended to the shared findings log that Agent B sees when
    deciding its next action.

    Raises:
        HTTPException(404): if the session does not exist.
    """
    session = multi_agent_sessions.get(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    return session.step_a(body.finding)


@app.post("/multi-agent/step/b/{session_id}")
def multi_agent_step_b(session_id: str, body: Action):
    """Agent B (Responder) takes an action in the environment.

    Action schema is identical to POST /step.

    Raises:
        HTTPException(404): if the session does not exist.
    """
    session = multi_agent_sessions.get(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    return session.step_b(body)


@app.get("/multi-agent/state/{session_id}")
def multi_agent_state(session_id: str):
    """Full state for a dual-agent session including both agent perspectives.

    Raises:
        HTTPException(404): if the session does not exist.
    """
    session = multi_agent_sessions.get(session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    return session.get_state()


@app.get("/multi-agent/sessions")
def list_multi_agent_sessions():
    """List all active dual-agent sessions with id, task, step, done, findings count."""
    return [
        {
            "session_id": s.session_id,
            "task_id": s.task_id,
            "step": s.step_count,
            "done": s.done,
            "findings_count": len(s.findings_log),
        }
        for s in multi_agent_sessions.values()
    ]


# ─── Curriculum Routes ─────────────────────────────────────────────────────────
@app.get("/curriculum/status")
def get_curriculum_status():
    """Agent mastery levels across all tasks.

    Returns:
        {"tasks": {task_id: {"rolling_avg", "mastery_level", "scaffold_needed",
                             "hint"}}, "recommended_task": str}
    """
    return curriculum_engine.get_status()


@app.get("/curriculum/next")
def get_next_curriculum_task():
    """Recommended next task for adaptive training.

    Training loops should call this between episodes to implement curriculum
    learning automatically.

    Returns:
        {"recommended_task": str, "reasoning": str}
    """
    return {
        "recommended_task": curriculum_engine.get_next_curriculum_task(),
        "reasoning": "Lowest rolling average among non-mastered tasks.",
    }


@app.post("/curriculum/record")
def record_curriculum_episode(req: CurriculumRecordRequest):
    """Record an episode result to update the curriculum engine.

    Args:
        req: task_id that was just run and its episode score.

    Returns:
        {"recorded": true, "new_status": {...}} — updated task status.

    Raises:
        HTTPException(400): if the curriculum engine rejects the task_id.
    """
    try:
        curriculum_engine.record_episode(req.task_id, req.score)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return {
        "recorded": True,
        "new_status": curriculum_engine.get_status()["tasks"][req.task_id],
    }


@app.get("/curriculum/hint/{task_id}")
def get_curriculum_hint(task_id: str):
    """Get a diagnostic hint and scaffold flag for a specific task.

    Returns:
        {"task_id", "hint", "scaffold_needed": bool, "mastery_level": 0–3}

    Raises:
        HTTPException(400): if the curriculum engine rejects the task_id.
    """
    try:
        return {
            "task_id": task_id,
            "hint": curriculum_engine.get_hint(task_id),
            "scaffold_needed": curriculum_engine.should_scaffold(task_id),
            "mastery_level": curriculum_engine.get_mastery(task_id),
        }
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))


# ─── Feature 1: Episode Replay System ────────────────────────────────────────
@app.get("/replays")
def list_replays():
    """List available episode replays, newest first."""
    items = []
    for ep_id, r in replay_store.items():
        items.append({
            "episode_id": r["episode_id"],
            "task_id": r["task_id"],
            "score": r["final_score"],
            "resolved": r["resolved"],
            "total_steps": r["total_steps"],
            "timestamp": r["timestamp"],
        })
    # Episode ids are stringified ints; sort numerically, newest first.
    items.sort(key=lambda x: int(x["episode_id"]), reverse=True)
    return items


@app.get("/replay/{episode_id}")
def get_replay(episode_id: str):
    """Return full replay data for an episode.

    Raises:
        HTTPException(404): if the episode is not in the replay store.
    """
    if episode_id not in replay_store:
        raise HTTPException(status_code=404, detail="Episode not found")
    return replay_store[episode_id]


@app.get("/replay/{episode_id}/html", response_class=HTMLResponse)
def get_replay_html(episode_id: str):
    """Return an HTML timeline visualization of an episode replay.

    NOTE(review): the HTML markup in the templates below was lost in
    extraction; only text content and placeholders survive.

    Raises:
        HTTPException(404): if the episode is not in the replay store.
    """
    if episode_id not in replay_store:
        raise HTTPException(status_code=404, detail="Episode not found")
    r = replay_store[episode_id]

    def reward_color(reward: float) -> str:
        # Green for meaningful positive reward, amber for small positive,
        # red otherwise.
        if reward > 0.2:
            return "#00ff88"
        if reward > 0:
            return "#ffaa00"
        if reward == 0:
            return "#ff3355"
        return "#ff3355"

    def reward_bg(reward: float) -> str:
        # Translucent background matching reward_color's buckets.
        if reward > 0.2:
            return "rgba(0,255,136,0.08)"
        if reward > 0:
            return "rgba(255,170,0,0.08)"
        return "rgba(255,51,85,0.08)"

    steps_html = ""
    running_score = 0.0
    for s in r["steps"]:
        running_score += s["reward"]
        act = s["action"]
        act_type = act.get("action_type", "unknown")
        service = act.get("service") or ""
        rc = act.get("root_cause") or ""
        obs = s["observation_summary"]
        col = reward_color(s["reward"])
        bg = reward_bg(s["reward"])
        reward_sign = "+" if s["reward"] > 0 else ""
        failing = ", ".join(obs["failing_services"]) if obs["failing_services"] else "none"
        # The diagnose line only renders the root cause when one was supplied.
        steps_html += f"""
STEP {s['step'] + 1} {act_type} {service} {reward_sign}{s['reward']:.3f} ∑ {running_score:.3f}
{"→ " + rc if rc and act_type == "diagnose" else ""}
failing: {failing}  |  alerts: {obs['alert_count']}  |  evidence: {obs['evidence_count']}
"""
    resolved_color = "#00ff88" if r["resolved"] else "#ff3355"
    resolved_text = "INCIDENT RESOLVED" if r["resolved"] else "INCIDENT ESCALATED"
    resolved_icon = "✓" if r["resolved"] else "✗"
    score_col = "#00ff88" if r["final_score"] >= 0.7 else "#ffaa00" if r["final_score"] >= 0.4 else "#ff3355"
    html = f""" ARIA Replay #{episode_id}
▣ Episode Replay #{episode_id}
TASK {r['task_id'].upper()}
SEED {r['seed']}
SCORE {r['final_score']:.3f}
STEPS {r['total_steps']}
TIME {r['timestamp'][:19]}

Step Timeline

{steps_html if steps_html else 'No steps recorded.'}
{resolved_icon} {resolved_text}  |  Final Score: {r['final_score']:.3f}
"""
    return HTMLResponse(html)


@app.get("/replays/html", response_class=HTMLResponse)
def list_replays_html():
    """HTML index of all replays."""
    items = sorted(replay_store.values(), key=lambda x: int(x["episode_id"]), reverse=True)
    rows = ""
    for r in items:
        score_col = "#00ff88" if r["final_score"] >= 0.7 else "#ffaa00" if r["final_score"] >= 0.4 else "#ff3355"
        resolved_icon = "✓" if r["resolved"] else "✗"
        rows += f""" #{r['episode_id']} {r['task_id'].upper()} {r['final_score']:.3f} {resolved_icon} {r['total_steps']} {r['timestamp'][:19]} """
    if not rows:
        rows = 'No replays yet. Complete an episode to generate one.'
    html = f""" ARIA Episode Replays

▣ Episode Replays

{rows}
IDTaskScoreResolvedStepsTime
"""
    return HTMLResponse(html)


# ─── Feature 2: Human vs Agent Challenge ─────────────────────────────────────
@app.get("/challenge", response_class=HTMLResponse)
async def challenge_page():
    """Human-playable incident response challenge page (static HTML).

    NOTE(review): markup lost in extraction; only text content survives.
    """
    html = """ ARIA Human Challenge
⚠ Note: This resets the shared environment — agent runs will be interrupted while you play
► Infrastructure Status
ServiceStatusCPU%Mem%Err/sP99ms
Loading...
► Active Alerts
No alerts
► Evidence Gathered
No evidence yet
Episode Progress
0.000
Step 0 / 15
EASY
► Take Action
► Reward History
"""
    return HTMLResponse(html)


# ─── Feature 3: Progress Visualization ───────────────────────────────────────
@app.get("/progress", response_class=HTMLResponse)
def progress_page():
    """Training progress and live performance visualization.

    Combines live /metrics aggregates with hard-coded training-run results
    and random-agent baselines.
    """
    metrics = get_metrics()
    by_task = metrics.get("by_task", {})
    # Random-agent baseline scores per task (measured offline).
    BASELINES = {
        "easy": 0.05,
        "medium": 0.03,
        "hard": 0.01,
        "bonus": 0.01,
        "security": 0.01,
        "database": 0.01,
        "failover": 0.01,
    }
    # (task, base-model score, fine-tuned score) from the training run.
    TRAINING_RESULTS = [
        ("easy", 0.320, 0.685),
        ("medium", 0.050, 0.378),
        ("hard", 0.190, 0.869),
        ("bonus", 0.152, 0.682),
    ]
    ALL_TASKS = ["easy", "medium", "hard", "bonus", "security", "database", "failover"]

    def bar_row(task: str, score: float, baseline: float) -> str:
        # One bar-chart row comparing live avg score to the random baseline.
        pct = round(score * 100, 1)
        bpct = round(baseline * 100, 1)
        col = "#00ff88" if score >= 0.7 else "#ffaa00" if score >= 0.4 else "#ff3355"
        return f"""
{task.upper()}
RANDOM
{baseline:.3f}
ARIA
{score:.3f}
"""

    bars = "".join(
        bar_row(task, by_task.get(task, {}).get("avg_score", 0.0), BASELINES.get(task, 0.01))
        for task in ALL_TASKS
    )
    training_rows = "".join(
        f""" {task.upper()} {base:.3f} {finetuned:.3f} +{finetuned - base:.3f} ✓ """
        for task, base, finetuned in TRAINING_RESULTS
    )
    recent = list(episode_history)[-20:] if episode_history else []
    # Serialized score list consumed by the page's chart script.
    recent_scores_js = str([round(r["final_score"], 3) for r in recent])
    total_eps = metrics.get("total_episodes", 0)
    overall = metrics.get("overall_avg_score", 0.0)
    overall_col = "#00ff88" if overall >= 0.7 else "#ffaa00" if overall >= 0.4 else "#4d9fff"
    html = f""" ARIA Progress

Training Progress & Live Performance

Llama-3.1-8B fine-tuned with GRPO on 7 task types · 14 actions · Dense reward shaping
Live Episodes
{total_eps}
Avg Live Score
{overall:.3f}
Training Episodes
160
Model
8B GRPO

Section 1 — Live Task Performance vs Random Baseline

{bars}

Section 2 — Training Improvement (Llama-3.1-8B, GRPO)

{training_rows}
TaskBaselineFine-tunedImprovement

Section 3 — Live Episode Score Timeline

Last 20 episodes · updates every 30s
"""
    return HTMLResponse(html)