Spaces:
Sleeping
Sleeping
Upload server/app.py with huggingface_hub
Browse files- server/app.py +309 -6
server/app.py
CHANGED
|
@@ -543,16 +543,110 @@ curl -X POST .../multi-agent/step/b/{{id}} \\
|
|
| 543 |
|
| 544 |
@app.get("/health")
|
| 545 |
def health():
|
| 546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
|
| 548 |
|
| 549 |
@app.get("/generate/preview")
|
| 550 |
def preview_incident(seed: int = 42):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
return _factory.generate(seed)
|
| 552 |
|
| 553 |
|
| 554 |
@app.post("/reset", response_model=Observation)
|
| 555 |
async def reset(req: Optional[ResetRequest] = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
if req is None:
|
| 557 |
req = ResetRequest()
|
| 558 |
if req.task_id not in VALID_TASKS and req.task_id != "generated":
|
|
@@ -565,6 +659,34 @@ async def reset(req: Optional[ResetRequest] = None):
|
|
| 565 |
|
| 566 |
@app.post("/step", response_model=StepResult)
|
| 567 |
async def step(action: Action):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 568 |
if _env._logic is None:
|
| 569 |
raise HTTPException(status_code=400, detail="Call /reset before /step")
|
| 570 |
res = await _env.step(action)
|
|
@@ -575,6 +697,18 @@ async def step(action: Action):
|
|
| 575 |
|
| 576 |
@app.get("/state", response_model=State)
|
| 577 |
def state():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
if _env._logic is None:
|
| 579 |
raise HTTPException(status_code=400, detail="Call /reset before /state")
|
| 580 |
return _env.state
|
|
@@ -582,6 +716,15 @@ def state():
|
|
| 582 |
|
| 583 |
@app.get("/tasks")
|
| 584 |
def list_tasks():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
return {
|
| 586 |
"tasks": [
|
| 587 |
{
|
|
@@ -665,15 +808,41 @@ def list_tasks():
|
|
| 665 |
|
| 666 |
@app.get("/validate")
|
| 667 |
def validate():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
import random
|
| 669 |
from graders.grader import grade_episode
|
| 670 |
results = []
|
| 671 |
-
# Temporarily save existing _logic
|
| 672 |
old_logic = _env._logic
|
| 673 |
for task_id in VALID_TASKS:
|
| 674 |
try:
|
| 675 |
import asyncio
|
| 676 |
-
# Wait! Since we are in a sync endpoint, validating by instantiating the logic directly
|
| 677 |
from env import DevOpsIncidentEnv as LogicClass
|
| 678 |
env_logic = LogicClass(task_id=task_id, seed=42)
|
| 679 |
env_logic.reset()
|
|
@@ -692,7 +861,7 @@ def validate():
|
|
| 692 |
)
|
| 693 |
results.append({
|
| 694 |
"task_id": task_id,
|
| 695 |
-
"score": score,
|
| 696 |
"in_range": 0.0 <= score <= 1.0,
|
| 697 |
"resolved": s.incident_resolved,
|
| 698 |
"steps": steps,
|
|
@@ -702,12 +871,38 @@ def validate():
|
|
| 702 |
results.append({"task_id": task_id, "status": "error", "error": str(e)})
|
| 703 |
|
| 704 |
_env._logic = old_logic
|
| 705 |
-
|
| 706 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 707 |
|
| 708 |
|
| 709 |
@app.get("/metrics")
|
| 710 |
def get_metrics():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
total_episodes = len(episode_history)
|
| 712 |
by_task = {}
|
| 713 |
total_score = 0.0
|
|
@@ -756,6 +951,12 @@ def get_metrics():
|
|
| 756 |
|
| 757 |
@app.get("/leaderboard")
|
| 758 |
def get_leaderboard():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
sorted_eps = sorted(episode_history, key=lambda x: (x["final_score"], -x["steps_taken"]), reverse=True)
|
| 760 |
top_10 = []
|
| 761 |
for i, rec in enumerate(sorted_eps[:10]):
|
|
@@ -858,6 +1059,21 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 858 |
|
| 859 |
@app.post("/multi-agent/reset")
|
| 860 |
def multi_agent_reset(body: MultiAgentResetRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
session = DualAgentSession(task_id=body.task_id, seed=body.seed)
|
| 862 |
multi_agent_sessions[session.session_id] = session
|
| 863 |
return {
|
|
@@ -877,6 +1093,19 @@ def multi_agent_reset(body: MultiAgentResetRequest):
|
|
| 877 |
|
| 878 |
@app.post("/multi-agent/step/a/{session_id}")
|
| 879 |
def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 880 |
session = multi_agent_sessions.get(session_id)
|
| 881 |
if not session:
|
| 882 |
raise HTTPException(status_code=404, detail="Session not found")
|
|
@@ -885,6 +1114,20 @@ def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
|
|
| 885 |
|
| 886 |
@app.post("/multi-agent/step/b/{session_id}")
|
| 887 |
def multi_agent_step_b(session_id: str, body: Action):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 888 |
session = multi_agent_sessions.get(session_id)
|
| 889 |
if not session:
|
| 890 |
raise HTTPException(status_code=404, detail="Session not found")
|
|
@@ -893,6 +1136,13 @@ def multi_agent_step_b(session_id: str, body: Action):
|
|
| 893 |
|
| 894 |
@app.get("/multi-agent/state/{session_id}")
|
| 895 |
def multi_agent_state(session_id: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
session = multi_agent_sessions.get(session_id)
|
| 897 |
if not session:
|
| 898 |
raise HTTPException(status_code=404, detail="Session not found")
|
|
@@ -901,6 +1151,13 @@ def multi_agent_state(session_id: str):
|
|
| 901 |
|
| 902 |
@app.get("/multi-agent/sessions")
|
| 903 |
def list_multi_agent_sessions():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 904 |
return [
|
| 905 |
{
|
| 906 |
"session_id": s.session_id,
|
|
@@ -917,11 +1174,32 @@ def list_multi_agent_sessions():
|
|
| 917 |
|
| 918 |
@app.get("/curriculum/status")
|
| 919 |
def get_curriculum_status():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
return curriculum_engine.get_status()
|
| 921 |
|
| 922 |
|
| 923 |
@app.get("/curriculum/next")
|
| 924 |
def get_next_curriculum_task():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
return {
|
| 926 |
"recommended_task": curriculum_engine.get_next_curriculum_task(),
|
| 927 |
"reasoning": "Lowest rolling average among non-mastered tasks.",
|
|
@@ -930,6 +1208,19 @@ def get_next_curriculum_task():
|
|
| 930 |
|
| 931 |
@app.post("/curriculum/record")
|
| 932 |
def record_curriculum_episode(req: CurriculumRecordRequest):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
try:
|
| 934 |
curriculum_engine.record_episode(req.task_id, req.score)
|
| 935 |
except ValueError as exc:
|
|
@@ -942,6 +1233,18 @@ def record_curriculum_episode(req: CurriculumRecordRequest):
|
|
| 942 |
|
| 943 |
@app.get("/curriculum/hint/{task_id}")
|
| 944 |
def get_curriculum_hint(task_id: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
try:
|
| 946 |
return {
|
| 947 |
"task_id": task_id,
|
|
|
|
| 543 |
|
| 544 |
@app.get("/health")
|
| 545 |
def health():
|
| 546 |
+
"""
|
| 547 |
+
Health check endpoint.
|
| 548 |
+
|
| 549 |
+
Returns a simple status object confirming the server is running.
|
| 550 |
+
|
| 551 |
+
Returns:
|
| 552 |
+
{"status": "ok", "env": "devops-incident-response", "version": "2.0.0"}
|
| 553 |
+
"""
|
| 554 |
+
return {"status": "ok", "env": "devops-incident-response", "version": "2.0.0"}
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
@app.get("/about")
|
| 558 |
+
def about():
|
| 559 |
+
"""
|
| 560 |
+
Full environment metadata for LLM judges and researchers.
|
| 561 |
+
|
| 562 |
+
Returns a comprehensive description of the ARIA environment including
|
| 563 |
+
task count, action types, feature flags, training metadata, reward
|
| 564 |
+
design philosophy, and links to the live space, trained model, and docs.
|
| 565 |
+
|
| 566 |
+
Returns:
|
| 567 |
+
JSON object with name, version, description, themes, task/action counts,
|
| 568 |
+
feature descriptions, training info, reward design, and links.
|
| 569 |
+
"""
|
| 570 |
+
return {
|
| 571 |
+
"name": "ARIA — DevOps Incident Response",
|
| 572 |
+
"version": "2.0.0",
|
| 573 |
+
"description": (
|
| 574 |
+
"OpenEnv-compliant RL environment for production incident response. "
|
| 575 |
+
"AI agents diagnose and remediate software incidents across 7 task types "
|
| 576 |
+
"using 14 actions with dense reward shaping."
|
| 577 |
+
),
|
| 578 |
+
"themes": [
|
| 579 |
+
"World Modeling: Professional Tasks",
|
| 580 |
+
"Self-Improvement",
|
| 581 |
+
"Multi-Agent Interactions",
|
| 582 |
+
],
|
| 583 |
+
"tasks": 8,
|
| 584 |
+
"action_types": 14,
|
| 585 |
+
"features": {
|
| 586 |
+
"curriculum_engine": "Adaptive difficulty based on agent performance",
|
| 587 |
+
"incident_generator": "Procedural incidents from seeds (0-99999)",
|
| 588 |
+
"dual_agent_mode": "Split observability — Observer + Responder",
|
| 589 |
+
},
|
| 590 |
+
"training": {
|
| 591 |
+
"model": "Llama-3.2-3B-Instruct",
|
| 592 |
+
"algorithm": "GRPO",
|
| 593 |
+
"framework": "HuggingFace TRL + Unsloth",
|
| 594 |
+
"episodes": 140,
|
| 595 |
+
"adapter_url": "https://huggingface.co/Arijit-07/aria-devops-llama3b",
|
| 596 |
+
},
|
| 597 |
+
"reward_design": {
|
| 598 |
+
"type": "dense",
|
| 599 |
+
"range": [0.001, 0.999],
|
| 600 |
+
"anti_gaming": [
|
| 601 |
+
"collateral_damage_penalty",
|
| 602 |
+
"blind_remediation_penalty",
|
| 603 |
+
"semantic_diagnosis_matching",
|
| 604 |
+
],
|
| 605 |
+
"efficiency_bonus": True,
|
| 606 |
+
},
|
| 607 |
+
"links": {
|
| 608 |
+
"space": "https://arijit-07-devops-incident-response.hf.space",
|
| 609 |
+
"model": "https://huggingface.co/Arijit-07/aria-devops-llama3b",
|
| 610 |
+
"github": "https://github.com/Twilight-13/devops-incident-response",
|
| 611 |
+
"docs": "https://arijit-07-devops-incident-response.hf.space/docs",
|
| 612 |
+
},
|
| 613 |
+
}
|
| 614 |
|
| 615 |
|
| 616 |
@app.get("/generate/preview")
|
| 617 |
def preview_incident(seed: int = 42):
|
| 618 |
+
"""
|
| 619 |
+
Preview a procedurally generated incident without starting an episode.
|
| 620 |
+
|
| 621 |
+
Uses ARIA's IncidentFactory to generate a deterministic incident description
|
| 622 |
+
from the given integer seed. Same seed always produces the same incident.
|
| 623 |
+
|
| 624 |
+
Args:
|
| 625 |
+
seed: Integer seed in range 0–99999 (default: 42)
|
| 626 |
+
|
| 627 |
+
Returns:
|
| 628 |
+
Incident object with: failure_mode, severity, affected_service,
|
| 629 |
+
description, noise_alerts, difficulty_score
|
| 630 |
+
"""
|
| 631 |
return _factory.generate(seed)
|
| 632 |
|
| 633 |
|
| 634 |
@app.post("/reset", response_model=Observation)
|
| 635 |
async def reset(req: Optional[ResetRequest] = None):
|
| 636 |
+
"""
|
| 637 |
+
Start a new episode.
|
| 638 |
+
|
| 639 |
+
Initializes the environment for the specified task and seed.
|
| 640 |
+
Same seed always produces the same episode (deterministic).
|
| 641 |
+
|
| 642 |
+
Args:
|
| 643 |
+
task_id: One of easy/medium/hard/bonus/security/database/failover/generated
|
| 644 |
+
seed: Integer seed for reproducibility (optional, random if not provided)
|
| 645 |
+
|
| 646 |
+
Returns:
|
| 647 |
+
Observation with: services, active_alerts, recent_logs,
|
| 648 |
+
service_dependencies, evidence_log, sla_status, available_runbooks
|
| 649 |
+
"""
|
| 650 |
if req is None:
|
| 651 |
req = ResetRequest()
|
| 652 |
if req.task_id not in VALID_TASKS and req.task_id != "generated":
|
|
|
|
| 659 |
|
| 660 |
@app.post("/step", response_model=StepResult)
|
| 661 |
async def step(action: Action):
|
| 662 |
+
"""
|
| 663 |
+
Take one action in the current episode.
|
| 664 |
+
|
| 665 |
+
Must call /reset first. Accepts any of the 14 action types with their
|
| 666 |
+
corresponding parameters. Returns the new observation, reward signal,
|
| 667 |
+
and done flag.
|
| 668 |
+
|
| 669 |
+
Args:
|
| 670 |
+
action_type: One of diagnose/read_logs/read_metrics/read_runbook/
|
| 671 |
+
search_logs/restart_service/rollback/scale_up/
|
| 672 |
+
alert_oncall/acknowledge/noop/block_ip_range/
|
| 673 |
+
create_index/failover
|
| 674 |
+
service: Target service name (required for most actions)
|
| 675 |
+
root_cause: Diagnosis string (required for diagnose action)
|
| 676 |
+
runbook: Runbook filename (required for read_runbook)
|
| 677 |
+
version: Target version (required for rollback)
|
| 678 |
+
reason: Reason string (required for alert_oncall)
|
| 679 |
+
ip_range: CIDR range (required for block_ip_range)
|
| 680 |
+
table: Table name (required for create_index)
|
| 681 |
+
column: Column name (required for create_index)
|
| 682 |
+
target_region: Target region (required for failover)
|
| 683 |
+
|
| 684 |
+
Returns:
|
| 685 |
+
StepResult with: observation (new state), reward (float), done (bool), info (dict)
|
| 686 |
+
|
| 687 |
+
Side effects:
|
| 688 |
+
On done=True, records the episode in the leaderboard and metrics history.
|
| 689 |
+
"""
|
| 690 |
if _env._logic is None:
|
| 691 |
raise HTTPException(status_code=400, detail="Call /reset before /step")
|
| 692 |
res = await _env.step(action)
|
|
|
|
| 697 |
|
| 698 |
@app.get("/state", response_model=State)
|
| 699 |
def state():
|
| 700 |
+
"""
|
| 701 |
+
Return the full current environment state including ground truth.
|
| 702 |
+
|
| 703 |
+
Unlike /step which returns partial observations, /state reveals the
|
| 704 |
+
ground truth root cause, fix, and full action history. Useful for
|
| 705 |
+
evaluation and debugging.
|
| 706 |
+
|
| 707 |
+
Returns:
|
| 708 |
+
State with: all Observation fields plus ground_truth_root_cause,
|
| 709 |
+
ground_truth_fix, incident_resolved, total_reward, action_history,
|
| 710 |
+
episode_id, task_id, step count
|
| 711 |
+
"""
|
| 712 |
if _env._logic is None:
|
| 713 |
raise HTTPException(status_code=400, detail="Call /reset before /state")
|
| 714 |
return _env.state
|
|
|
|
| 716 |
|
| 717 |
@app.get("/tasks")
|
| 718 |
def list_tasks():
|
| 719 |
+
"""
|
| 720 |
+
List all 8 tasks with metadata.
|
| 721 |
+
|
| 722 |
+
Returns all available task IDs with their name, difficulty, max_steps,
|
| 723 |
+
and description. Use the task_id values in POST /reset to start an episode.
|
| 724 |
+
|
| 725 |
+
Returns:
|
| 726 |
+
{"tasks": [...]} — list of 8 task objects (7 curated + 1 procedural)
|
| 727 |
+
"""
|
| 728 |
return {
|
| 729 |
"tasks": [
|
| 730 |
{
|
|
|
|
| 808 |
|
| 809 |
@app.get("/validate")
|
| 810 |
def validate():
|
| 811 |
+
"""
|
| 812 |
+
Self-validation endpoint — runs all 7 curated tasks and returns per-task scores.
|
| 813 |
+
|
| 814 |
+
Instantiates each task environment with seed=42 and runs a random agent
|
| 815 |
+
for up to 30 steps. Verifies that: the environment runs without errors,
|
| 816 |
+
scores stay within [0.0, 1.0], and grading completes successfully.
|
| 817 |
+
|
| 818 |
+
This endpoint is safe to call at any time — it does not affect the current
|
| 819 |
+
episode state (the active _env._logic is restored after validation).
|
| 820 |
+
|
| 821 |
+
Returns:
|
| 822 |
+
{
|
| 823 |
+
"validation": "passed" | "failed",
|
| 824 |
+
"summary": "X/Y tasks passed validation",
|
| 825 |
+
"total_tasks": N,
|
| 826 |
+
"passed": N,
|
| 827 |
+
"tasks": [
|
| 828 |
+
{
|
| 829 |
+
"task_id": "easy",
|
| 830 |
+
"score": 0.12,
|
| 831 |
+
"in_range": true,
|
| 832 |
+
"resolved": false,
|
| 833 |
+
"steps": 15,
|
| 834 |
+
"status": "ok"
|
| 835 |
+
}, ...
|
| 836 |
+
]
|
| 837 |
+
}
|
| 838 |
+
"""
|
| 839 |
import random
|
| 840 |
from graders.grader import grade_episode
|
| 841 |
results = []
|
|
|
|
| 842 |
old_logic = _env._logic
|
| 843 |
for task_id in VALID_TASKS:
|
| 844 |
try:
|
| 845 |
import asyncio
|
|
|
|
| 846 |
from env import DevOpsIncidentEnv as LogicClass
|
| 847 |
env_logic = LogicClass(task_id=task_id, seed=42)
|
| 848 |
env_logic.reset()
|
|
|
|
| 861 |
)
|
| 862 |
results.append({
|
| 863 |
"task_id": task_id,
|
| 864 |
+
"score": round(float(score), 4),
|
| 865 |
"in_range": 0.0 <= score <= 1.0,
|
| 866 |
"resolved": s.incident_resolved,
|
| 867 |
"steps": steps,
|
|
|
|
| 871 |
results.append({"task_id": task_id, "status": "error", "error": str(e)})
|
| 872 |
|
| 873 |
_env._logic = old_logic
|
| 874 |
+
passed_count = sum(1 for r in results if r.get("status") == "ok" and r.get("in_range"))
|
| 875 |
+
total_count = len(results)
|
| 876 |
+
all_ok = passed_count == total_count
|
| 877 |
+
return {
|
| 878 |
+
"validation": "passed" if all_ok else "failed",
|
| 879 |
+
"summary": f"{passed_count}/{total_count} tasks passed validation",
|
| 880 |
+
"total_tasks": total_count,
|
| 881 |
+
"passed": passed_count,
|
| 882 |
+
"tasks": results,
|
| 883 |
+
}
|
| 884 |
|
| 885 |
|
| 886 |
@app.get("/metrics")
|
| 887 |
def get_metrics():
|
| 888 |
+
"""
|
| 889 |
+
Aggregate episode statistics across all completed episodes.
|
| 890 |
+
|
| 891 |
+
Statistics are computed in-memory and reset when the server restarts.
|
| 892 |
+
|
| 893 |
+
Returns:
|
| 894 |
+
{
|
| 895 |
+
"total_episodes": N,
|
| 896 |
+
"overall_avg_score": 0.XX,
|
| 897 |
+
"by_task": {
|
| 898 |
+
"easy": {"count", "avg_score", "max_score", "min_score",
|
| 899 |
+
"resolution_rate", "avg_steps_to_diagnosis",
|
| 900 |
+
"avg_info_gathering_ratio"},
|
| 901 |
+
...
|
| 902 |
+
},
|
| 903 |
+
"last_updated": "ISO timestamp"
|
| 904 |
+
}
|
| 905 |
+
"""
|
| 906 |
total_episodes = len(episode_history)
|
| 907 |
by_task = {}
|
| 908 |
total_score = 0.0
|
|
|
|
| 951 |
|
| 952 |
@app.get("/leaderboard")
|
| 953 |
def get_leaderboard():
|
| 954 |
+
"""
|
| 955 |
+
Top-10 episodes ranked by score (ties broken by fewer steps).
|
| 956 |
+
|
| 957 |
+
Returns:
|
| 958 |
+
{"leaderboard": [{"rank", "task_id", "score", "steps", "timestamp"}, ...]}
|
| 959 |
+
"""
|
| 960 |
sorted_eps = sorted(episode_history, key=lambda x: (x["final_score"], -x["steps_taken"]), reverse=True)
|
| 961 |
top_10 = []
|
| 962 |
for i, rec in enumerate(sorted_eps[:10]):
|
|
|
|
| 1059 |
|
| 1060 |
@app.post("/multi-agent/reset")
|
| 1061 |
def multi_agent_reset(body: MultiAgentResetRequest):
|
| 1062 |
+
"""
|
| 1063 |
+
Start a new dual-agent session with split observability.
|
| 1064 |
+
|
| 1065 |
+
Creates two views of the same incident:
|
| 1066 |
+
- Agent A (Observer): sees logs and active alerts only
|
| 1067 |
+
- Agent B (Responder): sees metrics and service dependencies only
|
| 1068 |
+
|
| 1069 |
+
Args:
|
| 1070 |
+
task_id: Task to run (same valid values as POST /reset)
|
| 1071 |
+
seed: Deterministic seed (default: 42)
|
| 1072 |
+
|
| 1073 |
+
Returns:
|
| 1074 |
+
session_id, agent roles, step instructions, and initial observations
|
| 1075 |
+
for both agents.
|
| 1076 |
+
"""
|
| 1077 |
session = DualAgentSession(task_id=body.task_id, seed=body.seed)
|
| 1078 |
multi_agent_sessions[session.session_id] = session
|
| 1079 |
return {
|
|
|
|
| 1093 |
|
| 1094 |
@app.post("/multi-agent/step/a/{session_id}")
|
| 1095 |
def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
|
| 1096 |
+
"""
|
| 1097 |
+
Agent A (Observer) shares a finding with Agent B.
|
| 1098 |
+
|
| 1099 |
+
Agent A sees logs and alerts only. Findings are appended to the shared
|
| 1100 |
+
findings log that Agent B can see when deciding its next action.
|
| 1101 |
+
|
| 1102 |
+
Args:
|
| 1103 |
+
session_id: Session ID from POST /multi-agent/reset
|
| 1104 |
+
finding: Text description of what Agent A observed
|
| 1105 |
+
|
| 1106 |
+
Returns:
|
| 1107 |
+
Updated findings log and current Observer-view observation.
|
| 1108 |
+
"""
|
| 1109 |
session = multi_agent_sessions.get(session_id)
|
| 1110 |
if not session:
|
| 1111 |
raise HTTPException(status_code=404, detail="Session not found")
|
|
|
|
| 1114 |
|
| 1115 |
@app.post("/multi-agent/step/b/{session_id}")
|
| 1116 |
def multi_agent_step_b(session_id: str, body: Action):
|
| 1117 |
+
"""
|
| 1118 |
+
Agent B (Responder) takes an action in the environment.
|
| 1119 |
+
|
| 1120 |
+
Agent B sees metrics and service dependencies. It receives all findings
|
| 1121 |
+
shared by Agent A, then executes an action. Action schema is identical
|
| 1122 |
+
to POST /step.
|
| 1123 |
+
|
| 1124 |
+
Args:
|
| 1125 |
+
session_id: Session ID from POST /multi-agent/reset
|
| 1126 |
+
body: Action object (same schema as POST /step)
|
| 1127 |
+
|
| 1128 |
+
Returns:
|
| 1129 |
+
StepResult with reward, done flag, and updated Responder-view observation.
|
| 1130 |
+
"""
|
| 1131 |
session = multi_agent_sessions.get(session_id)
|
| 1132 |
if not session:
|
| 1133 |
raise HTTPException(status_code=404, detail="Session not found")
|
|
|
|
| 1136 |
|
| 1137 |
@app.get("/multi-agent/state/{session_id}")
|
| 1138 |
def multi_agent_state(session_id: str):
|
| 1139 |
+
"""
|
| 1140 |
+
Full state for a dual-agent session including both agent perspectives.
|
| 1141 |
+
|
| 1142 |
+
Returns:
|
| 1143 |
+
Session state with findings_log, step count, done flag,
|
| 1144 |
+
and both Observer and Responder observations.
|
| 1145 |
+
"""
|
| 1146 |
session = multi_agent_sessions.get(session_id)
|
| 1147 |
if not session:
|
| 1148 |
raise HTTPException(status_code=404, detail="Session not found")
|
|
|
|
| 1151 |
|
| 1152 |
@app.get("/multi-agent/sessions")
|
| 1153 |
def list_multi_agent_sessions():
|
| 1154 |
+
"""
|
| 1155 |
+
List all active dual-agent sessions.
|
| 1156 |
+
|
| 1157 |
+
Returns:
|
| 1158 |
+
List of active sessions with session_id, task_id, current step,
|
| 1159 |
+
done flag, and number of findings shared by Agent A.
|
| 1160 |
+
"""
|
| 1161 |
return [
|
| 1162 |
{
|
| 1163 |
"session_id": s.session_id,
|
|
|
|
| 1174 |
|
| 1175 |
@app.get("/curriculum/status")
|
| 1176 |
def get_curriculum_status():
|
| 1177 |
+
"""
|
| 1178 |
+
Agent mastery levels across all tasks.
|
| 1179 |
+
|
| 1180 |
+
Returns the curriculum engine's current view of agent performance:
|
| 1181 |
+
rolling average score, mastery level (0–3), whether scaffolding is
|
| 1182 |
+
needed, and a diagnostic hint per task.
|
| 1183 |
+
|
| 1184 |
+
Returns:
|
| 1185 |
+
{"tasks": {"easy": {"rolling_avg", "mastery_level", "scaffold_needed", "hint"}, ...},
|
| 1186 |
+
"recommended_task": "easy"}
|
| 1187 |
+
"""
|
| 1188 |
return curriculum_engine.get_status()
|
| 1189 |
|
| 1190 |
|
| 1191 |
@app.get("/curriculum/next")
|
| 1192 |
def get_next_curriculum_task():
|
| 1193 |
+
"""
|
| 1194 |
+
Recommended next task for adaptive training.
|
| 1195 |
+
|
| 1196 |
+
Returns the task with the lowest rolling average score among non-mastered
|
| 1197 |
+
tasks. Training loops should call this between episodes to implement
|
| 1198 |
+
curriculum learning automatically.
|
| 1199 |
+
|
| 1200 |
+
Returns:
|
| 1201 |
+
{"recommended_task": "medium", "reasoning": "..."}
|
| 1202 |
+
"""
|
| 1203 |
return {
|
| 1204 |
"recommended_task": curriculum_engine.get_next_curriculum_task(),
|
| 1205 |
"reasoning": "Lowest rolling average among non-mastered tasks.",
|
|
|
|
| 1208 |
|
| 1209 |
@app.post("/curriculum/record")
|
| 1210 |
def record_curriculum_episode(req: CurriculumRecordRequest):
|
| 1211 |
+
"""
|
| 1212 |
+
Record an episode result to update the curriculum engine.
|
| 1213 |
+
|
| 1214 |
+
Training loops should call this after each episode to keep the
|
| 1215 |
+
curriculum engine's rolling averages and mastery levels current.
|
| 1216 |
+
|
| 1217 |
+
Args:
|
| 1218 |
+
task_id: Task that was just run
|
| 1219 |
+
score: Episode score (float, typically 0.0–1.0)
|
| 1220 |
+
|
| 1221 |
+
Returns:
|
| 1222 |
+
{"recorded": true, "new_status": {...}} — updated task status
|
| 1223 |
+
"""
|
| 1224 |
try:
|
| 1225 |
curriculum_engine.record_episode(req.task_id, req.score)
|
| 1226 |
except ValueError as exc:
|
|
|
|
| 1233 |
|
| 1234 |
@app.get("/curriculum/hint/{task_id}")
|
| 1235 |
def get_curriculum_hint(task_id: str):
|
| 1236 |
+
"""
|
| 1237 |
+
Get a diagnostic hint and scaffold flag for a specific task.
|
| 1238 |
+
|
| 1239 |
+
If an agent is repeatedly failing a task, this returns a structured hint
|
| 1240 |
+
explaining what the agent should try (e.g., "read logs before acting").
|
| 1241 |
+
|
| 1242 |
+
Args:
|
| 1243 |
+
task_id: One of easy/medium/hard/bonus/security/database/failover
|
| 1244 |
+
|
| 1245 |
+
Returns:
|
| 1246 |
+
{"task_id", "hint", "scaffold_needed": bool, "mastery_level": 0–3}
|
| 1247 |
+
"""
|
| 1248 |
try:
|
| 1249 |
return {
|
| 1250 |
"task_id": task_id,
|