Arijit-07 committed on
Commit
4063673
·
1 Parent(s): 700603e

Upload server/app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. server/app.py +309 -6
server/app.py CHANGED
@@ -543,16 +543,110 @@ curl -X POST .../multi-agent/step/b/{{id}} \\
543
 
544
  @app.get("/health")
545
  def health():
546
- return {"status": "ok", "env": "devops-incident-response", "version": "1.0.0"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
 
549
  @app.get("/generate/preview")
550
  def preview_incident(seed: int = 42):
 
 
 
 
 
 
 
 
 
 
 
 
 
551
  return _factory.generate(seed)
552
 
553
 
554
  @app.post("/reset", response_model=Observation)
555
  async def reset(req: Optional[ResetRequest] = None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  if req is None:
557
  req = ResetRequest()
558
  if req.task_id not in VALID_TASKS and req.task_id != "generated":
@@ -565,6 +659,34 @@ async def reset(req: Optional[ResetRequest] = None):
565
 
566
  @app.post("/step", response_model=StepResult)
567
  async def step(action: Action):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  if _env._logic is None:
569
  raise HTTPException(status_code=400, detail="Call /reset before /step")
570
  res = await _env.step(action)
@@ -575,6 +697,18 @@ async def step(action: Action):
575
 
576
  @app.get("/state", response_model=State)
577
  def state():
 
 
 
 
 
 
 
 
 
 
 
 
578
  if _env._logic is None:
579
  raise HTTPException(status_code=400, detail="Call /reset before /state")
580
  return _env.state
@@ -582,6 +716,15 @@ def state():
582
 
583
  @app.get("/tasks")
584
  def list_tasks():
 
 
 
 
 
 
 
 
 
585
  return {
586
  "tasks": [
587
  {
@@ -665,15 +808,41 @@ def list_tasks():
665
 
666
  @app.get("/validate")
667
  def validate():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
668
  import random
669
  from graders.grader import grade_episode
670
  results = []
671
- # Temporarily save existing _logic
672
  old_logic = _env._logic
673
  for task_id in VALID_TASKS:
674
  try:
675
  import asyncio
676
- # Wait! Since we are in a sync endpoint, validating by instantiating the logic directly
677
  from env import DevOpsIncidentEnv as LogicClass
678
  env_logic = LogicClass(task_id=task_id, seed=42)
679
  env_logic.reset()
@@ -692,7 +861,7 @@ def validate():
692
  )
693
  results.append({
694
  "task_id": task_id,
695
- "score": score,
696
  "in_range": 0.0 <= score <= 1.0,
697
  "resolved": s.incident_resolved,
698
  "steps": steps,
@@ -702,12 +871,38 @@ def validate():
702
  results.append({"task_id": task_id, "status": "error", "error": str(e)})
703
 
704
  _env._logic = old_logic
705
- all_ok = all(r.get("status") == "ok" and r.get("in_range") for r in results)
706
- return {"validation": "passed" if all_ok else "failed", "tasks": results}
 
 
 
 
 
 
 
 
707
 
708
 
709
  @app.get("/metrics")
710
  def get_metrics():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  total_episodes = len(episode_history)
712
  by_task = {}
713
  total_score = 0.0
@@ -756,6 +951,12 @@ def get_metrics():
756
 
757
  @app.get("/leaderboard")
758
  def get_leaderboard():
 
 
 
 
 
 
759
  sorted_eps = sorted(episode_history, key=lambda x: (x["final_score"], -x["steps_taken"]), reverse=True)
760
  top_10 = []
761
  for i, rec in enumerate(sorted_eps[:10]):
@@ -858,6 +1059,21 @@ async def websocket_endpoint(websocket: WebSocket):
858
 
859
  @app.post("/multi-agent/reset")
860
  def multi_agent_reset(body: MultiAgentResetRequest):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
  session = DualAgentSession(task_id=body.task_id, seed=body.seed)
862
  multi_agent_sessions[session.session_id] = session
863
  return {
@@ -877,6 +1093,19 @@ def multi_agent_reset(body: MultiAgentResetRequest):
877
 
878
  @app.post("/multi-agent/step/a/{session_id}")
879
  def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
 
 
 
 
 
 
 
 
 
 
 
 
 
880
  session = multi_agent_sessions.get(session_id)
881
  if not session:
882
  raise HTTPException(status_code=404, detail="Session not found")
@@ -885,6 +1114,20 @@ def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
885
 
886
  @app.post("/multi-agent/step/b/{session_id}")
887
  def multi_agent_step_b(session_id: str, body: Action):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888
  session = multi_agent_sessions.get(session_id)
889
  if not session:
890
  raise HTTPException(status_code=404, detail="Session not found")
@@ -893,6 +1136,13 @@ def multi_agent_step_b(session_id: str, body: Action):
893
 
894
  @app.get("/multi-agent/state/{session_id}")
895
  def multi_agent_state(session_id: str):
 
 
 
 
 
 
 
896
  session = multi_agent_sessions.get(session_id)
897
  if not session:
898
  raise HTTPException(status_code=404, detail="Session not found")
@@ -901,6 +1151,13 @@ def multi_agent_state(session_id: str):
901
 
902
  @app.get("/multi-agent/sessions")
903
  def list_multi_agent_sessions():
 
 
 
 
 
 
 
904
  return [
905
  {
906
  "session_id": s.session_id,
@@ -917,11 +1174,32 @@ def list_multi_agent_sessions():
917
 
918
  @app.get("/curriculum/status")
919
  def get_curriculum_status():
 
 
 
 
 
 
 
 
 
 
 
920
  return curriculum_engine.get_status()
921
 
922
 
923
  @app.get("/curriculum/next")
924
  def get_next_curriculum_task():
 
 
 
 
 
 
 
 
 
 
925
  return {
926
  "recommended_task": curriculum_engine.get_next_curriculum_task(),
927
  "reasoning": "Lowest rolling average among non-mastered tasks.",
@@ -930,6 +1208,19 @@ def get_next_curriculum_task():
930
 
931
  @app.post("/curriculum/record")
932
  def record_curriculum_episode(req: CurriculumRecordRequest):
 
 
 
 
 
 
 
 
 
 
 
 
 
933
  try:
934
  curriculum_engine.record_episode(req.task_id, req.score)
935
  except ValueError as exc:
@@ -942,6 +1233,18 @@ def record_curriculum_episode(req: CurriculumRecordRequest):
942
 
943
  @app.get("/curriculum/hint/{task_id}")
944
  def get_curriculum_hint(task_id: str):
 
 
 
 
 
 
 
 
 
 
 
 
945
  try:
946
  return {
947
  "task_id": task_id,
 
543
 
544
  @app.get("/health")
545
  def health():
546
+ """
547
+ Health check endpoint.
548
+
549
+ Returns a simple status object confirming the server is running.
550
+
551
+ Returns:
552
+ {"status": "ok", "env": "devops-incident-response", "version": "2.0.0"}
553
+ """
554
+ return {"status": "ok", "env": "devops-incident-response", "version": "2.0.0"}
555
+
556
+
557
+ @app.get("/about")
558
+ def about():
559
+ """
560
+ Full environment metadata for LLM judges and researchers.
561
+
562
+ Returns a comprehensive description of the ARIA environment including
563
+ task count, action types, feature flags, training metadata, reward
564
+ design philosophy, and links to the live space, trained model, and docs.
565
+
566
+ Returns:
567
+ JSON object with name, version, description, themes, task/action counts,
568
+ feature descriptions, training info, reward design, and links.
569
+ """
570
+ return {
571
+ "name": "ARIA — DevOps Incident Response",
572
+ "version": "2.0.0",
573
+ "description": (
574
+ "OpenEnv-compliant RL environment for production incident response. "
575
+ "AI agents diagnose and remediate software incidents across 7 task types "
576
+ "using 14 actions with dense reward shaping."
577
+ ),
578
+ "themes": [
579
+ "World Modeling: Professional Tasks",
580
+ "Self-Improvement",
581
+ "Multi-Agent Interactions",
582
+ ],
583
+ "tasks": 8,
584
+ "action_types": 14,
585
+ "features": {
586
+ "curriculum_engine": "Adaptive difficulty based on agent performance",
587
+ "incident_generator": "Procedural incidents from seeds (0-99999)",
588
+ "dual_agent_mode": "Split observability — Observer + Responder",
589
+ },
590
+ "training": {
591
+ "model": "Llama-3.2-3B-Instruct",
592
+ "algorithm": "GRPO",
593
+ "framework": "HuggingFace TRL + Unsloth",
594
+ "episodes": 140,
595
+ "adapter_url": "https://huggingface.co/Arijit-07/aria-devops-llama3b",
596
+ },
597
+ "reward_design": {
598
+ "type": "dense",
599
+ "range": [0.001, 0.999],
600
+ "anti_gaming": [
601
+ "collateral_damage_penalty",
602
+ "blind_remediation_penalty",
603
+ "semantic_diagnosis_matching",
604
+ ],
605
+ "efficiency_bonus": True,
606
+ },
607
+ "links": {
608
+ "space": "https://arijit-07-devops-incident-response.hf.space",
609
+ "model": "https://huggingface.co/Arijit-07/aria-devops-llama3b",
610
+ "github": "https://github.com/Twilight-13/devops-incident-response",
611
+ "docs": "https://arijit-07-devops-incident-response.hf.space/docs",
612
+ },
613
+ }
614
 
615
 
616
  @app.get("/generate/preview")
617
  def preview_incident(seed: int = 42):
618
+ """
619
+ Preview a procedurally generated incident without starting an episode.
620
+
621
+ Uses ARIA's IncidentFactory to generate a deterministic incident description
622
+ from the given integer seed. Same seed always produces the same incident.
623
+
624
+ Args:
625
+ seed: Integer seed in range 0–99999 (default: 42)
626
+
627
+ Returns:
628
+ Incident object with: failure_mode, severity, affected_service,
629
+ description, noise_alerts, difficulty_score
630
+ """
631
  return _factory.generate(seed)
632
 
633
 
634
  @app.post("/reset", response_model=Observation)
635
  async def reset(req: Optional[ResetRequest] = None):
636
+ """
637
+ Start a new episode.
638
+
639
+ Initializes the environment for the specified task and seed.
640
+ Same seed always produces the same episode (deterministic).
641
+
642
+ Args:
643
+ task_id: One of easy/medium/hard/bonus/security/database/failover/generated
644
+ seed: Integer seed for reproducibility (optional, random if not provided)
645
+
646
+ Returns:
647
+ Observation with: services, active_alerts, recent_logs,
648
+ service_dependencies, evidence_log, sla_status, available_runbooks
649
+ """
650
  if req is None:
651
  req = ResetRequest()
652
  if req.task_id not in VALID_TASKS and req.task_id != "generated":
 
659
 
660
  @app.post("/step", response_model=StepResult)
661
  async def step(action: Action):
662
+ """
663
+ Take one action in the current episode.
664
+
665
+ Must call /reset first. Accepts any of the 14 action types with their
666
+ corresponding parameters. Returns the new observation, reward signal,
667
+ and done flag.
668
+
669
+ Args:
670
+ action_type: One of diagnose/read_logs/read_metrics/read_runbook/
671
+ search_logs/restart_service/rollback/scale_up/
672
+ alert_oncall/acknowledge/noop/block_ip_range/
673
+ create_index/failover
674
+ service: Target service name (required for most actions)
675
+ root_cause: Diagnosis string (required for diagnose action)
676
+ runbook: Runbook filename (required for read_runbook)
677
+ version: Target version (required for rollback)
678
+ reason: Reason string (required for alert_oncall)
679
+ ip_range: CIDR range (required for block_ip_range)
680
+ table: Table name (required for create_index)
681
+ column: Column name (required for create_index)
682
+ target_region: Target region (required for failover)
683
+
684
+ Returns:
685
+ StepResult with: observation (new state), reward (float), done (bool), info (dict)
686
+
687
+ Side effects:
688
+ On done=True, records the episode in the leaderboard and metrics history.
689
+ """
690
  if _env._logic is None:
691
  raise HTTPException(status_code=400, detail="Call /reset before /step")
692
  res = await _env.step(action)
 
697
 
698
  @app.get("/state", response_model=State)
699
  def state():
700
+ """
701
+ Return the full current environment state including ground truth.
702
+
703
+ Unlike /step which returns partial observations, /state reveals the
704
+ ground truth root cause, fix, and full action history. Useful for
705
+ evaluation and debugging.
706
+
707
+ Returns:
708
+ State with: all Observation fields plus ground_truth_root_cause,
709
+ ground_truth_fix, incident_resolved, total_reward, action_history,
710
+ episode_id, task_id, step count
711
+ """
712
  if _env._logic is None:
713
  raise HTTPException(status_code=400, detail="Call /reset before /state")
714
  return _env.state
 
716
 
717
  @app.get("/tasks")
718
  def list_tasks():
719
+ """
720
+ List all 8 tasks with metadata.
721
+
722
+ Returns all available task IDs with their name, difficulty, max_steps,
723
+ and description. Use the task_id values in POST /reset to start an episode.
724
+
725
+ Returns:
726
+ {"tasks": [...]} — list of 8 task objects (7 curated + 1 procedural)
727
+ """
728
  return {
729
  "tasks": [
730
  {
 
808
 
809
  @app.get("/validate")
810
  def validate():
811
+ """
812
+ Self-validation endpoint — runs all 7 curated tasks and returns per-task scores.
813
+
814
+ Instantiates each task environment with seed=42 and runs a random agent
815
+ for up to 30 steps. Verifies that: the environment runs without errors,
816
+ scores stay within [0.0, 1.0], and grading completes successfully.
817
+
818
+ This endpoint is safe to call at any time — it does not affect the current
819
+ episode state (the active _env._logic is restored after validation).
820
+
821
+ Returns:
822
+ {
823
+ "validation": "passed" | "failed",
824
+ "summary": "X/Y tasks passed validation",
825
+ "total_tasks": N,
826
+ "passed": N,
827
+ "tasks": [
828
+ {
829
+ "task_id": "easy",
830
+ "score": 0.12,
831
+ "in_range": true,
832
+ "resolved": false,
833
+ "steps": 15,
834
+ "status": "ok"
835
+ }, ...
836
+ ]
837
+ }
838
+ """
839
  import random
840
  from graders.grader import grade_episode
841
  results = []
 
842
  old_logic = _env._logic
843
  for task_id in VALID_TASKS:
844
  try:
845
  import asyncio
 
846
  from env import DevOpsIncidentEnv as LogicClass
847
  env_logic = LogicClass(task_id=task_id, seed=42)
848
  env_logic.reset()
 
861
  )
862
  results.append({
863
  "task_id": task_id,
864
+ "score": round(float(score), 4),
865
  "in_range": 0.0 <= score <= 1.0,
866
  "resolved": s.incident_resolved,
867
  "steps": steps,
 
871
  results.append({"task_id": task_id, "status": "error", "error": str(e)})
872
 
873
  _env._logic = old_logic
874
+ passed_count = sum(1 for r in results if r.get("status") == "ok" and r.get("in_range"))
875
+ total_count = len(results)
876
+ all_ok = passed_count == total_count
877
+ return {
878
+ "validation": "passed" if all_ok else "failed",
879
+ "summary": f"{passed_count}/{total_count} tasks passed validation",
880
+ "total_tasks": total_count,
881
+ "passed": passed_count,
882
+ "tasks": results,
883
+ }
884
 
885
 
886
  @app.get("/metrics")
887
  def get_metrics():
888
+ """
889
+ Aggregate episode statistics across all completed episodes.
890
+
891
+ Statistics are computed in-memory and reset when the server restarts.
892
+
893
+ Returns:
894
+ {
895
+ "total_episodes": N,
896
+ "overall_avg_score": 0.XX,
897
+ "by_task": {
898
+ "easy": {"count", "avg_score", "max_score", "min_score",
899
+ "resolution_rate", "avg_steps_to_diagnosis",
900
+ "avg_info_gathering_ratio"},
901
+ ...
902
+ },
903
+ "last_updated": "ISO timestamp"
904
+ }
905
+ """
906
  total_episodes = len(episode_history)
907
  by_task = {}
908
  total_score = 0.0
 
951
 
952
  @app.get("/leaderboard")
953
  def get_leaderboard():
954
+ """
955
+ Top-10 episodes ranked by score (ties broken by fewer steps).
956
+
957
+ Returns:
958
+ {"leaderboard": [{"rank", "task_id", "score", "steps", "timestamp"}, ...]}
959
+ """
960
  sorted_eps = sorted(episode_history, key=lambda x: (x["final_score"], -x["steps_taken"]), reverse=True)
961
  top_10 = []
962
  for i, rec in enumerate(sorted_eps[:10]):
 
1059
 
1060
  @app.post("/multi-agent/reset")
1061
  def multi_agent_reset(body: MultiAgentResetRequest):
1062
+ """
1063
+ Start a new dual-agent session with split observability.
1064
+
1065
+ Creates two views of the same incident:
1066
+ - Agent A (Observer): sees logs and active alerts only
1067
+ - Agent B (Responder): sees metrics and service dependencies only
1068
+
1069
+ Args:
1070
+ task_id: Task to run (same valid values as POST /reset)
1071
+ seed: Deterministic seed (default: 42)
1072
+
1073
+ Returns:
1074
+ session_id, agent roles, step instructions, and initial observations
1075
+ for both agents.
1076
+ """
1077
  session = DualAgentSession(task_id=body.task_id, seed=body.seed)
1078
  multi_agent_sessions[session.session_id] = session
1079
  return {
 
1093
 
1094
  @app.post("/multi-agent/step/a/{session_id}")
1095
  def multi_agent_step_a(session_id: str, body: AgentAStepRequest):
1096
+ """
1097
+ Agent A (Observer) shares a finding with Agent B.
1098
+
1099
+ Agent A sees logs and alerts only. Findings are appended to the shared
1100
+ findings log that Agent B can see when deciding its next action.
1101
+
1102
+ Args:
1103
+ session_id: Session ID from POST /multi-agent/reset
1104
+ finding: Text description of what Agent A observed
1105
+
1106
+ Returns:
1107
+ Updated findings log and current Observer-view observation.
1108
+ """
1109
  session = multi_agent_sessions.get(session_id)
1110
  if not session:
1111
  raise HTTPException(status_code=404, detail="Session not found")
 
1114
 
1115
  @app.post("/multi-agent/step/b/{session_id}")
1116
  def multi_agent_step_b(session_id: str, body: Action):
1117
+ """
1118
+ Agent B (Responder) takes an action in the environment.
1119
+
1120
+ Agent B sees metrics and service dependencies. It receives all findings
1121
+ shared by Agent A, then executes an action. Action schema is identical
1122
+ to POST /step.
1123
+
1124
+ Args:
1125
+ session_id: Session ID from POST /multi-agent/reset
1126
+ body: Action object (same schema as POST /step)
1127
+
1128
+ Returns:
1129
+ StepResult with reward, done flag, and updated Responder-view observation.
1130
+ """
1131
  session = multi_agent_sessions.get(session_id)
1132
  if not session:
1133
  raise HTTPException(status_code=404, detail="Session not found")
 
1136
 
1137
  @app.get("/multi-agent/state/{session_id}")
1138
  def multi_agent_state(session_id: str):
1139
+ """
1140
+ Full state for a dual-agent session including both agent perspectives.
1141
+
1142
+ Returns:
1143
+ Session state with findings_log, step count, done flag,
1144
+ and both Observer and Responder observations.
1145
+ """
1146
  session = multi_agent_sessions.get(session_id)
1147
  if not session:
1148
  raise HTTPException(status_code=404, detail="Session not found")
 
1151
 
1152
  @app.get("/multi-agent/sessions")
1153
  def list_multi_agent_sessions():
1154
+ """
1155
+ List all active dual-agent sessions.
1156
+
1157
+ Returns:
1158
+ List of active sessions with session_id, task_id, current step,
1159
+ done flag, and number of findings shared by Agent A.
1160
+ """
1161
  return [
1162
  {
1163
  "session_id": s.session_id,
 
1174
 
1175
  @app.get("/curriculum/status")
1176
  def get_curriculum_status():
1177
+ """
1178
+ Agent mastery levels across all tasks.
1179
+
1180
+ Returns the curriculum engine's current view of agent performance:
1181
+ rolling average score, mastery level (0–3), whether scaffolding is
1182
+ needed, and a diagnostic hint per task.
1183
+
1184
+ Returns:
1185
+ {"tasks": {"easy": {"rolling_avg", "mastery_level", "scaffold_needed", "hint"}, ...},
1186
+ "recommended_task": "easy"}
1187
+ """
1188
  return curriculum_engine.get_status()
1189
 
1190
 
1191
  @app.get("/curriculum/next")
1192
  def get_next_curriculum_task():
1193
+ """
1194
+ Recommended next task for adaptive training.
1195
+
1196
+ Returns the task with the lowest rolling average score among non-mastered
1197
+ tasks. Training loops should call this between episodes to implement
1198
+ curriculum learning automatically.
1199
+
1200
+ Returns:
1201
+ {"recommended_task": "medium", "reasoning": "..."}
1202
+ """
1203
  return {
1204
  "recommended_task": curriculum_engine.get_next_curriculum_task(),
1205
  "reasoning": "Lowest rolling average among non-mastered tasks.",
 
1208
 
1209
  @app.post("/curriculum/record")
1210
  def record_curriculum_episode(req: CurriculumRecordRequest):
1211
+ """
1212
+ Record an episode result to update the curriculum engine.
1213
+
1214
+ Training loops should call this after each episode to keep the
1215
+ curriculum engine's rolling averages and mastery levels current.
1216
+
1217
+ Args:
1218
+ task_id: Task that was just run
1219
+ score: Episode score (float, typically 0.0–1.0)
1220
+
1221
+ Returns:
1222
+ {"recorded": true, "new_status": {...}} — updated task status
1223
+ """
1224
  try:
1225
  curriculum_engine.record_episode(req.task_id, req.score)
1226
  except ValueError as exc:
 
1233
 
1234
  @app.get("/curriculum/hint/{task_id}")
1235
  def get_curriculum_hint(task_id: str):
1236
+ """
1237
+ Get a diagnostic hint and scaffold flag for a specific task.
1238
+
1239
+ If an agent is repeatedly failing a task, this returns a structured hint
1240
+ explaining what the agent should try (e.g., "read logs before acting").
1241
+
1242
+ Args:
1243
+ task_id: One of easy/medium/hard/bonus/security/database/failover
1244
+
1245
+ Returns:
1246
+ {"task_id", "hint", "scaffold_needed": bool, "mastery_level": 0–3}
1247
+ """
1248
  try:
1249
  return {
1250
  "task_id": task_id,