Spaces:

XcodeAddy
/

incident-triage-env

Running

App Files Community

XcodeAddy committed on Apr 12

Commit

b6d1ff0

1 Parent(s): b708ea9

Fix evaluation reliability and lifecycle issues

Browse files

Files changed (7) hide show

app.py +49 -9
environment.py +49 -3
graders.py +9 -0
inference.py +21 -8
openenv.yaml +2 -0
tests/test_env.py +76 -0
tests/test_inference.py +41 -0

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import uuid
 from collections import Counter
 from pathlib import Path
@@ -21,10 +22,6 @@ from models import (
     TaskType,
 )
-app = FastAPI(title="Incident Triage Environment")
-UI_DIR = Path(__file__).parent / "ui"
-ASSETS_DIR = UI_DIR / "assets"
 # Session store: session_id -> IncidentEnv instance
 MAX_SESSIONS = 500
 sessions: dict[str, IncidentEnv] = {}
@@ -32,6 +29,35 @@ completed_states: dict[str, IncidentState] = {}
 session_lock = RLock()
 task_counts = Counter(ticket["task_type"] for ticket in TICKETS)
 app.mount("/assets", StaticFiles(directory=ASSETS_DIR), name="assets")
@@ -48,6 +74,15 @@ def evict_oldest(mapping: dict[str, Any], max_size: int) -> None:
         mapping.pop(oldest_key, None)
 @app.get("/", include_in_schema=False)
 def home_page():
     return FileResponse(UI_DIR / "index.html")
@@ -161,8 +196,7 @@ def reset(reset_request: ResetRequest | None = None):
         evict_oldest(sessions, MAX_SESSIONS)
         evict_oldest(completed_states, MAX_SESSIONS)
         sessions[session_id] = env
-    result.info["session_id"] = session_id
-    result.info["state"] = env.state(session_id=session_id).model_dump()
     log_event(
         "RESET",
         session_id=session_id,
@@ -188,9 +222,8 @@ def step(action: IncidentAction, session_id: str):
         except (RuntimeError, ValueError) as e:
             log_event("STEP_ERROR", session_id=session_id, incident_id=action.incident_id, error=str(e))
             raise HTTPException(status_code=400, detail=str(e))
-        result.info["session_id"] = session_id
         current_state = env.state(session_id=session_id)
-        result.info["state"] = current_state.model_dump()
         if result.done:
             completed_states[session_id] = current_state
             sessions.pop(session_id, None)
@@ -235,7 +268,14 @@ def get_grader_info():
             "task1": "exact=1.0, adjacent=0.5, far=0.0",
             "task2": "exact=1.0, related-domain=0.5, unknown=0.25, wrong=0.0",
             "task3": "exact=1.0, investigate fallback=0.4, related response=0.25, wrong=0.0",
-        }
     }

+from contextlib import asynccontextmanager
 import uuid
 from collections import Counter
 from pathlib import Path
     TaskType,
 )
 # Session store: session_id -> IncidentEnv instance
 MAX_SESSIONS = 500
 sessions: dict[str, IncidentEnv] = {}
 session_lock = RLock()
 task_counts = Counter(ticket["task_type"] for ticket in TICKETS)
+def emit_lifecycle_event(event: str, **fields: Any) -> None:
+    details = " ".join(f"{key}={value}" for key, value in fields.items())
+    print(f"[{event}] {details}", file=sys.stderr, flush=True)
+@asynccontextmanager
+async def lifespan(_: FastAPI):
+    emit_lifecycle_event("STARTUP", status="ready")
+    try:
+        yield
+    finally:
+        with session_lock:
+            active_count = len(sessions)
+            completed_count = len(completed_states)
+            sessions.clear()
+            completed_states.clear()
+        emit_lifecycle_event(
+            "SHUTDOWN",
+            active_sessions=active_count,
+            completed_sessions=completed_count,
+            status="cleared",
+        )
+app = FastAPI(title="Incident Triage Environment", lifespan=lifespan)
+UI_DIR = Path(__file__).parent / "ui"
+ASSETS_DIR = UI_DIR / "assets"
 app.mount("/assets", StaticFiles(directory=ASSETS_DIR), name="assets")
         mapping.pop(oldest_key, None)
+def enrich_step_result(result: StepResult, session_id: str, state: IncidentState) -> StepResult:
+    enriched_info = {
+        **result.info,
+        "session_id": session_id,
+        "state": state.model_dump(),
+    }
+    return result.model_copy(update={"info": enriched_info})
 @app.get("/", include_in_schema=False)
 def home_page():
     return FileResponse(UI_DIR / "index.html")
         evict_oldest(sessions, MAX_SESSIONS)
         evict_oldest(completed_states, MAX_SESSIONS)
         sessions[session_id] = env
+    result = enrich_step_result(result, session_id=session_id, state=env.state(session_id=session_id))
     log_event(
         "RESET",
         session_id=session_id,
         except (RuntimeError, ValueError) as e:
             log_event("STEP_ERROR", session_id=session_id, incident_id=action.incident_id, error=str(e))
             raise HTTPException(status_code=400, detail=str(e))
         current_state = env.state(session_id=session_id)
+        result = enrich_step_result(result, session_id=session_id, state=current_state)
         if result.done:
             completed_states[session_id] = current_state
             sessions.pop(session_id, None)
             "task1": "exact=1.0, adjacent=0.5, far=0.0",
             "task2": "exact=1.0, related-domain=0.5, unknown=0.25, wrong=0.0",
             "task3": "exact=1.0, investigate fallback=0.4, related response=0.25, wrong=0.0",
+        },
+        "notes": {
+            "task2": [
+                "DATABASE and APPLICATION are treated as related because application faults often surface as database pressure and vice versa.",
+                "NETWORK, INFRASTRUCTURE, and THIRD_PARTY share limited partial-credit bridges to reflect correlated outage signatures.",
+                "APPLICATION and THIRD_PARTY are intentionally not treated as related because they imply different remediation ownership.",
+            ]
+        },
     }

environment.py CHANGED Viewed

@@ -38,6 +38,35 @@ TASK_SPECS = {
         "description": "Choose the best immediate operational response for stabilizing the incident.",
     },
 }
 TICKETS_BY_ID = {ticket["incident_id"]: ticket for ticket in TICKETS}
@@ -100,13 +129,12 @@ class IncidentEnv:
         self._validate_action(action)
         task_type = self.current_ticket["task_type"]
-        ground_truth = self.current_ticket["ground_truth"]
         grader_fn = GRADERS[task_type]
         reward_value, reward_reason = grader_fn(action, ground_truth)
         agent_answer = action.selected_value() or "NONE"
         selected_field = action.selected_field() or "NONE"
-        ground_truth_value = list(ground_truth.values())[0]
         self.step_count += 1
         self.last_reward = reward_value
@@ -171,7 +199,8 @@ class IncidentEnv:
         if not pool:
             raise ValueError(f"No tickets found for task_type: {task_type}")
-        chooser = random.Random(seed) if seed is not None else random
         return chooser.choice(pool)
     def _task_spec(self) -> dict:
@@ -208,3 +237,20 @@ class IncidentEnv:
                 f"Task '{self.current_ticket['task_type']}' expects field '{expected_field}', "
                 f"but got '{next(iter(populated))}'."
             )

         "description": "Choose the best immediate operational response for stabilizing the incident.",
     },
 }
+DEFAULT_RESET_SEED = 42
+def validate_ticket_dataset(tickets: list[dict]) -> None:
+    for ticket in tickets:
+        incident_id = ticket.get("incident_id", "<unknown>")
+        task_type_raw = ticket.get("task_type")
+        try:
+            task_type = TaskType(task_type_raw)
+        except ValueError as exc:
+            raise RuntimeError(
+                f"Ticket '{incident_id}' has unsupported task_type '{task_type_raw}'."
+            ) from exc
+        expected_field = TASK_SPECS[task_type]["expected_field"]
+        ground_truth = ticket.get("ground_truth")
+        if not isinstance(ground_truth, dict) or not ground_truth:
+            raise RuntimeError(f"Ticket '{incident_id}' has empty ground_truth.")
+        if set(ground_truth.keys()) != {expected_field}:
+            raise RuntimeError(
+                f"Ticket '{incident_id}' must define only '{expected_field}' in ground_truth."
+            )
+        if ground_truth.get(expected_field) in (None, ""):
+            raise RuntimeError(
+                f"Ticket '{incident_id}' has missing value for ground_truth['{expected_field}']."
+            )
+validate_ticket_dataset(TICKETS)
 TICKETS_BY_ID = {ticket["incident_id"]: ticket for ticket in TICKETS}
         self._validate_action(action)
         task_type = self.current_ticket["task_type"]
+        ground_truth, ground_truth_value = self._validated_ground_truth()
         grader_fn = GRADERS[task_type]
         reward_value, reward_reason = grader_fn(action, ground_truth)
         agent_answer = action.selected_value() or "NONE"
         selected_field = action.selected_field() or "NONE"
         self.step_count += 1
         self.last_reward = reward_value
         if not pool:
             raise ValueError(f"No tickets found for task_type: {task_type}")
+        effective_seed = seed if seed is not None else DEFAULT_RESET_SEED
+        chooser = random.Random(effective_seed)
         return chooser.choice(pool)
     def _task_spec(self) -> dict:
                 f"Task '{self.current_ticket['task_type']}' expects field '{expected_field}', "
                 f"but got '{next(iter(populated))}'."
             )
+    def _validated_ground_truth(self) -> tuple[dict, str]:
+        if self.current_ticket is None:
+            raise RuntimeError("No active episode. Call reset() first.")
+        incident_id = self.current_ticket["incident_id"]
+        expected_field = self._task_spec()["expected_field"]
+        ground_truth = self.current_ticket.get("ground_truth")
+        if not isinstance(ground_truth, dict) or not ground_truth:
+            raise RuntimeError(
+                f"Ticket '{incident_id}' has empty ground_truth. This is a dataset integrity error."
+            )
+        if expected_field not in ground_truth or ground_truth[expected_field] in (None, ""):
+            raise RuntimeError(
+                f"Ticket '{incident_id}' is missing ground_truth['{expected_field}']."
+            )
+        return ground_truth, str(ground_truth[expected_field])

graders.py CHANGED Viewed

@@ -1,6 +1,15 @@
 from models import IncidentAction
 _SEV_ORDER = {"SEV1": 0, "SEV2": 1, "SEV3": 2}
 _TASK2_RELATED_GROUPS = [
     {"DATABASE", "APPLICATION"},
     {"NETWORK", "INFRASTRUCTURE"},

 from models import IncidentAction
 _SEV_ORDER = {"SEV1": 0, "SEV2": 1, "SEV3": 2}
+# Related-domain partial credit is intentionally conservative.
+# DATABASE <-> APPLICATION captures incidents where app bugs manifest as
+# database saturation and vice versa.
+# NETWORK <-> INFRASTRUCTURE captures physical or platform-layer correlation.
+# NETWORK <-> THIRD_PARTY captures dependency outages that resemble network loss.
+# INFRASTRUCTURE <-> THIRD_PARTY captures external services failing through shared
+# platform primitives.
+# APPLICATION <-> THIRD_PARTY is intentionally not included because we treat
+# product-code failures and vendor degradation as materially different diagnoses.
 _TASK2_RELATED_GROUPS = [
     {"DATABASE", "APPLICATION"},
     {"NETWORK", "INFRASTRUCTURE"},

inference.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 import os
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional
@@ -24,7 +25,7 @@ ENV_URL = os.environ.get("ENV_URL") or "http://localhost:7860"
 BENCHMARK = "incident-triage-env"
 MAX_TOKENS = 300
 TEMPERATURE = 0.0
-OUTPUT_PATH = Path("outputs/baseline_scores.json")
 SYSTEM_PROMPT = """You are an expert SRE triaging production incidents.
 You will receive an incident alert, structured context, and the expected output field.
@@ -377,7 +378,10 @@ def run_episode(
     return episode_result
-def write_results(results: List[Dict[str, Any]]) -> None:
     grouped: Dict[str, List[float]] = {}
     for result in results:
         grouped.setdefault(result["task_type"], []).append(result.get("score", 0.0))
@@ -397,16 +401,25 @@ def write_results(results: List[Dict[str, Any]]) -> None:
         "results": results,
     }
-    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
-    OUTPUT_PATH.write_text(json.dumps(summary, indent=2))
 def main() -> None:
     transport = build_transport()
-    model_client = create_model_client()
-    results = [run_episode(transport, model_client, ticket) for ticket in TICKETS]
-    write_results(results)
-    transport.close()
 if __name__ == "__main__":

 import json
 import os
 import re
+import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 BENCHMARK = "incident-triage-env"
 MAX_TOKENS = 300
 TEMPERATURE = 0.0
+OUTPUT_PATH = Path(os.environ.get("OUTPUT_PATH") or "/tmp/outputs/baseline_scores.json")
 SYSTEM_PROMPT = """You are an expert SRE triaging production incidents.
 You will receive an incident alert, structured context, and the expected output field.
     return episode_result
+def write_results(
+    results: List[Dict[str, Any]],
+    output_path: Path = OUTPUT_PATH,
+) -> None:
     grouped: Dict[str, List[float]] = {}
     for result in results:
         grouped.setdefault(result["task_type"], []).append(result.get("score", 0.0))
         "results": results,
     }
+    try:
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(json.dumps(summary, indent=2))
+    except (PermissionError, OSError) as exc:
+        print(
+            f"[WARN] Could not write results file to {output_path}: {exc}. Scores were still emitted to stdout.",
+            file=sys.stderr,
+            flush=True,
+        )
 def main() -> None:
     transport = build_transport()
+    try:
+        model_client = create_model_client()
+        results = [run_episode(transport, model_client, ticket) for ticket in TICKETS]
+        write_results(results)
+    finally:
+        transport.close()
 if __name__ == "__main__":

openenv.yaml CHANGED Viewed

@@ -101,3 +101,5 @@ reproducibility:
   max_steps_per_episode: 1
   dataset_order: fixed TICKETS list order in incidents.py
   baseline_selection: deterministic ticket_id-driven evaluation across all tickets

   max_steps_per_episode: 1
   dataset_order: fixed TICKETS list order in incidents.py
   baseline_selection: deterministic ticket_id-driven evaluation across all tickets
+  default_reset_seed: 42
+  reset_without_ticket_id: deterministic fixed-seed selection within the requested task pool

tests/test_env.py CHANGED Viewed

@@ -3,6 +3,8 @@ import unittest
 from fastapi.testclient import TestClient
 from app import app, completed_states, sessions
 class IncidentEnvApiTests(unittest.TestCase):
@@ -33,6 +35,12 @@ class IncidentEnvApiTests(unittest.TestCase):
         self.assertEqual(mcp_body["jsonrpc"], "2.0")
         self.assertEqual(mcp_body["id"], 1)
     def test_tickets_endpoint_returns_safe_ticket_inventory(self) -> None:
         response = self.client.get("/tickets")
         self.assertEqual(response.status_code, 200)
@@ -78,6 +86,17 @@ class IncidentEnvApiTests(unittest.TestCase):
         self.assertIn("session_id", body["info"])
         self.assertEqual(body["info"]["state"]["status"], "awaiting_action")
     def test_step_completes_episode_and_state_endpoint_reflects_completion(self) -> None:
         reset_response = self.client.post(
             "/reset",
@@ -140,6 +159,63 @@ class IncidentEnvApiTests(unittest.TestCase):
         self.assertEqual(step_response.status_code, 400)
         self.assertIn("does not match", step_response.json()["detail"])
 if __name__ == "__main__":
     unittest.main()

 from fastapi.testclient import TestClient
 from app import app, completed_states, sessions
+from environment import IncidentEnv, validate_ticket_dataset
+from models import IncidentAction, IncidentState, TaskType
 class IncidentEnvApiTests(unittest.TestCase):
         self.assertEqual(mcp_body["jsonrpc"], "2.0")
         self.assertEqual(mcp_body["id"], 1)
+        grader_response = self.client.get("/grader")
+        self.assertEqual(grader_response.status_code, 200)
+        grader_body = grader_response.json()
+        self.assertIn("notes", grader_body)
+        self.assertIn("task2", grader_body["notes"])
     def test_tickets_endpoint_returns_safe_ticket_inventory(self) -> None:
         response = self.client.get("/tickets")
         self.assertEqual(response.status_code, 200)
         self.assertIn("session_id", body["info"])
         self.assertEqual(body["info"]["state"]["status"], "awaiting_action")
+    def test_reset_without_seed_is_deterministic_for_same_task(self) -> None:
+        first_response = self.client.post("/reset", json={"task_type": "task2"})
+        second_response = self.client.post("/reset", json={"task_type": "task2"})
+        self.assertEqual(first_response.status_code, 200)
+        self.assertEqual(second_response.status_code, 200)
+        self.assertEqual(
+            first_response.json()["observation"]["incident_id"],
+            second_response.json()["observation"]["incident_id"],
+        )
     def test_step_completes_episode_and_state_endpoint_reflects_completion(self) -> None:
         reset_response = self.client.post(
             "/reset",
         self.assertEqual(step_response.status_code, 400)
         self.assertIn("does not match", step_response.json()["detail"])
+    def test_dataset_validation_rejects_empty_ground_truth(self) -> None:
+        with self.assertRaisesRegex(RuntimeError, "empty ground_truth"):
+            validate_ticket_dataset(
+                [
+                    {
+                        "incident_id": "INC-BAD",
+                        "task_type": "task1",
+                        "alert_text": "Broken test ticket",
+                        "context": {},
+                        "ground_truth": {},
+                    }
+                ]
+            )
+    def test_step_raises_clear_dataset_error_for_invalid_ground_truth(self) -> None:
+        env = IncidentEnv()
+        env.current_ticket = {
+            "incident_id": "INC-BAD",
+            "task_type": "task1",
+            "alert_text": "Broken test ticket",
+            "context": {},
+            "ground_truth": {},
+        }
+        env.episode_id = "episode-bad"
+        with self.assertRaisesRegex(RuntimeError, "dataset integrity error"):
+            env.step(
+                IncidentAction(
+                    incident_id="INC-BAD",
+                    task_type="task1",
+                    severity="SEV1",
+                )
+            )
+    def test_lifespan_shutdown_clears_session_stores(self) -> None:
+        sessions["active-session"] = IncidentEnv()
+        completed_states["done-session"] = IncidentState(
+            episode_id="episode-1",
+            session_id="done-session",
+            step_count=1,
+            max_steps=1,
+            total_reward=1.0,
+            done=True,
+            incident_id="INC-001",
+            task_type=TaskType.TASK1,
+            difficulty="easy",
+            status="completed",
+            last_reward=1.0,
+        )
+        with TestClient(app) as client:
+            response = client.get("/health")
+            self.assertEqual(response.status_code, 200)
+        self.assertEqual(sessions, {})
+        self.assertEqual(completed_states, {})
 if __name__ == "__main__":
     unittest.main()

tests/test_inference.py ADDED Viewed

@@ -0,0 +1,41 @@

+import json
+import tempfile
+import unittest
+from pathlib import Path
+from inference import write_results
+class InferenceOutputTests(unittest.TestCase):
+    def test_write_results_writes_summary_to_configured_path(self) -> None:
+        results = [
+            {"incident_id": "INC-001", "task_type": "task1", "score": 1.0, "success": True},
+            {"incident_id": "INC-002", "task_type": "task2", "score": 0.5, "success": False},
+        ]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_path = Path(temp_dir) / "nested" / "baseline_scores.json"
+            write_results(results, output_path=output_path)
+            self.assertTrue(output_path.exists())
+            payload = json.loads(output_path.read_text())
+            self.assertEqual(payload["episodes"], 2)
+            self.assertAlmostEqual(payload["average_score"], 0.75)
+            self.assertEqual(payload["by_task"]["task1"]["average_score"], 1.0)
+            self.assertEqual(payload["by_task"]["task2"]["average_score"], 0.5)
+    def test_write_results_tolerates_unwritable_path(self) -> None:
+        results = [
+            {"incident_id": "INC-001", "task_type": "task1", "score": 1.0, "success": True},
+        ]
+        with tempfile.TemporaryDirectory() as temp_dir:
+            blocked_parent = Path(temp_dir) / "blocked"
+            blocked_parent.write_text("not-a-directory")
+            blocked_path = blocked_parent / "baseline_scores.json"
+            write_results(results, output_path=blocked_path)
+if __name__ == "__main__":
+    unittest.main()