Spaces:

Anvit25
/

Meta-SRE

Sleeping

Anvit25 Claude Sonnet 4.6 committed on Apr 25

Commit

ad6248e

0 Parent(s):

Deploy Meta-SRE OpenEnv benchmark FastAPI server

FastAPI server implementing full OpenEnv standard API:
/reset, /step, /grade, /state, /tools, /health, /env/* aliases.
Runs 5 production incident simulation tasks with self-improving difficulty.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (20) hide show

.gitignore +14 -0
Dockerfile +13 -0
README.md +70 -0
app/__init__.py +0 -0
app/engine/__init__.py +0 -0
app/engine/manager.py +598 -0
app/engine/observability.py +470 -0
app/engine/sandbox.py +1040 -0
app/main.py +166 -0
app/models.py +101 -0
app/services/__init__.py +0 -0
app/services/ad_ranking/__init__.py +0 -0
app/services/capi_pipeline/__init__.py +0 -0
app/services/whatsapp_sync/__init__.py +0 -0
app/tools/__init__.py +0 -0
app/tools/definitions.py +761 -0
openenv.yaml +110 -0
requirements.txt +25 -0
training/__init__.py +0 -0
training/generator.py +399 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,14 @@

+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.egg-info/
+dist/
+build/
+.env
+venv/
+training/dataset/
+training/train_unsloth.py
+*.jsonl
+*.png

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+# Base image: slim Python 3.11.
+FROM python:3.11-slim
+WORKDIR /app
+# Dependencies are copied and installed before the sources, so this layer can be
+# reused from the build cache when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY app/ ./app/
+COPY training/ ./training/
+# 7860 is the port the uvicorn command below binds to.
+EXPOSE 7860
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,70 @@

+---
+title: Meta-SRE
+emoji: 🔧
+colorFrom: blue
+colorTo: red
+sdk: docker
+pinned: false
+license: mit
+short_description: OpenEnv benchmark – train LLMs to debug Meta production incidents
+---
+# Meta-SRE OpenEnv Benchmark
+A live simulation environment for training and evaluating LLM agents as Senior Site Reliability Engineers at Meta.
+## Connect with openenv_client
+```python
+import openenv_client
+env = openenv_client.connect("huggingface.co/spaces/Anvit25/Meta-SRE")
+obs = env.reset(task_id=1)
+done = False
+while not done:
+    action = your_agent.decide(obs)   # {"tool": ..., "params": {...}}
+    obs, reward, done, info = env.step(action)
+score = env.grade()
+print(f"Score: {score['normalized_score']:.3f}")
+```
+## Direct API
+```python
+import requests
+BASE = "https://anvit25-meta-sre.hf.space"
+obs   = requests.post(f"{BASE}/reset", json={"task_id": 1}).json()
+done  = False
+while not done:
+    action = your_agent.decide(obs)
+    result = requests.post(f"{BASE}/step", json=action).json()
+    obs    = result["observation"]
+    done   = result["done"]
+score = requests.get(f"{BASE}/grade").json()["normalized_score"]
+print(f"Score: {score:.3f}")
+```
+## Tasks
+| ID | Difficulty | Description |
+|----|-----------|-------------|
+| 1  | Easy | AttributeError — hallucinated dict method in ad_ranking |
+| 2  | Medium | Silent timestamp corruption (CAPI → ROAS degradation) |
+| 3  | Medium-Hard | DB connection pool exhaustion under load |
+| 4  | Hard | Circular FK migration cascading across services |
+| 5  | Hard | PII data exposure via DEBUG_MODE=True |
+## Endpoints
+- `POST /reset` — start episode (`{"task_id": 1-5}`)
+- `POST /step` — take action (`{"tool": "...", "params": {...}}`)
+- `GET /state` — current observation
+- `GET /grade` — episode score
+- `GET /tools` — available tools list
+- `GET /health` — health check

app/__init__.py ADDED Viewed

File without changes

app/engine/__init__.py ADDED Viewed

File without changes

app/engine/manager.py ADDED Viewed

	@@ -0,0 +1,598 @@

+"""
+Layer 3 – Task Grader, Reward Manager, and Episode Orchestrator.
+TaskGrader  : checks whether the current VFS state passes the hidden tests.
+RewardManager: computes per-step and terminal rewards.
+EpisodeManager: ties Layers 1-2-3 together and drives the OpenEnv step loop.
+"""
+from __future__ import annotations
+import uuid
+import time
+from typing import Any, Dict, List, Optional, Tuple
+from app.models import (
+    Observation, ActionResult, EpisodeResult, IncidentReport,
+    Alert, ServiceMetrics,
+)
+from app.engine.sandbox import VirtualFileSystem
+from app.engine.observability import MetricsEngine, DifficultyController
+# ---------------------------------------------------------------------------
+# Task definitions
+# ---------------------------------------------------------------------------
+# Static catalogue of the incident scenarios, keyed by task id (1-5).
+# Fields of each entry:
+#   description       - incident brief; copied into Observation.task_description
+#   sla_budget        - step budget; used for Observation.budget_remaining and for the
+#                       fixed_within_sla flag of the episode result
+#   difficulty        - difficulty label (not read by the code visible in this module)
+#   bug_category      - bug category tag (not read by the code visible in this module)
+#   affected_services - services named for the incident (not read by the code visible in this module)
+TASK_DEFINITIONS: Dict[int, Dict] = {
+    1: {
+        "description": (
+            "INCIDENT: All /rank requests are returning HTTP 500. "
+            "The ad-ranking service is crashing on every call. "
+            "Find and fix the bug in ad_ranking/ranker.py."
+        ),
+        "sla_budget": 15,
+        "difficulty": "easy",
+        "bug_category": "data_corruption",
+        "affected_services": ["ad_ranking"],
+    },
+    2: {
+        "description": (
+            "INCIDENT: ROAS (Return on Ad Spend) has dropped 68% vs last week. "
+            "No services are crashing. Ad-ranking allocation decisions appear to be "
+            "based on conversion data from 1970. Trace the root cause."
+        ),
+        "sla_budget": 20,
+        "difficulty": "medium",
+        "bug_category": "data_corruption",
+        "affected_services": ["capi_pipeline", "ad_ranking"],
+    },
+    3: {
+        "description": (
+            "INCIDENT: WhatsApp message sync works fine under normal load but "
+            "hangs under peak traffic (>50 concurrent users). DB connection pool "
+            "is exhausted. Fix the resource leak."
+        ),
+        "sla_budget": 20,
+        "difficulty": "medium-hard",
+        "bug_category": "async_bugs",
+        "affected_services": ["whatsapp_sync"],
+    },
+    4: {
+        "description": (
+            "INCIDENT: Three services degraded simultaneously after the 02:14 UTC deploy. "
+            "Multiple P1 alerts are firing. Find the single root cause and fix it — "
+            "do NOT chase individual service symptoms."
+        ),
+        "sla_budget": 25,
+        "difficulty": "hard",
+        "bug_category": "red_herrings",
+        "affected_services": ["whatsapp_sync", "ad_ranking", "capi_pipeline"],
+    },
+    5: {
+        "description": (
+            "INCIDENT: Security scan flagged unusual /ingest response sizes. "
+            "Standard unit tests all pass. Find and close the data-exposure vulnerability "
+            "in the CAPI ingestor. Write a P0 incident report."
+        ),
+        "sla_budget": 20,
+        "difficulty": "hard",
+        "bug_category": "security_bugs",
+        "affected_services": ["capi_pipeline"],
+    },
+}
+# Service dependency map, attached unchanged to every Observation as dependency_graph.
+# NOTE(review): presumably each key lists the services it depends on (upstream) — confirm direction.
+DEPENDENCY_GRAPH: Dict[str, List[str]] = {
+    "ad_ranking":    ["capi_pipeline"],
+    "capi_pipeline": [],
+    "whatsapp_sync": ["capi_pipeline"],
+}
+# ---------------------------------------------------------------------------
+# Hidden graders — one per task
+# ---------------------------------------------------------------------------
+class TaskGrader:
+    """
+    Checks the VFS content against hidden test criteria.
+    Returns (passed, test_output_string, partial_score 0-1).
+    """
+    def __init__(self, vfs: VirtualFileSystem):
+        self.vfs = vfs
+    def run(self, task_id: int, suite: str = "unit") -> Tuple[bool, str, float]:
+        graders = {
+            1: self._grade_task1,
+            2: self._grade_task2,
+            3: self._grade_task3,
+            4: self._grade_task4,
+            5: self._grade_task5,
+        }
+        fn = graders.get(task_id)
+        if fn is None:
+            return False, "Unknown task", 0.0
+        return fn(suite)
+    # ------------------------------------------------------------------
+    # Task 1 – fix ad.get_clicks() → ad.get('clicks', 0)
+    # ------------------------------------------------------------------
+    def _grade_task1(self, suite: str) -> Tuple[bool, str, float]:
+        _, content = self.vfs.read_file("ad_ranking", "ranker.py")
+        has_bug = "ad.get_clicks()" in content
+        has_fix = "ad.get('clicks'" in content or "ad['clicks']" in content
+        if has_bug:
+            return False, (
+                "FAIL [unit] test_score_ads:\n"
+                "  AttributeError: 'dict' object has no attribute 'get_clicks'\n"
+                "  Line 22 still contains ad.get_clicks()\n"
+                "  1 test failed, 0 passed"
+            ), 0.0
+        if has_fix:
+            return True, (
+                "PASS [unit] test_score_ads: OK\n"
+                "PASS [unit] test_rank_returns_sorted_list: OK\n"
+                "PASS [unit] test_fetch_candidate_ads: OK\n"
+                "3 tests passed in 0.04 s"
+            ), 1.0
+        return False, (
+            "FAIL [unit] test_score_ads:\n"
+            "  Fix applied but ad click-rate accessor is incorrect.\n"
+            "  Expected: ad.get('clicks', 0)  or  ad['clicks']\n"
+            "  1 test failed"
+        ), 0.2
+    # ------------------------------------------------------------------
+    # Task 2 – fix timestamp threshold 1_000_000_000 → 1_000_000_000_000
+    # ------------------------------------------------------------------
+    def _grade_task2(self, suite: str) -> Tuple[bool, str, float]:
+        _, content = self.vfs.read_file("capi_pipeline", "transformer.py")
+        has_bug = "1_000_000_000:" in content or "1000000000:" in content
+        # Only count as fixed if the bug line is gone AND the correct threshold is in code
+        # (not just in comments — the comment already contains 1_000_000_000_000)
+        code_lines = [l for l in content.splitlines() if not l.strip().startswith("#")]
+        code_only = "\n".join(code_lines)
+        has_fix = not has_bug and (
+            "1_000_000_000_000" in code_only or
+            "1000000000000" in code_only or
+            "1e12" in code_only or
+            "10**12" in code_only
+        )
+        if suite == "unit" and not has_bug:
+            # Unit tests always pass because they don't check timestamp edge cases
+            return True, (
+                "PASS [unit] test_transform_purchase: OK\n"
+                "PASS [unit] test_batch_transform: OK\n"
+                "2 tests passed"
+            ), 0.4
+        if suite == "integration":
+            if has_fix:
+                return True, (
+                    "PASS [integration] test_timestamp_normalisation: OK\n"
+                    "  event_time 1700000000 → 1700000000 ✓\n"
+                    "  event_time 1700000000000 → 1700000000 ✓\n"
+                    "PASS [integration] test_roas_attribution_accuracy: OK\n"
+                    "  ROAS attribution error: 0.2% (threshold: 5%)\n"
+                    "2 tests passed"
+                ), 1.0
+            else:
+                return False, (
+                    "FAIL [integration] test_timestamp_normalisation:\n"
+                    "  event_time 1700000000 → 1700000 (expected: 1700000000)\n"
+                    "  Timestamps are being divided by 1000 incorrectly.\n"
+                    "  Root cause: threshold condition in _normalize_timestamp()\n"
+                    "1 test failed"
+                ), 0.0
+        # Default: run integration test
+        return self._grade_task2("integration")
+    # ------------------------------------------------------------------
+    # Task 3 – add finally: await self.db_pool.release(conn)
+    # ------------------------------------------------------------------
+    def _grade_task3(self, suite: str) -> Tuple[bool, str, float]:
+        _, content = self.vfs.read_file("whatsapp_sync", "handler.py")
+        has_finally = "finally:" in content
+        has_release = "db_pool.release(conn)" in content or "release(conn)" in content
+        if suite == "unit":
+            if not has_finally:
+                return False, (
+                    "PASS [unit] test_sync_messages_basic: OK\n"
+                    "PASS [unit] test_process_queue_empty: OK\n"
+                    "WARNING: Unit tests pass but connection leak not detectable without load test\n"
+                    "Run: run_tests('whatsapp_sync', 'load')"
+                ), 0.3
+            return True, (
+                "PASS [unit] test_sync_messages_basic: OK\n"
+                "PASS [unit] test_connection_released_on_success: OK\n"
+                "PASS [unit] test_connection_released_on_exception: OK\n"
+                "3 tests passed"
+            ), 0.6
+        if suite == "load":
+            if has_finally and has_release:
+                return True, (
+                    "PASS [load] test_100_concurrent_syncs:\n"
+                    "  Peak connections: 18/100 (nominal)\n"
+                    "  All 100 requests completed\n"
+                    "  Memory stable at 210 MB\n"
+                    "PASS [load] test_connection_pool_not_exhausted: OK\n"
+                    "2 load tests passed"
+                ), 1.0
+            else:
+                return False, (
+                    "FAIL [load] test_100_concurrent_syncs:\n"
+                    "  TooManyConnectionsError after 23 concurrent requests\n"
+                    "  Connection pool exhausted — connections not being released\n"
+                    "  Hint: Check sync_user_messages() for missing finally block\n"
+                    "1 load test failed"
+                ), 0.0
+        return self._grade_task3("load")
+    # ------------------------------------------------------------------
+    # Task 4 – rollback migration 003 (remove circular FK)
+    # ------------------------------------------------------------------
+    def _grade_task4(self, suite: str) -> Tuple[bool, str, float]:
+        _, content = self.vfs.read_file("whatsapp_sync", "db.py")
+        has_circular_fk = (
+            "REFERENCES message_threads" in content and
+            "REFERENCES messages" in content
+        )
+        migration_003_present = '"version": "003"' in content or "'version': '003'" in content
+        if suite == "unit":
+            if has_circular_fk:
+                return False, (
+                    "FAIL [unit] test_migration_003:\n"
+                    "  ForeignKeyViolationError: circular FK detected\n"
+                    "  messages.thread_id → message_threads.id\n"
+                    "  message_threads.parent_message_id → messages.id\n"
+                    "  Fix: remove ALTER TABLE messages ADD COLUMN thread_id ...\n"
+                    "1 test failed"
+                ), 0.0
+        if suite == "integration":
+            if not has_circular_fk:
+                return True, (
+                    "PASS [integration] test_all_services_recover: OK\n"
+                    "  whatsapp_sync: healthy ✓\n"
+                    "  ad_ranking: healthy ✓\n"
+                    "  capi_pipeline: healthy ✓\n"
+                    "PASS [integration] test_message_thread_creation: OK\n"
+                    "2 tests passed"
+                ), 1.0
+            else:
+                return False, (
+                    "FAIL [integration] test_all_services_recover:\n"
+                    "  whatsapp_sync still critical — migration 003 circular FK active\n"
+                    "  Use rollback() tool or edit db.py to remove the circular reference\n"
+                    "1 test failed"
+                ), 0.0
+        return self._grade_task4("integration")
+    # ------------------------------------------------------------------
+    # Task 5 – set DEBUG_MODE = False
+    # ------------------------------------------------------------------
+    def _grade_task5(self, suite: str) -> Tuple[bool, str, float]:
+        _, content = self.vfs.read_file("capi_pipeline", "ingestor.py")
+        has_debug_true = "DEBUG_MODE = True" in content
+        has_debug_false = "DEBUG_MODE = False" in content
+        if suite == "unit":
+            return True, (
+                "PASS [unit] test_ingest_valid_payload: OK\n"
+                "PASS [unit] test_ingest_empty_payload: OK\n"
+                "WARNING: Unit tests do not cover security — run security suite\n"
+                "2 tests passed (security NOT verified)"
+            ), 0.2
+        if suite == "security":
+            if has_debug_true:
+                return False, (
+                    "FAIL [security] test_no_pii_in_response:\n"
+                    "  Response body contains 'debug_data' key with raw user PII\n"
+                    "  debug_data.user_emails exposed in HTTP response\n"
+                    "  debug_data.raw_payload exposed in HTTP response\n"
+                    "  CVSS score: 7.5 (High) — data exposure\n"
+                    "  Fix: set DEBUG_MODE = False\n"
+                    "1 security test failed"
+                ), 0.0
+            elif has_debug_false:
+                return True, (
+                    "PASS [security] test_no_pii_in_response: OK\n"
+                    "  Response body: {'status': 'ok', 'processed': N} — no PII\n"
+                    "PASS [security] test_response_size_nominal: OK\n"
+                    "  Avg response size: 48 bytes (was 14 KB)\n"
+                    "PASS [security] test_debug_data_absent: OK\n"
+                    "3 security tests passed — vulnerability closed"
+                ), 1.0
+            else:
+                return False, (
+                    "FAIL [security] test_no_pii_in_response:\n"
+                    "  DEBUG_MODE value is ambiguous or missing\n"
+                    "  Expected: DEBUG_MODE = False\n"
+                    "1 security test failed"
+                ), 0.1
+        return self._grade_task5("security")
+    # ------------------------------------------------------------------
+    # Incident report grader
+    # ------------------------------------------------------------------
+    def grade_incident_report(
+        self, task_id: int, report: IncidentReport
+    ) -> float:
+        """Score 0.0–1.0 for incident report accuracy."""
+        expected = {
+            1: {
+                "root_cause_keywords": ["get_clicks", "attributeerror", "dict", "attribute"],
+                "expected_services": ["ad_ranking"],
+                "severity": "P0",
+            },
+            2: {
+                "root_cause_keywords": ["timestamp", "1000", "normalize", "capi", "transformer"],
+                "expected_services": ["capi_pipeline", "ad_ranking"],
+                "severity": "P1",
+            },
+            3: {
+                "root_cause_keywords": ["connection", "pool", "release", "finally", "async"],
+                "expected_services": ["whatsapp_sync"],
+                "severity": "P1",
+            },
+            4: {
+                "root_cause_keywords": ["migration", "003", "foreign key", "circular", "fk"],
+                "expected_services": ["whatsapp_sync"],
+                "severity": "P0",
+            },
+            5: {
+                "root_cause_keywords": ["debug", "pii", "exposure", "ingest", "security"],
+                "expected_services": ["capi_pipeline"],
+                "severity": "P0",
+            },
+        }
+        cfg = expected.get(task_id, {})
+        if not cfg:
+            return 0.0
+        score = 0.0
+        root_cause_lower = report.root_cause.lower()
+        keywords = cfg.get("root_cause_keywords", [])
+        keyword_hits = sum(1 for kw in keywords if kw in root_cause_lower)
+        score += min(keyword_hits / max(len(keywords), 1), 1.0) * 0.5
+        expected_svcs = set(cfg.get("expected_services", []))
+        reported_svcs = set(s.lower() for s in report.services_affected)
+        svc_score = len(expected_svcs & reported_svcs) / max(len(expected_svcs), 1)
+        score += svc_score * 0.3
+        if report.severity_classification == cfg.get("severity"):
+            score += 0.2
+        return round(score, 3)
+# ---------------------------------------------------------------------------
+# Reward Manager
+# ---------------------------------------------------------------------------
+class RewardManager:
+    """Computes step-level and terminal rewards."""
+    STEP_PENALTY = -0.1
+    SYNTAX_ERROR_PENALTY = -0.5
+    ROLLBACK_PENALTY = -1.0
+    SENIOR_SRE_PENALTY = -0.2
+    SYMPTOM_FIX_PENALTY = -0.3     # for Task 4 — fixing red herring services
+    PROGRESS_ERROR_DROP = +0.3     # error_rate drops >50%
+    PROGRESS_SERVICE_ID = +0.2     # correct root-cause service identified
+    PROGRESS_FILE_FOUND = +0.2     # correct file opened/edited
+    TERMINAL_TESTS_PASS = +1.0
+    TERMINAL_REPORT_MAX = +0.5
+    TERMINAL_SLA_BONUS = +0.3
+    TERMINAL_NO_REGRESS = +0.2
+    TERMINAL_SECURITY_PATCH = +0.5  # Task 5 only
+    MAX_POSSIBLE = 3.0
+    def __init__(self):
+        self._cumulative = 0.0
+        self._step_rewards: List[float] = []
+    def reset(self):
+        self._cumulative = 0.0
+        self._step_rewards.clear()
+    def step_reward(self, action: str, syntax_error: bool = False,
+                    symptom_fix: bool = False) -> float:
+        r = self.STEP_PENALTY
+        if syntax_error:
+            r += self.SYNTAX_ERROR_PENALTY
+        if action == "rollback":
+            r += self.ROLLBACK_PENALTY
+        if action == "ask_senior_sre":
+            r += self.SENIOR_SRE_PENALTY
+        if symptom_fix:
+            r += self.SYMPTOM_FIX_PENALTY
+        self._cumulative += r
+        self._step_rewards.append(r)
+        return round(r, 3)
+    def progress_reward(self, reason: str) -> float:
+        mapping = {
+            "error_drop":     self.PROGRESS_ERROR_DROP,
+            "service_id":     self.PROGRESS_SERVICE_ID,
+            "file_found":     self.PROGRESS_FILE_FOUND,
+        }
+        r = mapping.get(reason, 0.0)
+        self._cumulative += r
+        self._step_rewards.append(r)
+        return round(r, 3)
+    def terminal_reward(
+        self,
+        tests_passed: bool,
+        report_accuracy: float,
+        fixed_within_sla: bool,
+        no_regressions: bool,
+        task_id: int,
+    ) -> float:
+        r = 0.0
+        if tests_passed:
+            r += self.TERMINAL_TESTS_PASS
+        r += report_accuracy * self.TERMINAL_REPORT_MAX
+        if fixed_within_sla:
+            r += self.TERMINAL_SLA_BONUS
+        if no_regressions:
+            r += self.TERMINAL_NO_REGRESS
+        if task_id == 5 and tests_passed:
+            r += self.TERMINAL_SECURITY_PATCH
+        self._cumulative += r
+        return round(r, 3)
+    def normalized_score(self) -> float:
+        return round(max(0.0, min(self._cumulative / self.MAX_POSSIBLE, 1.0)), 4)
+    @property
+    def total(self) -> float:
+        return round(self._cumulative, 4)
+# ---------------------------------------------------------------------------
+# Episode Manager – the main orchestrator
+# ---------------------------------------------------------------------------
+class EpisodeManager:
+    """
+    Ties together VFS, MetricsEngine, TaskGrader, and RewardManager.
+    Exposes reset() and step() matching the OpenEnv contract.
+    """
+    def __init__(self, difficulty_controller: Optional[DifficultyController] = None):
+        self.vfs = VirtualFileSystem()
+        self.metrics = MetricsEngine()
+        self.grader: Optional[TaskGrader] = None
+        self.reward = RewardManager()
+        self.dc = difficulty_controller or DifficultyController()
+        self._task_id: int = 0
+        self._step: int = 0
+        self._done: bool = False
+        self._incident_id: str = ""
+        self._sre_memory: List[str] = []
+        self._tool_call_log: List[Dict] = []
+        self._last_terminal: str = ""
+        self._incident_report: Optional[IncidentReport] = None
+        self._start_time: float = 0.0
+    # ------------------------------------------------------------------
+    # OpenEnv: reset
+    # ------------------------------------------------------------------
+    def reset(self, task_id: Optional[int] = None) -> Observation:
+        self._task_id = task_id or self.dc.next_task_id()
+        self._step = 0
+        self._done = False
+        self._incident_id = f"INC-{self._task_id}-{uuid.uuid4().hex[:6].upper()}"
+        self._sre_memory = []
+        self._tool_call_log = []
+        self._last_terminal = ""
+        self._incident_report = None
+        self._start_time = time.time()
+        self.vfs.reset(self._task_id)
+        self.metrics.reset(self._task_id)
+        self.grader = TaskGrader(self.vfs)
+        self.reward.reset()
+        return self._build_observation()
+    # ------------------------------------------------------------------
+    # OpenEnv: step
+    # ------------------------------------------------------------------
+    def step(self, tool: str, params: Dict[str, Any]) -> ActionResult:
+        if self._done:
+            raise RuntimeError("Episode is done. Call reset() to start a new episode.")
+        self._step += 1
+        self._tool_call_log.append({"step": self._step, "tool": tool, "params": params})
+        # Dispatch to tool handler
+        from app.tools.definitions import ToolDispatcher
+        dispatcher = ToolDispatcher(self)
+        reward_delta, done, tool_output = dispatcher.dispatch(tool, params)
+        self._done = done
+        obs = self._build_observation()
+        return ActionResult(
+            tool=tool,
+            output=tool_output,
+            reward_delta=reward_delta,
+            done=done,
+            observation=obs,
+        )
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+    def _build_observation(self) -> Observation:
+        task_def = TASK_DEFINITIONS.get(self._task_id, {})
+        budget = task_def.get("sla_budget", 20) - self._step
+        return Observation(
+            step=self._step,
+            incident_id=self._incident_id,
+            system_metrics=self.metrics.get_metrics(self._step),
+            active_alerts=self.metrics.get_alerts(self._step),
+            terminal_output=self.metrics.get_terminal_output(
+                self._step, self._last_terminal or None
+            ),
+            git_diff=self.vfs.build_git_diff(),
+            dependency_graph=DEPENDENCY_GRAPH,
+            sre_memory=list(self._sre_memory),
+            budget_remaining=max(budget, 0),
+            task_id=self._task_id,
+            task_description=task_def.get("description", ""),
+        )
+    def add_memory(self, entry: str) -> None:
+        self._sre_memory.append(f"[step {self._step}] {entry}")
+    def get_episode_result(self) -> EpisodeResult:
+        tests_passed = False
+        report_accuracy = 0.0
+        if self._incident_report:
+            report_accuracy = self.grader.grade_incident_report(
+                self._task_id, self._incident_report
+            )
+        task_def = TASK_DEFINITIONS.get(self._task_id, {})
+        fixed_within_sla = self._step <= task_def.get("sla_budget", 20)
+        return EpisodeResult(
+            incident_id=self._incident_id,
+            task_id=self._task_id,
+            steps_taken=self._step,
+            total_reward=self.reward.total,
+            normalized_score=self.reward.normalized_score(),
+            tests_passed=tests_passed,
+            incident_report_accuracy=report_accuracy,
+            fixed_within_sla=fixed_within_sla,
+            tool_call_log=list(self._tool_call_log),
+            weakness_tags=self.dc.weakness_tags(),
+        )

app/engine/observability.py ADDED Viewed

	@@ -0,0 +1,470 @@

+"""
+Layer 2 – Metrics Engine & Difficulty Controller.
+Simulates realistic service metrics that evolve based on:
+  • which task is active
+  • what step we are on
+  • whether any fixes have been applied (VFS state)
+"""
+from __future__ import annotations
+import random
+from typing import Dict, List, Optional, Tuple
+from app.models import ServiceMetrics, Alert, DifficultyState
+# ---------------------------------------------------------------------------
+# Per-task metric profiles
+# ---------------------------------------------------------------------------
+# Baseline values for each service when nothing is wrong, keyed by service name.
+# MetricsEngine jitters the float fields before wrapping an entry in ServiceMetrics;
+# non-float fields (request_queue, last_deploy, status) are passed through unchanged.
+HEALTHY_METRICS: Dict[str, dict] = {
+    "ad_ranking": dict(
+        cpu_percent=12.0, memory_mb=256.0, error_rate=0.0,
+        p99_latency_ms=45.0, request_queue=3, last_deploy="2026-04-23 01:00 UTC",
+        status="healthy",
+    ),
+    "capi_pipeline": dict(
+        cpu_percent=8.0, memory_mb=180.0, error_rate=0.0,
+        p99_latency_ms=20.0, request_queue=0, last_deploy="2026-04-23 02:14 UTC",
+        status="healthy",
+    ),
+    "whatsapp_sync": dict(
+        cpu_percent=10.0, memory_mb=200.0, error_rate=0.0,
+        p99_latency_ms=35.0, request_queue=5, last_deploy="2026-04-22 18:30 UTC",
+        status="healthy",
+    ),
+}
+def _jitter(val: float, pct: float = 0.05) -> float:
+    return round(val * (1 + random.uniform(-pct, pct)), 2)
+class MetricsEngine:
+    """Generates per-step system metrics driven by task state."""
+    def __init__(self):
+        self._task_id: int = 0
+        self._fixed_services: set = set()
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+    def reset(self, task_id: int) -> None:
+        self._task_id = task_id
+        self._fixed_services.clear()
+    def mark_fixed(self, service: str) -> None:
+        """Record *service* as fixed; the per-task metric builders consult this set."""
+        self._fixed_services.add(service)
+    def mark_unfixed(self, service: str) -> None:
+        """Remove *service* from the fixed set; a service that was never marked is ignored."""
+        self._fixed_services.discard(service)
+    # ------------------------------------------------------------------
+    # Core metric generation
+    # ------------------------------------------------------------------
+    def get_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
+        builders = {
+            1: self._task1_metrics,
+            2: self._task2_metrics,
+            3: self._task3_metrics,
+            4: self._task4_metrics,
+            5: self._task5_metrics,
+        }
+        fn = builders.get(self._task_id, self._all_healthy)
+        return fn(step)
+    def _all_healthy(self, step: int) -> Dict[str, ServiceMetrics]:
+        return {
+            svc: ServiceMetrics(**{k: _jitter(v) if isinstance(v, float) else v
+                                   for k, v in metrics.items()})
+            for svc, metrics in HEALTHY_METRICS.items()
+        }
+    # ------------------------------------------------------------------
+    # Task 1 – ad_ranking crashes with AttributeError
+    # ------------------------------------------------------------------
+    def _task1_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
+        fixed = "ad_ranking" in self._fixed_services
+        return {
+            "ad_ranking": ServiceMetrics(
+                cpu_percent=_jitter(5.0 if fixed else 2.0),
+                memory_mb=_jitter(256.0),
+                error_rate=0.0 if fixed else _jitter(12.0),
+                p99_latency_ms=_jitter(45.0 if fixed else 0.0),
+                request_queue=3 if fixed else 0,
+                last_deploy="2026-04-23 02:14 UTC",
+                status="healthy" if fixed else "critical",
+            ),
+            "capi_pipeline": ServiceMetrics(**{
+                k: _jitter(v) if isinstance(v, float) else v
+                for k, v in HEALTHY_METRICS["capi_pipeline"].items()
+            }),
+            "whatsapp_sync": ServiceMetrics(**{
+                k: _jitter(v) if isinstance(v, float) else v
+                for k, v in HEALTHY_METRICS["whatsapp_sync"].items()
+            }),
+        }
+    # ------------------------------------------------------------------
+    # Task 2 – silent CAPI data corruption → ROAS degradation
+    # ------------------------------------------------------------------
+    def _task2_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
+        """Metrics for task 2: capi_pipeline always reports ``healthy`` with a
+        zero error rate, while ad_ranking reports ``degraded`` until
+        capi_pipeline is in ``self._fixed_services`` and ``step`` exceeds 2.
+        whatsapp_sync reports its ``HEALTHY_METRICS`` values.
+        """
+        # Recovery is only shown once the fix is recorded AND step is past 2.
+        ad_status = "degraded"
+        if "capi_pipeline" in self._fixed_services and step > 2:
+            ad_status = "healthy"
+        # No crash and a healthy status: the corruption is silent on purpose.
+        capi_pipeline = ServiceMetrics(
+            cpu_percent=_jitter(8.0),
+            memory_mb=_jitter(180.0),
+            error_rate=0.0,
+            p99_latency_ms=_jitter(20.0),
+            request_queue=0,
+            last_deploy="2026-04-23 02:14 UTC",
+            status="healthy",
+        )
+        # ROAS in custom_data would be degraded but is not visible in these fields.
+        ad_ranking = ServiceMetrics(
+            cpu_percent=_jitter(12.0),
+            memory_mb=_jitter(256.0),
+            error_rate=0.0,
+            p99_latency_ms=_jitter(45.0),
+            request_queue=3,
+            last_deploy="2026-04-22 18:00 UTC",
+            status=ad_status,
+        )
+        whatsapp_sync = ServiceMetrics(**{
+            name: _jitter(value) if isinstance(value, float) else value
+            for name, value in HEALTHY_METRICS["whatsapp_sync"].items()
+        })
+        return {
+            "capi_pipeline": capi_pipeline,
+            "ad_ranking": ad_ranking,
+            "whatsapp_sync": whatsapp_sync,
+        }
+    # ------------------------------------------------------------------
+    # Task 3 – memory leak in whatsapp_sync under load
+    # ------------------------------------------------------------------
+    def _task3_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
+        """Metrics for task 3: whatsapp_sync worsens as ``step`` grows until it
+        appears in ``self._fixed_services``; ad_ranking and capi_pipeline report
+        their ``HEALTHY_METRICS`` values.
+        """
+        def baseline(service: str) -> ServiceMetrics:
+            # Float fields go through _jitter; everything else is copied as-is.
+            return ServiceMetrics(**{
+                name: _jitter(value) if isinstance(value, float) else value
+                for name, value in HEALTHY_METRICS[service].items()
+            })
+        ad_ranking = baseline("ad_ranking")
+        capi_pipeline = baseline("capi_pipeline")
+        if "whatsapp_sync" in self._fixed_services:
+            whatsapp_sync = ServiceMetrics(
+                cpu_percent=_jitter(10.0),
+                memory_mb=_jitter(256.0),
+                error_rate=0.0,
+                p99_latency_ms=_jitter(35.0),
+                request_queue=5,
+                last_deploy="2026-04-22 18:30 UTC",
+                status="healthy",
+            )
+        else:
+            # Reported memory starts at 128 MB and climbs 50 MB per step,
+            # capped at 1800 MB; above 1200 MB the status turns critical.
+            leaked_mb = min(128.0 + (step * 50.0), 1800.0)
+            whatsapp_sync = ServiceMetrics(
+                cpu_percent=_jitter(min(15 + step * 3, 90)),
+                memory_mb=_jitter(leaked_mb),
+                # Errors only start to appear after step 3.
+                error_rate=_jitter(0.05 * max(step - 3, 0)),
+                p99_latency_ms=_jitter(min(35 + step * 80, 8000)),
+                request_queue=min(5 + step * 20, 500),
+                last_deploy="2026-04-22 18:30 UTC",
+                status="critical" if leaked_mb > 1200 else "degraded",
+            )
+        return {
+            "ad_ranking": ad_ranking,
+            "capi_pipeline": capi_pipeline,
+            "whatsapp_sync": whatsapp_sync,
+        }
+    # ------------------------------------------------------------------
+    # Task 4 – bad migration cascades to all three services
+    # ------------------------------------------------------------------
+    def _task4_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
+        """Metrics for task 4: all three services report elevated errors,
+        latency and queue depth until whatsapp_sync appears in
+        ``self._fixed_services``.  ``step`` is not used by this builder.
+        """
+        # The migration lives in whatsapp_sync, so that single entry in
+        # _fixed_services flips all three services back to healthy.
+        recovered = "whatsapp_sync" in self._fixed_services
+        downstream_status = "healthy" if recovered else "degraded"
+        ad_ranking = ServiceMetrics(
+            cpu_percent=_jitter(12.0),
+            memory_mb=_jitter(256.0),
+            error_rate=0.0 if recovered else _jitter(3.5),
+            p99_latency_ms=_jitter(45.0 if recovered else 2200.0),
+            request_queue=3 if recovered else 150,
+            last_deploy="2026-04-23 02:00 UTC",
+            status=downstream_status,
+        )
+        capi_pipeline = ServiceMetrics(
+            cpu_percent=_jitter(8.0),
+            memory_mb=_jitter(180.0),
+            error_rate=0.0 if recovered else _jitter(2.1),
+            p99_latency_ms=_jitter(20.0 if recovered else 1100.0),
+            request_queue=0 if recovered else 80,
+            last_deploy="2026-04-23 02:14 UTC",
+            status=downstream_status,
+        )
+        whatsapp_sync = ServiceMetrics(
+            cpu_percent=_jitter(10.0),
+            memory_mb=_jitter(200.0),
+            error_rate=0.0 if recovered else _jitter(8.0),
+            p99_latency_ms=_jitter(35.0 if recovered else 5000.0),
+            request_queue=5 if recovered else 400,
+            last_deploy="2026-04-23 02:14 UTC",
+            status="healthy" if recovered else "critical",
+        )
+        return {
+            "ad_ranking": ad_ranking,
+            "capi_pipeline": capi_pipeline,
+            "whatsapp_sync": whatsapp_sync,
+        }
+    # ------------------------------------------------------------------
+    # Task 5 – PII data-leak (metrics look fine but security tests fail)
+    # ------------------------------------------------------------------
+    def _task5_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
+        """Metrics for task 5: every service looks healthy.
+        capi_pipeline reports a zero error rate and ``healthy`` status whether
+        or not it has been fixed, so the metrics never reveal the incident.
+        ad_ranking and whatsapp_sync report their ``HEALTHY_METRICS`` values.
+        ``step`` is not used by this builder.
+        """
+        return {
+            "capi_pipeline": ServiceMetrics(
+                cpu_percent=_jitter(8.0),
+                memory_mb=_jitter(180.0),
+                error_rate=0.0,                 # no crash – silent security hole
+                p99_latency_ms=_jitter(20.0),
+                request_queue=0,
+                last_deploy="2026-04-23 02:14 UTC",
+                status="healthy",               # deliberately deceptive
+            ),
+            "ad_ranking": ServiceMetrics(**{
+                k: _jitter(v) if isinstance(v, float) else v
+                for k, v in HEALTHY_METRICS["ad_ranking"].items()
+            }),
+            "whatsapp_sync": ServiceMetrics(**{
+                k: _jitter(v) if isinstance(v, float) else v
+                for k, v in HEALTHY_METRICS["whatsapp_sync"].items()
+            }),
+        }
+    # ------------------------------------------------------------------
+    # Alerts
+    # ------------------------------------------------------------------
+    def get_alerts(self, step: int) -> List[Alert]:
+        """Return the alerts produced by the current task's builder at ``step``.
+        A ``_task_id`` without a registered builder yields an empty list.
+        """
+        per_task = {
+            1: self._task1_alerts,
+            2: self._task2_alerts,
+            3: self._task3_alerts,
+            4: self._task4_alerts,
+            5: self._task5_alerts,
+        }
+        builder = per_task.get(self._task_id)
+        if builder is None:
+            return []
+        return builder(step)
+    def _task1_alerts(self, step: int) -> List[Alert]:
+        """Return the single P0 crash alert for ad_ranking, or an empty list
+        once ad_ranking is in ``self._fixed_services``.  ``step`` is unused.
+        """
+        if "ad_ranking" not in self._fixed_services:
+            crash = Alert(
+                alert_id="ALT-001",
+                severity="P0",
+                service="ad_ranking",
+                message=(
+                    "AttributeError: 'dict' object has no attribute 'get_clicks' "
+                    "in ranker.py score_ads() — all ranking requests failing"
+                ),
+                triggered_at_step=0,
+                is_red_herring=False,
+            )
+            return [crash]
+        return []
+    def _task2_alerts(self, step: int) -> List[Alert]:
+        """Return the ROAS alert plus one red-herring alert, both attached to
+        ad_ranking, until capi_pipeline is in ``self._fixed_services``; after
+        that return an empty list.  ``step`` is unused.
+        """
+        if "capi_pipeline" in self._fixed_services:
+            return []
+        roas_drop = Alert(
+            alert_id="ALT-002",
+            severity="P1",
+            service="ad_ranking",
+            message="ROAS dropped 68% vs 7-day average — attribution model seeing events from 1970",
+            triggered_at_step=0,
+            is_red_herring=False,
+        )
+        # Red herring – ad_ranking looks degraded but it's CAPI's fault
+        memory_pressure = Alert(
+            alert_id="ALT-003",
+            severity="P2",
+            service="ad_ranking",
+            message="High memory pressure on ad-ranking pod — possible cache thrash",
+            triggered_at_step=0,
+            is_red_herring=True,
+        )
+        return [roas_drop, memory_pressure]
+    def _task3_alerts(self, step: int) -> List[Alert]:
+        """Return whatsapp_sync alerts for task 3, or an empty list once
+        whatsapp_sync is in ``self._fixed_services``.
+        The pool alert is always present and escalates from P1 to P0 at
+        step 4; a latency alert is added for every step above 3.
+        """
+        if "whatsapp_sync" in self._fixed_services:
+            return []
+        # NOTE(review): at step 0 the message reads "0/500 connections in use"
+        # while still saying the pool is exhausted, and triggered_at_step is 1
+        # although the alert is already returned at step 0 — confirm intended.
+        pool_alert = Alert(
+            alert_id="ALT-004",
+            severity="P1" if step < 4 else "P0",
+            service="whatsapp_sync",
+            message=f"DB connection pool exhausted ({min(step * 20, 500)}/500 connections in use) — sync requests queuing",
+            triggered_at_step=1,
+            is_red_herring=False,
+        )
+        if step <= 3:
+            return [pool_alert]
+        latency_alert = Alert(
+            alert_id="ALT-005",
+            severity="P1",
+            service="whatsapp_sync",
+            message="p99 latency > 5 s — SLA breach imminent",
+            triggered_at_step=4,
+            is_red_herring=False,
+        )
+        return [pool_alert, latency_alert]
+    def _task4_alerts(self, step: int) -> List[Alert]:
+        """Return the root-cause FK alert on whatsapp_sync plus two red-herring
+        alerts on ad_ranking and capi_pipeline, or an empty list once
+        whatsapp_sync is in ``self._fixed_services``.  ``step`` is unused.
+        """
+        if "whatsapp_sync" in self._fixed_services:
+            return []
+        root_cause = Alert(
+            alert_id="ALT-006",
+            severity="P0",
+            service="whatsapp_sync",
+            message="FK violation: insert into user_preferences fails — migration 003 circular FK",
+            triggered_at_step=0,
+            is_red_herring=False,
+        )
+        # Red herrings – symptoms of the underlying migration failure
+        rank_errors = Alert(
+            alert_id="ALT-007",
+            severity="P1",
+            service="ad_ranking",
+            message="High error rate on /rank endpoint — upstream DB errors propagating",
+            triggered_at_step=0,
+            is_red_herring=True,
+        )
+        ingest_latency = Alert(
+            alert_id="ALT-008",
+            severity="P1",
+            service="capi_pipeline",
+            message="Event ingest latency spike — shared DB pool contention",
+            triggered_at_step=0,
+            is_red_herring=True,
+        )
+        return [root_cause, rank_errors, ingest_latency]
+    def _task5_alerts(self, step: int) -> List[Alert]:
+        """Return the P0 security alert and one red-herring memory alert for
+        capi_pipeline, or an empty list once capi_pipeline is in
+        ``self._fixed_services``.  ``step`` is unused.
+        """
+        if "capi_pipeline" in self._fixed_services:
+            return []
+        pii_exposure = Alert(
+            alert_id="ALT-009",
+            severity="P0",
+            service="capi_pipeline",
+            message=(
+                "SECURITY: Unusual response payload size on /ingest (avg 14 KB vs 0.2 KB) "
+                "— possible PII exposure in debug response body"
+            ),
+            triggered_at_step=0,
+            is_red_herring=False,
+        )
+        # Flagged as a red herring via is_red_herring=True.
+        memory_noise = Alert(
+            alert_id="ALT-010",
+            severity="P2",
+            service="capi_pipeline",
+            message="Slightly elevated memory on ingestor pod — likely buffer growth",
+            triggered_at_step=0,
+            is_red_herring=True,
+        )
+        return [pii_exposure, memory_noise]
+    # ------------------------------------------------------------------
+    # Terminal output (simulated stack traces / logs)
+    # ------------------------------------------------------------------
+    def get_terminal_output(self, step: int, last_test_result: Optional[str] = None) -> str:
+        """Return the simulated terminal text for the current task.
+        If ``last_test_result`` is a non-empty string it is returned verbatim;
+        otherwise the canned log for ``self._task_id`` is returned, falling back
+        to an "all systems operational" line for unknown task ids.
+        ``step`` is not used: the canned logs do not vary per step.
+        """
+        if last_test_result:
+            return last_test_result
+        outputs = {
+            1: (
+                "Traceback (most recent call last):\n"
+                "  File 'ad_ranking/ranker.py', line 22, in score_ads\n"
+                "    click_rate = ad.get_clicks() / max(ad.get('impressions', 1), 1)\n"
+                "AttributeError: 'dict' object has no attribute 'get_clicks'\n"
+                "[CRITICAL] /rank endpoint returning 500 for all requests"
+            ),
+            2: (
+                "[WARNING] ad_ranking: ROAS attribution anomaly detected\n"
+                "  Expected event_time range: 1700000000 – 1745500000\n"
+                # Both bounds are the expected values integer-divided by 1000
+                # (1700000000 // 1000 == 1700000), matching the 1970-01-20 line.
+                "  Actual event_time range:   1700000 – 1745500  (← timestamps in seconds / 1000!)\n"
+                "[INFO] capi_pipeline: All unit tests PASS\n"
+                "[INFO] capi_pipeline: Throughput 12,000 events/s — nominal\n"
+                "[WARNING] ad_ranking: Conversion window showing data from 1970-01-20"
+            ),
+            3: (
+                "[INFO] whatsapp_sync: process_queue started\n"
+                "[ERROR] asyncpg.exceptions.TooManyConnectionsError: "
+                "connection pool exhausted (max=100)\n"
+                "  Traceback: handler.py:sync_user_messages — acquire() blocked\n"
+                "[ERROR] Sync request for user 8841923 timed out after 30s\n"
+                "[CRITICAL] 487 pending sync requests queued"
+            ),
+            4: (
+                "[ERROR] asyncpg.exceptions.ForeignKeyViolationError:\n"
+                "  insert into user_preferences violates FK constraint "
+                "\"user_preferences_user_id_fkey\"\n"
+                "  DETAIL: Key (user_id)=(48291) is not present in table \"users\".\n"
+                "[ERROR] whatsapp_sync: message thread creation failing\n"
+                "[WARNING] ad_ranking: upstream DB pool returning errors\n"
+                "[WARNING] capi_pipeline: event association latency +340ms\n"
+                "  [HINT] Last DB migration was version 003 at 02:14 UTC today"
+            ),
+            5: (
+                "[SECURITY SCAN] capi_pipeline /ingest endpoint\n"
+                "  Response body contains keys: ['status', 'processed', 'debug_data']\n"
+                "  debug_data.user_emails contains raw PII hashes + plaintext fields\n"
+                "  debug_data.raw_payload contains full user submission data\n"
+                "[FAIL] Security test suite: test_no_pii_in_response FAILED\n"
+                "[INFO] Unit tests: all PASSING — bug invisible to standard tests"
+            ),
+        }
+        return outputs.get(self._task_id, "[INFO] All systems operational")
+# ---------------------------------------------------------------------------
+# Difficulty Controller (Theme 4 – Self-Improvement Loop)
+# ---------------------------------------------------------------------------
+class DifficultyController:
+    """
+    After each episode, analyse which bug categories the agent failed on.
+    Weight those categories higher so the next generated episode targets
+    the agent's current weaknesses.
+    """
+    # Task id -> name of the attribute on ``self.state`` that holds its weight.
+    BUG_CATEGORY_MAP: Dict[int, str] = {
+        1: "data_corruption",       # hallucinated attribute
+        2: "data_corruption",       # silent timestamp corruption
+        3: "async_bugs",            # connection leak
+        4: "red_herrings",          # cascading failure + red herrings
+        5: "security_bugs",         # PII leak
+    }
+    def __init__(self):
+        self.state = DifficultyState()
+    def update(self, task_id: int, normalized_score: float) -> None:
+        """Adjust the weight of the task's bug category from an episode score.
+        Scores below 0.5 multiply the weight by 1.25 (capped at 3.0); scores
+        above 0.8 multiply it by 0.9 (floored at 0.3).  Scores in between, and
+        task ids missing from ``BUG_CATEGORY_MAP``, leave the state unchanged.
+        """
+        category = self.BUG_CATEGORY_MAP.get(task_id)
+        if category is None:
+            return
+        current = getattr(self.state, category)
+        if normalized_score < 0.5:
+            # Agent struggled – raise difficulty weight
+            setattr(self.state, category, min(current * 1.25, 3.0))
+        elif normalized_score > 0.8:
+            # Agent mastered it – slightly reduce weight
+            setattr(self.state, category, max(current * 0.9, 0.3))
+    def next_task_id(self) -> int:
+        """Sample the next task id, weighted by each task's category weight.
+        Tasks and their categories come from ``BUG_CATEGORY_MAP``, so tasks
+        that share a category each carry that category's full weight.  If the
+        weights do not sum to a positive number the task is picked uniformly
+        instead of failing.
+        """
+        import random
+        task_ids = sorted(self.BUG_CATEGORY_MAP)
+        task_weights = [
+            getattr(self.state, self.BUG_CATEGORY_MAP[task_id])
+            for task_id in task_ids
+        ]
+        if sum(task_weights) <= 0:
+            # random.choices rejects an all-zero weight vector.
+            return random.choice(task_ids)
+        # random.choices takes relative weights; no manual normalisation needed.
+        return random.choices(task_ids, weights=task_weights, k=1)[0]
+    def weakness_tags(self) -> List[str]:
+        """Return the names of all state fields whose value is above 0.7."""
+        # NOTE(review): relies on ``self.state.dict()`` returning only numeric
+        # fields — confirm against the DifficultyState definition.
+        d = self.state.dict()
+        return [k for k, v in d.items() if v > 0.7]

app/engine/sandbox.py ADDED Viewed

	@@ -0,0 +1,1040 @@

+"""
+Layer 1 – Virtual File System.
+Stores all service codebases as in-memory strings keyed by
+(service, filename).  Every task starts from a clean snapshot of
+its own buggy codebase; edits accumulate on top of that snapshot.
+"""
+from __future__ import annotations
+import copy
+from typing import Dict, List, Optional, Tuple
+# ---------------------------------------------------------------------------
+# Buggy source code snapshots – one per task
+# Each task mutates only the files it needs; unchanged files are shared via
+# SHARED_FILES and merged in at reset() time.
+# ---------------------------------------------------------------------------
+# Source files that are identical across tasks, keyed by service name and then
+# by filename.  Values are the file contents as plain strings.
+SHARED_FILES: Dict[str, Dict[str, str]] = {
+    "ad_ranking": {
+        "utils.py": """\
+from typing import Dict, List
+def normalize_scores(ads: List[Dict]) -> List[Dict]:
+    if not ads:
+        return ads
+    max_score = max(ad['score'] for ad in ads)
+    min_score = min(ad['score'] for ad in ads)
+    score_range = max_score - min_score or 1.0
+    return [
+        {**ad, 'normalized_score': (ad['score'] - min_score) / score_range}
+        for ad in ads
+    ]
+def filter_by_budget(ads: List[Dict], daily_budget_cents: int) -> List[Dict]:
+    return [ad for ad in ads if ad.get('spend_today_cents', 0) < daily_budget_cents]
+def compute_roas(revenue: float, spend: float) -> float:
+    return revenue / spend if spend > 0 else 0.0
+""",
+        "models.py": """\
+from dataclasses import dataclass, field
+from typing import List, Optional
+@dataclass
+class Ad:
+    ad_id: str
+    campaign_id: str
+    category: str
+    target_age: str
+    clicks: int = 0
+    impressions: int = 0
+    spend_today_cents: int = 0
+    active: bool = True
+    score: float = 0.0
+@dataclass
+class UserContext:
+    user_id: str
+    interest: str
+    age_group: str
+    country: str
+""",
+    },
+    "capi_pipeline": {
+        # validate_event is annotated with Tuple, so Tuple must be imported
+        # or the file raises NameError as soon as it is executed.
+        "validator.py": """\
+from typing import Dict, Any, Tuple
+REQUIRED_FIELDS = {'event_name', 'event_time', 'event_id'}
+def validate_event(event: Dict[str, Any]) -> Tuple[bool, str]:
+    missing = REQUIRED_FIELDS - set(event.keys())
+    if missing:
+        return False, f'Missing fields: {missing}'
+    if not isinstance(event.get('event_time'), (int, float)):
+        return False, 'event_time must be numeric'
+    return True, 'ok'
+""",
+    },
+    "whatsapp_sync": {
+        "models.py": """\
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class Message:
+    id: int
+    user_id: int
+    sender_id: int
+    content: str
+    timestamp: int
+    synced: bool = False
+    thread_id: Optional[int] = None
+""",
+    },
+}
+# ---------------------------------------------------------------------------
+# Task-specific buggy snapshots
+# ---------------------------------------------------------------------------
+TASK_SNAPSHOTS: Dict[int, Dict[str, Dict[str, str]]] = {
+    # ------------------------------------------------------------------
+    # Task 1 – Easy: Hallucinated attribute (ad.get_clicks())
+    # ------------------------------------------------------------------
+    1: {
+        "ad_ranking": {
+            "ranker.py": """\
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class AdRanker:
+    \"\"\"Scores and ranks candidate ads for a user.\"\"\"
+    def __init__(self, api_client):
+        self.api = api_client
+        self.model_version = "v2.3.1"
+        self._cache = {}
+    def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
+        ads = self.api.get_all_ads(user_id)
+        return [ad for ad in ads if ad.get('active', False)]
+    def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
+        scored = []
+        for ad in ads:
+            click_rate = ad.get_clicks() / max(ad.get('impressions', 1), 1)
+            relevance = self._compute_relevance(ad, user_context)
+            score = (click_rate * 0.4) + (relevance * 0.6)
+            scored.append({**ad, 'score': round(score, 4)})
+        return sorted(scored, key=lambda x: x['score'], reverse=True)
+    def _compute_relevance(self, ad: Dict, context: Dict) -> float:
+        category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
+        age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
+        return round((category_match + age_match) / 2.0, 4)
+    def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
+        candidates = self.fetch_candidate_ads(user_id)
+        if not candidates:
+            logger.warning(f"No candidates for user {user_id}")
+            return []
+        return self.score_ads(candidates, user_context)
+""",
+        },
+    },
+    # ------------------------------------------------------------------
+    # Task 2 – Medium: Silent timestamp corruption in CAPI → bad ROAS
+    # ------------------------------------------------------------------
+    2: {
+        "capi_pipeline": {
+            "transformer.py": """\
+import logging
+from typing import Dict, Any, List
+from datetime import datetime
+logger = logging.getLogger(__name__)
+class EventTransformer:
+    \"\"\"Transforms raw CAPI events into normalised format.\"\"\"
+    SUPPORTED_EVENTS = {
+        'Purchase', 'AddToCart', 'ViewContent', 'Lead', 'CompleteRegistration'
+    }
+    def __init__(self):
+        self._processed_count = 0
+    def transform(self, raw_event: Dict[str, Any]) -> Dict[str, Any]:
+        if raw_event.get('event_name') not in self.SUPPORTED_EVENTS:
+            logger.warning(f"Unknown event type: {raw_event.get('event_name')}")
+            return None
+        event_time = self._normalize_timestamp(raw_event.get('event_time', 0))
+        transformed = {
+            'event_id':    raw_event.get('event_id'),
+            'event_name':  raw_event.get('event_name'),
+            'event_time':  event_time,
+            'user_data':   self._hash_user_data(raw_event.get('user_data', {})),
+            'custom_data': raw_event.get('custom_data', {}),
+            'processed_at': int(datetime.utcnow().timestamp()),
+        }
+        self._processed_count += 1
+        return transformed
+    def _normalize_timestamp(self, ts: Any) -> int:
+        \"\"\"Normalise event timestamp to Unix seconds.\"\"\"
+        ts = int(ts)
+        # BUG: threshold is 1_000_000_000 (10 digits) instead of
+        #      1_000_000_000_000 (13 digits for milliseconds).
+        #      A normal unix-second timestamp like 1_700_000_000 passes the
+        #      condition and gets divided by 1000 → year ~1970+20 days.
+        if ts > 1_000_000_000:
+            return ts // 1000
+        return ts
+    def _hash_user_data(self, user_data: Dict) -> Dict:
+        import hashlib
+        hashed = {}
+        for key, val in user_data.items():
+            if key in ('email', 'phone', 'fn', 'ln'):
+                hashed[key] = hashlib.sha256(
+                    str(val).lower().encode()
+                ).hexdigest()
+            else:
+                hashed[key] = val
+        return hashed
+    def batch_transform(self, events: List[Dict]) -> List[Dict]:
+        return [t for e in events if (t := self.transform(e)) is not None]
+""",
+            "ingestor.py": """\
+import logging
+from typing import Dict, Any
+from .transformer import EventTransformer
+logger = logging.getLogger(__name__)
+class EventIngestor:
+    \"\"\"Ingests and validates CAPI event payloads.\"\"\"
+    def __init__(self, transformer: EventTransformer):
+        self.transformer = transformer
+        self._event_buffer = []
+    def ingest(self, raw_payload: Dict[str, Any]) -> Dict[str, Any]:
+        try:
+            events = raw_payload.get('data', [])
+            if not events:
+                return {'status': 'error', 'message': 'No events in payload'}
+            transformed = self.transformer.batch_transform(events)
+            self._event_buffer.extend(transformed)
+            return {'status': 'ok', 'processed': len(transformed)}
+        except Exception as e:
+            logger.error(f"Ingest failed: {e}", exc_info=True)
+            return {'status': 'error', 'message': str(e)}
+    def flush(self) -> int:
+        count = len(self._event_buffer)
+        self._event_buffer.clear()
+        logger.info(f"Flushed {count} events")
+        return count
+""",
+        },
+        "ad_ranking": {
+            "ranker.py": """\
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class AdRanker:
+    \"\"\"Scores and ranks candidate ads for a user.\"\"\"
+    def __init__(self, api_client):
+        self.api = api_client
+        self.model_version = "v2.3.1"
+    def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
+        ads = self.api.get_all_ads(user_id)
+        return [ad for ad in ads if ad.get('active', False)]
+    def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
+        scored = []
+        for ad in ads:
+            click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)
+            relevance = self._compute_relevance(ad, user_context)
+            score = (click_rate * 0.4) + (relevance * 0.6)
+            scored.append({**ad, 'score': round(score, 4)})
+        return sorted(scored, key=lambda x: x['score'], reverse=True)
+    def _compute_relevance(self, ad: Dict, context: Dict) -> float:
+        category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
+        age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
+        return round((category_match + age_match) / 2.0, 4)
+    def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
+        candidates = self.fetch_candidate_ads(user_id)
+        if not candidates:
+            logger.warning(f"No candidates for user {user_id}")
+            return []
+        return self.score_ads(candidates, user_context)
+""",
+        },
+    },
+    # ------------------------------------------------------------------
+    # Task 3 – Medium-Hard: DB connection leak in WhatsApp sync handler
+    # ------------------------------------------------------------------
+    3: {
+        "whatsapp_sync": {
+            "handler.py": """\
+import asyncio
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class MessageSyncHandler:
+    \"\"\"Handles real-time WhatsApp message synchronisation.\"\"\"
+    def __init__(self, db_pool, message_queue):
+        self.db_pool = db_pool
+        self.queue = message_queue
+        self._sync_count = 0
+    async def sync_user_messages(self, user_id: str) -> List[Dict]:
+        \"\"\"Fetch and mark-as-synced all pending messages for a user.\"\"\"
+        conn = await self.db_pool.acquire()
+        try:
+            messages = await conn.fetch(
+                "SELECT id, content, sender_id, timestamp "
+                "FROM messages WHERE user_id = $1 AND synced = FALSE "
+                "ORDER BY timestamp",
+                user_id,
+            )
+            processed = []
+            for msg in messages:
+                await conn.execute(
+                    "UPDATE messages SET synced = TRUE WHERE id = $1",
+                    msg['id'],
+                )
+                processed.append(dict(msg))
+            self._sync_count += len(processed)
+            return processed
+        except Exception as e:
+            logger.error(f"Sync failed for user {user_id}: {e}")
+            raise
+        # BUG: missing `finally: await self.db_pool.release(conn)`
+        # Under load the pool exhausts → all sync requests hang indefinitely.
+    async def process_queue(self, batch_size: int = 50) -> int:
+        processed = 0
+        while processed < batch_size:
+            try:
+                user_id = await asyncio.wait_for(
+                    self.queue.get(), timeout=1.0
+                )
+                await self.sync_user_messages(user_id)
+                processed += 1
+            except asyncio.TimeoutError:
+                break
+        return processed
+""",
+            "db.py": """\
+import logging
+from typing import Dict, List
+logger = logging.getLogger(__name__)
+MIGRATIONS: List[Dict] = [
+    {
+        "version": "001",
+        "description": "Create messages table",
+        "up": (
+            "CREATE TABLE messages ("
+            "    id SERIAL PRIMARY KEY,"
+            "    user_id INTEGER NOT NULL,"
+            "    content TEXT,"
+            "    sender_id INTEGER,"
+            "    timestamp BIGINT,"
+            "    synced BOOLEAN DEFAULT FALSE"
+            ");"
+        ),
+    },
+]
+class MigrationRunner:
+    def __init__(self, db_conn):
+        self.conn = db_conn
+        self._applied: List[str] = []
+    async def apply(self, migration: Dict) -> bool:
+        await self.conn.execute(migration['up'])
+        self._applied.append(migration['version'])
+        logger.info(f"Applied migration {migration['version']}")
+        return True
+""",
+        },
+    },
+    # ------------------------------------------------------------------
+    # Task 4 – Hard: Red-herring cascade from a bad DB migration (003)
+    # ------------------------------------------------------------------
+    4: {
+        "whatsapp_sync": {
+            "db.py": """\
+import logging
+from typing import Dict, List
+logger = logging.getLogger(__name__)
+# Migration 003 introduces a circular FK:
+#   message_threads.parent_message_id → messages.id
+#   messages.thread_id                → message_threads.id
+# PostgreSQL refuses the self-referential constraint during ALTER TABLE,
+# causing FK violation errors that cascade to all consumers of both tables.
+MIGRATIONS: List[Dict] = [
+    {
+        "version": "001",
+        "description": "Create messages table",
+        "up": (
+            "CREATE TABLE IF NOT EXISTS messages ("
+            "    id SERIAL PRIMARY KEY,"
+            "    user_id INTEGER NOT NULL,"
+            "    content TEXT,"
+            "    sender_id INTEGER,"
+            "    timestamp BIGINT,"
+            "    synced BOOLEAN DEFAULT FALSE"
+            ");"
+        ),
+    },
+    {
+        "version": "002",
+        "description": "Add user preferences",
+        "up": (
+            "CREATE TABLE IF NOT EXISTS user_preferences ("
+            "    id SERIAL PRIMARY KEY,"
+            "    user_id INTEGER NOT NULL,"
+            "    notification_enabled BOOLEAN DEFAULT TRUE,"
+            "    sync_frequency INTEGER DEFAULT 30"
+            ");"
+        ),
+    },
+    {
+        "version": "003",
+        "description": "Add message threads with back-reference",
+        "up": (
+            "CREATE TABLE IF NOT EXISTS message_threads ("
+            "    id SERIAL PRIMARY KEY,"
+            "    parent_message_id INTEGER REFERENCES messages(id) ON DELETE CASCADE,"
+            "    participant_ids INTEGER[] NOT NULL,"
+            "    created_at BIGINT"
+            ");"
+            "ALTER TABLE messages"
+            "    ADD COLUMN thread_id INTEGER REFERENCES message_threads(id);"
+        ),
+        # BUG: circular FK — messages → message_threads → messages
+        # Fix: remove the ALTER TABLE line (messages should NOT reference threads)
+    },
+]
+class MigrationRunner:
+    def __init__(self, db_conn):
+        self.conn = db_conn
+        self._applied: List[str] = []
+    async def apply(self, migration: Dict) -> bool:
+        await self.conn.execute(migration['up'])
+        self._applied.append(migration['version'])
+        logger.info(f"Applied migration {migration['version']}: {migration['description']}")
+        return True
+    async def rollback_version(self, version: str) -> bool:
+        logger.warning(f"Rolling back migration {version}")
+        self._applied = [v for v in self._applied if v != version]
+        return True
+    async def run_all(self):
+        for migration in MIGRATIONS:
+            await self.apply(migration)
+""",
+            "handler.py": """\
+import asyncio
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class MessageSyncHandler:
+    def __init__(self, db_pool, message_queue):
+        self.db_pool = db_pool
+        self.queue = message_queue
+        self._sync_count = 0
+    async def sync_user_messages(self, user_id: str) -> List[Dict]:
+        conn = await self.db_pool.acquire()
+        try:
+            messages = await conn.fetch(
+                "SELECT id, content, sender_id, timestamp "
+                "FROM messages WHERE user_id = $1 AND synced = FALSE "
+                "ORDER BY timestamp",
+                user_id,
+            )
+            processed = []
+            for msg in messages:
+                await conn.execute(
+                    "UPDATE messages SET synced = TRUE WHERE id = $1",
+                    msg['id'],
+                )
+                processed.append(dict(msg))
+            self._sync_count += len(processed)
+            return processed
+        except Exception as e:
+            logger.error(f"Sync failed for user {user_id}: {e}")
+            raise
+        finally:
+            await self.db_pool.release(conn)
+    async def process_queue(self, batch_size: int = 50) -> int:
+        processed = 0
+        while processed < batch_size:
+            try:
+                user_id = await asyncio.wait_for(
+                    self.queue.get(), timeout=1.0
+                )
+                await self.sync_user_messages(user_id)
+                processed += 1
+            except asyncio.TimeoutError:
+                break
+        return processed
+""",
+        },
+        "capi_pipeline": {
+            "ingestor.py": """\
+import logging
+from typing import Dict, Any
+from .transformer import EventTransformer
+logger = logging.getLogger(__name__)
+class EventIngestor:
+    def __init__(self, transformer: EventTransformer):
+        self.transformer = transformer
+        self._event_buffer = []
+    def ingest(self, raw_payload: Dict[str, Any]) -> Dict[str, Any]:
+        try:
+            events = raw_payload.get('data', [])
+            if not events:
+                return {'status': 'error', 'message': 'No events in payload'}
+            transformed = self.transformer.batch_transform(events)
+            self._event_buffer.extend(transformed)
+            return {'status': 'ok', 'processed': len(transformed)}
+        except Exception as e:
+            logger.error(f"Ingest failed: {e}", exc_info=True)
+            return {'status': 'error', 'message': str(e)}
+    def flush(self) -> int:
+        count = len(self._event_buffer)
+        self._event_buffer.clear()
+        return count
+""",
+            "transformer.py": """\
+import logging
+from typing import Dict, Any, List
+from datetime import datetime
+logger = logging.getLogger(__name__)
+class EventTransformer:
+    SUPPORTED_EVENTS = {
+        'Purchase', 'AddToCart', 'ViewContent', 'Lead', 'CompleteRegistration'
+    }
+    def __init__(self):
+        self._processed_count = 0
+    def transform(self, raw_event: Dict[str, Any]) -> Dict[str, Any]:
+        if raw_event.get('event_name') not in self.SUPPORTED_EVENTS:
+            return None
+        event_time = self._normalize_timestamp(raw_event.get('event_time', 0))
+        transformed = {
+            'event_id':    raw_event.get('event_id'),
+            'event_name':  raw_event.get('event_name'),
+            'event_time':  event_time,
+            'user_data':   raw_event.get('user_data', {}),
+            'custom_data': raw_event.get('custom_data', {}),
+            'processed_at': int(datetime.utcnow().timestamp()),
+        }
+        self._processed_count += 1
+        return transformed
+    def _normalize_timestamp(self, ts: Any) -> int:
+        ts = int(ts)
+        if ts > 1_000_000_000_000:
+            return ts // 1000
+        return ts
+    def batch_transform(self, events: List[Dict]) -> List[Dict]:
+        return [t for e in events if (t := self.transform(e)) is not None]
+""",
+        },
+        "ad_ranking": {
+            "ranker.py": """\
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class AdRanker:
+    def __init__(self, api_client):
+        self.api = api_client
+        self.model_version = "v2.3.1"
+    def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
+        ads = self.api.get_all_ads(user_id)
+        return [ad for ad in ads if ad.get('active', False)]
+    def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
+        scored = []
+        for ad in ads:
+            click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)
+            relevance = self._compute_relevance(ad, user_context)
+            score = (click_rate * 0.4) + (relevance * 0.6)
+            scored.append({**ad, 'score': round(score, 4)})
+        return sorted(scored, key=lambda x: x['score'], reverse=True)
+    def _compute_relevance(self, ad: Dict, context: Dict) -> float:
+        category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
+        age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
+        return round((category_match + age_match) / 2.0, 4)
+    def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
+        candidates = self.fetch_candidate_ads(user_id)
+        if not candidates:
+            return []
+        return self.score_ads(candidates, user_context)
+""",
+        },
+    },
+    # ------------------------------------------------------------------
+    # Task 5 – Hard: PII data-leak via DEBUG_MODE=True in production
+    # ------------------------------------------------------------------
+    5: {
+        "capi_pipeline": {
+            "ingestor.py": """\
+import logging
+from typing import Dict, Any
+from .transformer import EventTransformer
+logger = logging.getLogger(__name__)
+DEBUG_MODE = True   # BUG: must be False in production – leaks raw user PII
+class EventIngestor:
+    \"\"\"Ingests and validates CAPI event payloads.\"\"\"
+    def __init__(self, transformer: EventTransformer):
+        self.transformer = transformer
+        self._event_buffer = []
+    def ingest(self, raw_payload: Dict[str, Any]) -> Dict[str, Any]:
+        try:
+            events = raw_payload.get('data', [])
+            if not events:
+                return {'status': 'error', 'message': 'No events in payload'}
+            transformed = self.transformer.batch_transform(events)
+            self._event_buffer.extend(transformed)
+            if DEBUG_MODE:
+                # SECURITY BUG: exposes raw PII (emails, phone numbers) in the
+                # HTTP response – visible in CDN logs, browser network tabs, etc.
+                return {
+                    'status': 'ok',
+                    'processed': len(transformed),
+                    'debug_data': {
+                        'raw_payload':  raw_payload,
+                        'user_emails':  [e.get('user_data', {}) for e in events],
+                        'buffer_state': self._event_buffer,
+                    },
+                }
+            return {'status': 'ok', 'processed': len(transformed)}
+        except Exception as e:
+            logger.error(f"Ingest failed: {e}", exc_info=True)
+            return {'status': 'error', 'message': str(e)}
+    def flush(self) -> int:
+        count = len(self._event_buffer)
+        self._event_buffer.clear()
+        logger.info(f"Flushed {count} events")
+        return count
+""",
+            "transformer.py": """\
+import logging
+from typing import Dict, Any, List
+from datetime import datetime
+logger = logging.getLogger(__name__)
+class EventTransformer:
+    SUPPORTED_EVENTS = {
+        'Purchase', 'AddToCart', 'ViewContent', 'Lead', 'CompleteRegistration'
+    }
+    def __init__(self):
+        self._processed_count = 0
+    def transform(self, raw_event: Dict[str, Any]) -> Dict[str, Any]:
+        if raw_event.get('event_name') not in self.SUPPORTED_EVENTS:
+            return None
+        event_time = self._normalize_timestamp(raw_event.get('event_time', 0))
+        transformed = {
+            'event_id':    raw_event.get('event_id'),
+            'event_name':  raw_event.get('event_name'),
+            'event_time':  event_time,
+            'user_data':   self._hash_user_data(raw_event.get('user_data', {})),
+            'custom_data': raw_event.get('custom_data', {}),
+            'processed_at': int(datetime.utcnow().timestamp()),
+        }
+        self._processed_count += 1
+        return transformed
+    def _normalize_timestamp(self, ts: Any) -> int:
+        ts = int(ts)
+        if ts > 1_000_000_000_000:
+            return ts // 1000
+        return ts
+    def _hash_user_data(self, user_data: Dict) -> Dict:
+        import hashlib
+        hashed = {}
+        for key, val in user_data.items():
+            if key in ('email', 'phone', 'fn', 'ln'):
+                hashed[key] = hashlib.sha256(
+                    str(val).lower().encode()
+                ).hexdigest()
+            else:
+                hashed[key] = val
+        return hashed
+    def batch_transform(self, events: List[Dict]) -> List[Dict]:
+        return [t for e in events if (t := self.transform(e)) is not None]
+""",
+        },
+        "ad_ranking": {
+            "ranker.py": """\
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class AdRanker:
+    def __init__(self, api_client):
+        self.api = api_client
+        self.model_version = "v2.3.1"
+    def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
+        ads = self.api.get_all_ads(user_id)
+        return [ad for ad in ads if ad.get('active', False)]
+    def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
+        scored = []
+        for ad in ads:
+            click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)
+            relevance = self._compute_relevance(ad, user_context)
+            score = (click_rate * 0.4) + (relevance * 0.6)
+            scored.append({**ad, 'score': round(score, 4)})
+        return sorted(scored, key=lambda x: x['score'], reverse=True)
+    def _compute_relevance(self, ad: Dict, context: Dict) -> float:
+        category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
+        age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
+        return round((category_match + age_match) / 2.0, 4)
+    def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
+        candidates = self.fetch_candidate_ads(user_id)
+        if not candidates:
+            return []
+        return self.score_ads(candidates, user_context)
+""",
+        },
+        "whatsapp_sync": {
+            "handler.py": """\
+import asyncio
+import logging
+from typing import List, Dict
+logger = logging.getLogger(__name__)
+class MessageSyncHandler:
+    def __init__(self, db_pool, message_queue):
+        self.db_pool = db_pool
+        self.queue = message_queue
+        self._sync_count = 0
+    async def sync_user_messages(self, user_id: str) -> List[Dict]:
+        conn = await self.db_pool.acquire()
+        try:
+            messages = await conn.fetch(
+                "SELECT id, content, sender_id, timestamp "
+                "FROM messages WHERE user_id = $1 AND synced = FALSE "
+                "ORDER BY timestamp",
+                user_id,
+            )
+            processed = []
+            for msg in messages:
+                await conn.execute(
+                    "UPDATE messages SET synced = TRUE WHERE id = $1",
+                    msg['id'],
+                )
+                processed.append(dict(msg))
+            self._sync_count += len(processed)
+            return processed
+        except Exception as e:
+            logger.error(f"Sync failed for user {user_id}: {e}")
+            raise
+        finally:
+            await self.db_pool.release(conn)
+    async def process_queue(self, batch_size: int = 50) -> int:
+        processed = 0
+        while processed < batch_size:
+            try:
+                user_id = await asyncio.wait_for(
+                    self.queue.get(), timeout=1.0
+                )
+                await self.sync_user_messages(user_id)
+                processed += 1
+            except asyncio.TimeoutError:
+                break
+        return processed
+""",
+            "db.py": """\
+import logging
+from typing import Dict, List
+logger = logging.getLogger(__name__)
+MIGRATIONS: List[Dict] = [
+    {
+        "version": "001",
+        "description": "Create messages table",
+        "up": (
+            "CREATE TABLE IF NOT EXISTS messages ("
+            "    id SERIAL PRIMARY KEY,"
+            "    user_id INTEGER NOT NULL,"
+            "    content TEXT,"
+            "    sender_id INTEGER,"
+            "    timestamp BIGINT,"
+            "    synced BOOLEAN DEFAULT FALSE"
+            ");"
+        ),
+    },
+]
+class MigrationRunner:
+    def __init__(self, db_conn):
+        self.conn = db_conn
+        self._applied: List[str] = []
+    async def apply(self, migration: Dict) -> bool:
+        await self.conn.execute(migration['up'])
+        self._applied.append(migration['version'])
+        return True
+""",
+        },
+    },
+}
+# ---------------------------------------------------------------------------
+# VirtualFileSystem
+# ---------------------------------------------------------------------------
+class EditRecord:
+    """One single-line edit made through VirtualFileSystem.edit_line.
+    Stores the step at which the edit happened, the file it touched, the
+    zero-based index of the replaced line, and the line text before/after.
+    """
+    # __slots__ fixes the attribute set, so instances carry no per-object __dict__.
+    __slots__ = ("step", "service", "filename", "line_idx", "old_code", "new_code")
+    def __init__(self, step, service, filename, line_idx, old_code, new_code):
+        self.step = step
+        self.service = service
+        self.filename = filename
+        self.line_idx = line_idx
+        self.old_code = old_code
+        self.new_code = new_code
+    def to_dict(self):
+        """Return the record as a plain dict, exposing the line as 1-based ``line_number``."""
+        return {
+            "step": self.step,
+            "service": self.service,
+            "filename": self.filename,
+            "line_number": self.line_idx + 1,  # stored 0-based, reported 1-based
+            "old_code": self.old_code,
+            "new_code": self.new_code,
+        }
+class VirtualFileSystem:
+    """In-memory multi-service file system with history tracking."""
+    def __init__(self):
+        self._files: Dict[str, Dict[str, str]] = {}
+        self._history: List[EditRecord] = []
+        self._task_id: int = 0
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+    def reset(self, task_id: int) -> None:
+        """Load the buggy snapshot for a specific task."""
+        self._task_id = task_id
+        self._history.clear()
+        snapshot = TASK_SNAPSHOTS.get(task_id, {})
+        # Start from shared base, then overlay task-specific files
+        merged: Dict[str, Dict[str, str]] = {}
+        for service, files in SHARED_FILES.items():
+            merged[service] = dict(files)
+        for service, files in snapshot.items():
+            if service not in merged:
+                merged[service] = {}
+            merged[service].update(files)
+        self._files = merged
+    # ------------------------------------------------------------------
+    # Read
+    # ------------------------------------------------------------------
+    def list_files(self, service: str) -> List[str]:
+        return sorted(self._files.get(service, {}).keys())
+    def list_services(self) -> List[str]:
+        return sorted(self._files.keys())
+    def read_file(self, service: str, filename: str) -> Tuple[bool, str]:
+        """Return (found, content)."""
+        content = self._files.get(service, {}).get(filename)
+        if content is None:
+            return False, f"File not found: {service}/{filename}"
+        return True, content
+    def get_file_lines(self, service: str, filename: str) -> Optional[List[str]]:
+        found, content = self.read_file(service, filename)
+        if not found:
+            return None
+        return content.splitlines()
+    # ------------------------------------------------------------------
+    # Write
+    # ------------------------------------------------------------------
+    def edit_line(
+        self,
+        service: str,
+        filename: str,
+        line_number: int,      # 1-based
+        new_code: str,
+        step: int = 0,
+    ) -> Tuple[bool, str]:
+        """Replace a single line (1-based). Returns (success, message)."""
+        lines = self.get_file_lines(service, filename)
+        if lines is None:
+            return False, f"File not found: {service}/{filename}"
+        idx = line_number - 1
+        if not (0 <= idx < len(lines)):
+            return False, f"Line {line_number} out of range (file has {len(lines)} lines)"
+        old_code = lines[idx]
+        lines[idx] = new_code
+        self._files[service][filename] = "\n".join(lines)
+        self._history.append(
+            EditRecord(step, service, filename, idx, old_code, new_code)
+        )
+        return True, "ok"
+    # ------------------------------------------------------------------
+    # History / blame
+    # ------------------------------------------------------------------
+    def get_edit_history(
+        self,
+        service: Optional[str] = None,
+        filename: Optional[str] = None,
+    ) -> List[dict]:
+        records = self._history
+        if service:
+            records = [r for r in records if r.service == service]
+        if filename:
+            records = [r for r in records if r.filename == filename]
+        return [r.to_dict() for r in records]
+    def git_blame(self, service: str, filename: str, line_number: int) -> str:
+        """Return the last edit record for a specific line, or 'AI-generated' if untouched."""
+        idx = line_number - 1
+        matching = [
+            r for r in reversed(self._history)
+            if r.service == service and r.filename == filename and r.line_idx == idx
+        ]
+        if matching:
+            r = matching[0]
+            return (
+                f"Step {r.step}: agent changed line {line_number} in "
+                f"{service}/{filename}\n"
+                f"  - {r.old_code!r}\n"
+                f"  + {r.new_code!r}"
+            )
+        return (
+            f"Line {line_number} in {service}/{filename} was last modified by: "
+            f"Junior AI code-gen bot (commit a3f91b2, 2026-04-23 02:14 UTC)"
+        )
+    def build_git_diff(self) -> Optional[str]:
+        if not self._history:
+            return None
+        lines = [f"--- Task {self._task_id} working diff ---"]
+        for r in self._history:
+            lines.append(
+                f"@@ {r.service}/{r.filename} line {r.line_idx + 1} @@\n"
+                f"-{r.old_code}\n"
+                f"+{r.new_code}"
+            )
+        return "\n".join(lines)

app/main.py ADDED Viewed

	@@ -0,0 +1,166 @@

+"""
+Meta-SRE FastAPI Server – OpenEnv Standard API.
+Implements the OpenEnv contract exactly:
+  POST /reset  → Observation
+  POST /step   → (observation, reward, done, info)
+  GET  /state  → Observation
+  GET  /grade  → EpisodeResult
+  GET  /tools  → tool specs (JSON Schema per tool)
+  GET  /tasks  → task definitions
+  GET  /health → liveness probe
+The /env/* routes are strict OpenEnv aliases used by openenv_client.connect().
+"""
+from __future__ import annotations
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Any, Dict, Optional
+from app.engine.manager import EpisodeManager, TASK_DEFINITIONS, DifficultyController
+from app.models import Observation, ActionResult, EpisodeResult
+from app.tools.definitions import TOOL_SPECS
+# ---------------------------------------------------------------------------
+# App setup
+# ---------------------------------------------------------------------------
+app = FastAPI(
+    title="Meta-SRE",
+    description=(
+        "OpenEnv environment: train LLM agents to act as Senior Site Reliability Engineers "
+        "debugging realistic Meta production incidents."
+    ),
+    version="1.0.0",
+)
+# CORS is fully open: any origin, method and header is accepted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Single global episode manager (stateful server)
+# All requests share this one episode; there is no per-client session here.
+_dc = DifficultyController()
+_episode = EpisodeManager(difficulty_controller=_dc)
+# Set by POST /reset; /step, /state and /grade answer 400 while it is False.
+_started = False
+# ---------------------------------------------------------------------------
+# Request / response models
+# ---------------------------------------------------------------------------
+class ResetRequest(BaseModel):
+    """Body of POST /reset and POST /env/reset; every field is optional."""
+    task_id: Optional[int] = None   # 1-5; None = difficulty-controller picks
+class StepRequest(BaseModel):
+    """Body of POST /step and POST /env/step: one tool call."""
+    tool:   str                      # tool name, forwarded to EpisodeManager.step
+    params: Dict[str, Any] = {}      # tool arguments, forwarded unchanged
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+@app.get("/health")
+def health():
+    """Liveness probe: returns a fixed status/version payload without touching episode state."""
+    return {"status": "ok", "version": "1.0.0"}
+@app.get("/tools")
+def list_tools():
+    """Return every tool spec from TOOL_SPECS together with their count."""
+    return {"tools": TOOL_SPECS, "count": len(TOOL_SPECS)}
+@app.get("/tasks")
+def list_tasks():
+    """Return TASK_DEFINITIONS (imported from app.engine.manager) under the key ``tasks``."""
+    return {"tasks": TASK_DEFINITIONS}
+@app.post("/reset", response_model=Observation)
+def reset(req: ResetRequest = ResetRequest()):
+    global _started
+    _started = True
+    obs = _episode.reset(task_id=req.task_id)
+    return obs
+@app.post("/step", response_model=ActionResult)
+def step(req: StepRequest):
+    global _started
+    if not _started:
+        raise HTTPException(
+            status_code=400,
+            detail="Episode not started. Call POST /reset first."
+        )
+    try:
+        result = _episode.step(tool=req.tool, params=req.params)
+        return result
+    except RuntimeError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+@app.get("/state", response_model=Observation)
+def get_state():
+    """Return the current observation of the running episode.
+    Raises HTTPException (400) when no episode has been started via /reset.
+    """
+    if not _started:
+        raise HTTPException(
+            status_code=400,
+            detail="Episode not started. Call POST /reset first."
+        )
+    # NOTE(review): reaches into a private EpisodeManager method — confirm it
+    # is free of side effects before relying on /state for polling.
+    return _episode._build_observation()
+@app.get("/grade", response_model=EpisodeResult)
+def grade():
+    """Return the EpisodeResult for the current episode.
+    Raises HTTPException (400) when no episode has been started via /reset.
+    """
+    if not _started:
+        raise HTTPException(
+            status_code=400,
+            detail="Episode not started. Call POST /reset first."
+        )
+    return _episode.get_episode_result()
+# ---------------------------------------------------------------------------
+# OpenEnv compatibility shim (env.reset / env.step / env.grade)
+# ---------------------------------------------------------------------------
+@app.post("/env/reset")
+def env_reset(req: ResetRequest = ResetRequest()):
+    """OpenEnv spec alias for /reset."""
+    # Delegates to the /reset handler as a plain function call.
+    return reset(req)
+@app.post("/env/step")
+def env_step(req: StepRequest):
+    """
+    OpenEnv standard step — returns the canonical 4-tuple:
+      (observation, reward, done, info)
+    This is what openenv_client.connect().step() unpacks.
+    """
+    result: ActionResult = step(req)
+    return {
+        "observation": result.observation,
+        "reward":      result.reward_delta,
+        "done":        result.done,
+        "info": {
+            "tool":            result.tool,
+            "output":          result.output,
+            "episode_id":      _episode._incident_id,
+            "step":            _episode._step,
+            "budget_remaining": max(0, _episode._build_observation().budget_remaining),
+        },
+    }
+# ---------------------------------------------------------------------------
+# Dev entry point
+# ---------------------------------------------------------------------------
+# Local development only: serves on port 8000 with auto-reload.
+# The Dockerfile CMD starts uvicorn on port 7860 instead.
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)

app/models.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from __future__ import annotations
+from pydantic import BaseModel, Field
+from typing import Dict, List, Optional, Any
+class ServiceMetrics(BaseModel):
+    """Metric snapshot for one service; the value type of Observation.system_metrics."""
+    cpu_percent: float
+    memory_mb: float
+    error_rate: float           # errors per second
+    p99_latency_ms: float
+    request_queue: int
+    last_deploy: str
+    status: str                 # healthy | degraded | critical | down
+class Alert(BaseModel):
+    """An alert attached to a service; listed in Observation.active_alerts."""
+    alert_id: str
+    severity: str               # P0 | P1 | P2
+    service: str
+    message: str
+    triggered_at_step: int
+    # presumably marks alerts unrelated to the real root cause — confirm with the engine
+    is_red_herring: bool = False
+class FileView(BaseModel):
+    """Contents of a single service file; the type of Observation.open_file."""
+    service: str
+    filename: str
+    content: str
+    total_lines: int
+class EditRecord(BaseModel):
+    """Serializable record of one single-line edit.
+    Field names match the keys produced by the virtual file system's
+    ``EditRecord.to_dict()``, where ``line_number`` is 1-based.
+    """
+    step: int
+    service: str
+    filename: str
+    line_number: int
+    old_code: str
+    new_code: str
+class TestResult(BaseModel):
+    """Outcome of running one test suite: pass flag, raw output and error list."""
+    suite: str                  # unit | integration | load | security
+    passed: bool
+    output: str
+    errors: List[str] = Field(default_factory=list)   # empty unless errors are reported
+class Observation(BaseModel):
+    """Environment state shown to the agent.
+    Response model of /reset and /state, and embedded in every ActionResult.
+    """
+    step: int
+    incident_id: str
+    system_metrics: Dict[str, ServiceMetrics]   # presumably keyed by service name — confirm
+    active_alerts: List[Alert]
+    open_file: Optional[FileView] = None        # None when no file is attached
+    terminal_output: str
+    git_diff: Optional[str] = None              # None when no diff is attached
+    dependency_graph: Dict[str, List[str]]
+    sre_memory: List[str] = Field(default_factory=list)
+    budget_remaining: int
+    task_id: int
+    task_description: str
+class ActionRequest(BaseModel):
+    """A tool call: tool name plus its parameters (same shape as StepRequest in app/main.py)."""
+    tool: str
+    params: Dict[str, Any] = Field(default_factory=dict)
+class ActionResult(BaseModel):
+    """Result of one tool call; the response model of POST /step."""
+    tool: str
+    output: Any                 # tool-specific payload, not validated further
+    reward_delta: float         # exposed as ``reward`` by POST /env/step
+    done: bool
+    observation: Observation
+class EpisodeResult(BaseModel):
+    """Episode summary and scores; the response model of GET /grade."""
+    incident_id: str
+    task_id: int
+    steps_taken: int
+    total_reward: float
+    normalized_score: float     # 0.0 – 1.0
+    tests_passed: bool
+    incident_report_accuracy: float
+    fixed_within_sla: bool
+    tool_call_log: List[Dict[str, Any]]
+    weakness_tags: List[str]    # for DifficultyController
+class IncidentReport(BaseModel):
+    """Post-mortem report; fields match the ``write_incident_report`` tool parameters."""
+    root_cause: str
+    fix_applied: str
+    services_affected: List[str]
+    severity_classification: str    # P0 | P1 | P2
+class DifficultyState(BaseModel):
+    """Five per-category float values, each defaulting to 0.5.
+    NOTE(review): presumably consumed by DifficultyController as weights or
+    skill estimates — range and meaning are not visible here; confirm.
+    """
+    async_bugs: float = 0.5
+    data_corruption: float = 0.5
+    security_bugs: float = 0.5
+    cascading_failures: float = 0.5
+    red_herrings: float = 0.5

app/services/__init__.py ADDED Viewed

File without changes

app/services/ad_ranking/__init__.py ADDED Viewed

File without changes

app/services/capi_pipeline/__init__.py ADDED Viewed

File without changes

app/services/whatsapp_sync/__init__.py ADDED Viewed

File without changes

app/tools/__init__.py ADDED Viewed

File without changes

app/tools/definitions.py ADDED Viewed

	@@ -0,0 +1,761 @@

+"""
+All 10 agent tools — implemented as plain Python functions wrapped in ToolDispatcher.
+Each tool returns (reward_delta: float, done: bool, output: Any).
+The EpisodeManager calls ToolDispatcher.dispatch(tool, params).
+"""
+from __future__ import annotations
+import re
+from typing import TYPE_CHECKING, Any, Dict, Tuple
+if TYPE_CHECKING:
+    from app.engine.manager import EpisodeManager
+# ---------------------------------------------------------------------------
+# Schema exposed to the LLM (OpenEnv tool_spec format)
+# ---------------------------------------------------------------------------
+# One entry per tool, each with ``name``, ``description`` and a JSON-Schema
+# ``parameters`` object. The list is returned as-is by GET /tools.
+# Only ``view_file`` restricts ``service`` to an enum; the other tools accept
+# any string for it.
+TOOL_SPECS = [
+    {
+        "name": "view_file",
+        "description": (
+            "Read the contents of a file in a service codebase. "
+            "Use this BEFORE editing to understand the code."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service":  {"type": "string", "enum": ["ad_ranking", "capi_pipeline", "whatsapp_sync"]},
+                "filename": {"type": "string", "description": "e.g. ranker.py"},
+            },
+            "required": ["service", "filename"],
+        },
+    },
+    {
+        "name": "edit_line",
+        "description": (
+            "Replace a single line in a file. SURGICAL edits only — "
+            "do NOT rewrite whole functions. One line at a time."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service":     {"type": "string"},
+                "filename":    {"type": "string"},
+                "line_number": {"type": "integer", "description": "1-based line number"},
+                "new_code":    {"type": "string", "description": "Replacement line (preserve indentation)"},
+            },
+            "required": ["service", "filename", "line_number", "new_code"],
+        },
+    },
+    {
+        "name": "run_tests",
+        "description": (
+            "Execute a test suite for a service. "
+            "suite options: 'unit' (fast, 1 step), "
+            "'integration' (2 steps), 'load' (3 steps), 'security' (2 steps)."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service": {"type": "string"},
+                "suite":   {"type": "string", "enum": ["unit", "integration", "load", "security"],
+                            "default": "unit"},
+            },
+            "required": ["service"],
+        },
+    },
+    {
+        "name": "check_dependency",
+        "description": "Show the data-flow relationship between two services.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service_a": {"type": "string"},
+                "service_b": {"type": "string"},
+            },
+            "required": ["service_a", "service_b"],
+        },
+    },
+    {
+        "name": "read_logs",
+        "description": "Pull recent logs for a service filtered by log level.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service":      {"type": "string"},
+                "log_level":    {"type": "string", "enum": ["ERROR", "WARN", "INFO", "DEBUG"],
+                                 "default": "ERROR"},
+                "last_n_lines": {"type": "integer", "default": 20},
+            },
+            "required": ["service"],
+        },
+    },
+    {
+        "name": "git_blame",
+        "description": "Find who/what last changed a specific line — reveals AI-generated code.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service":     {"type": "string"},
+                "filename":    {"type": "string"},
+                "line_number": {"type": "integer"},
+            },
+            "required": ["service", "filename", "line_number"],
+        },
+    },
+    {
+        "name": "rollback",
+        "description": (
+            "Roll back a service's database migration by version string. "
+            "HIGH COST — use only when a bad migration is the root cause."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service": {"type": "string"},
+                "version": {"type": "string", "description": "Migration version, e.g. '003'"},
+            },
+            "required": ["service", "version"],
+        },
+    },
+    {
+        "name": "query_metrics_history",
+        "description": "Show how a metric changed over time — reveals when the problem started.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "service":    {"type": "string"},
+                "metric":     {"type": "string",
+                               "enum": ["cpu_percent", "memory_mb", "error_rate",
+                                        "p99_latency_ms", "request_queue"]},
+                "hours_back": {"type": "integer", "default": 6},
+            },
+            "required": ["service", "metric"],
+        },
+    },
+    {
+        "name": "ask_senior_sre",
+        "description": (
+            "Ask the on-call Senior SRE for a hint. "
+            "Costs 2 reward steps. Use when genuinely stuck."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "question": {"type": "string"},
+            },
+            "required": ["question"],
+        },
+    },
+    {
+        "name": "write_incident_report",
+        "description": (
+            "Close the incident by submitting a post-mortem report. "
+            "MUST be called after fixing the bug to end the episode."
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "root_cause":             {"type": "string"},
+                "fix_applied":            {"type": "string"},
+                "services_affected":      {"type": "array", "items": {"type": "string"}},
+                "severity_classification": {"type": "string", "enum": ["P0", "P1", "P2"]},
+            },
+            "required": ["root_cause", "fix_applied", "services_affected", "severity_classification"],
+        },
+    },
+]
+# ---------------------------------------------------------------------------
+# Per-task contextual log data
+# ---------------------------------------------------------------------------
+_TASK_LOGS: Dict[int, Dict[str, Dict[str, str]]] = {
+    1: {
+        "ad_ranking": {
+            "ERROR": (
+                "[2026-04-24 03:14:21] ERROR ad_ranking.ranker: "
+                "AttributeError: 'dict' object has no attribute 'get_clicks'\n"
+                "  File ranker.py, line 22, in score_ads\n"
+                "  click_rate = ad.get_clicks() / max(ad.get('impressions', 1), 1)\n"
+                "[2026-04-24 03:14:22] ERROR ad_ranking.ranker: same error (x487 in last 60s)"
+            ),
+            "DEBUG": (
+                "[2026-04-24 03:14:20] DEBUG ad_ranking.ranker: fetch_candidate_ads returned 12 ads\n"
+                "[2026-04-24 03:14:21] DEBUG ad_ranking.ranker: entering score_ads with 12 ads\n"
+                "[2026-04-24 03:14:21] DEBUG ad_ranking.ranker: processing ad_id=ad_001 — CRASH"
+            ),
+        },
+    },
+    2: {
+        "capi_pipeline": {
+            "WARN": (
+                "[2026-04-24 03:00:05] WARN capi_pipeline.transformer: "
+                "event_time 1700000000 converted to 1700000 — check threshold\n"
+                "[2026-04-24 03:00:05] WARN capi_pipeline.transformer: "
+                "event_time 1745392000 converted to 1745392 — data from 1970-01-20"
+            ),
+            "DEBUG": (
+                "[2026-04-24 02:14:03] DEBUG capi_pipeline.transformer: "
+                "_normalize_timestamp called with ts=1700000000\n"
+                "[2026-04-24 02:14:03] DEBUG capi_pipeline.transformer: "
+                "ts > 1_000_000_000 → True, returning ts // 1000 = 1700000\n"
+                "[2026-04-24 02:14:03] DEBUG capi_pipeline.transformer: "
+                "EXPECTED: ts > 1_000_000_000_000 for millisecond timestamps"
+            ),
+            "ERROR": "[2026-04-24 03:00:00] INFO capi_pipeline: No errors — pipeline healthy",
+        },
+        "ad_ranking": {
+            "WARN": (
+                "[2026-04-24 03:01:00] WARN ad_ranking.ranker: "
+                "ROAS attribution window: events from 1970-01-20 (expected: 2023+)\n"
+                "[2026-04-24 03:01:01] WARN ad_ranking.attribution: "
+                "Conversion events all timestamped <86400 (one day in 1970)"
+            ),
+        },
+    },
+    3: {
+        "whatsapp_sync": {
+            "ERROR": (
+                "[2026-04-24 03:10:00] ERROR whatsapp_sync.handler: "
+                "asyncpg.exceptions.TooManyConnectionsError: pool exhausted\n"
+                "[2026-04-24 03:10:02] ERROR whatsapp_sync.handler: "
+                "sync_user_messages acquire() blocked for user_id=8841923\n"
+                "[2026-04-24 03:10:05] ERROR whatsapp_sync.handler: "
+                "490/500 connections allocated — 0 available"
+            ),
+            "DEBUG": (
+                "[2026-04-24 03:09:00] DEBUG whatsapp_sync.handler: "
+                "sync_user_messages — db_pool.acquire() called\n"
+                "[2026-04-24 03:09:00] DEBUG whatsapp_sync.handler: "
+                "sync_user_messages — conn acquired, fetching messages\n"
+                "[2026-04-24 03:09:00] DEBUG whatsapp_sync.handler: "
+                "sync_user_messages — messages fetched, returning\n"
+                "NOTE: No 'release' log line — connection never returned to pool"
+            ),
+        },
+    },
+    4: {
+        "whatsapp_sync": {
+            "ERROR": (
+                "[2026-04-24 02:14:31] ERROR whatsapp_sync.db: "
+                "asyncpg.ForeignKeyViolationError: "
+                "insert into user_preferences violates FK constraint\n"
+                "[2026-04-24 02:14:31] ERROR whatsapp_sync.db: "
+                "migration 003 failed — circular FK: messages ↔ message_threads\n"
+                "[2026-04-24 02:14:31] ERROR whatsapp_sync.db: "
+                "ALTER TABLE messages failed — message_threads.id referenced before table commit"
+            ),
+        },
+        "ad_ranking": {
+            "ERROR": (
+                "[2026-04-24 02:15:00] ERROR ad_ranking: "
+                "DB pool returning FK violation errors from upstream\n"
+                "[2026-04-24 02:15:01] WARN ad_ranking: "
+                "This is a SYMPTOM — root cause is in whatsapp_sync migration"
+            ),
+        },
+        "capi_pipeline": {
+            "WARN": (
+                "[2026-04-24 02:15:00] WARN capi_pipeline: "
+                "Event association latency +340ms — DB pool contention\n"
+                "[2026-04-24 02:15:00] WARN capi_pipeline: "
+                "This is a SYMPTOM — root cause is in whatsapp_sync migration"
+            ),
+        },
+    },
+    5: {
+        "capi_pipeline": {
+            "DEBUG": (
+                "[2026-04-24 02:00:00] DEBUG capi_pipeline.ingestor: "
+                "DEBUG_MODE=True — including raw payload in response\n"
+                "[2026-04-24 02:00:00] DEBUG capi_pipeline.ingestor: "
+                "Response size: 14,382 bytes (expected ~48 bytes)\n"
+                "[2026-04-24 02:00:01] DEBUG capi_pipeline.ingestor: "
+                "debug_data.user_emails contains plaintext email fields"
+            ),
+            "ERROR": "[2026-04-24 02:00:00] INFO capi_pipeline: No errors — unit tests all pass",
+        },
+    },
+}
+_METRICS_HISTORY: Dict[str, Dict[str, list]] = {
+    "ad_ranking:error_rate": [
+        (0, 0.0), (1, 0.0), (2, 0.0), (3, 12.3), (4, 12.1), (5, 11.9),
+    ],
+    "capi_pipeline:error_rate": [
+        (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0),
+    ],
+    "whatsapp_sync:memory_mb": [
+        (0, 200), (1, 250), (2, 350), (3, 500), (4, 800), (5, 1200),
+    ],
+    "whatsapp_sync:request_queue": [
+        (0, 5), (1, 45), (2, 130), (3, 280), (4, 420), (5, 490),
+    ],
+    "capi_pipeline:p99_latency_ms": [
+        (0, 20), (1, 20), (2, 21), (3, 1100), (4, 1080), (5, 1090),
+    ],
+}
+# ---------------------------------------------------------------------------
+# Senior SRE hints — contextually aware
+# ---------------------------------------------------------------------------
+def _senior_sre_hint(task_id: int, question: str, sre_memory: list, step: int) -> str:
+    question_lower = question.lower()
+    memory_text = " ".join(sre_memory).lower()
+    if task_id == 1:
+        if "get_clicks" in memory_text or "attributeerror" in question_lower:
+            return (
+                "Senior SRE: The AttributeError is very specific — 'dict' has no method "
+                "'get_clicks'. The ad objects coming from the API are plain Python dicts. "
+                "You need dict accessor syntax, not method call syntax. "
+                "Check line 22 of ranker.py."
+            )
+        return (
+            "Senior SRE: Look at the stack trace carefully. "
+            "The error is on the line that computes click_rate. "
+            "How are you accessing the 'clicks' field on the ad object?"
+        )
+    if task_id == 2:
+        if "timestamp" in memory_text or "1970" in question_lower or "normalize" in question_lower:
+            return (
+                "Senior SRE: The timestamp normalisation logic has an off-by-three-orders-of-magnitude "
+                "bug. A Unix second timestamp is ~10 digits. A Unix millisecond timestamp is ~13 digits. "
+                "The condition in _normalize_timestamp() uses the wrong threshold. "
+                "What number has 13 digits?"
+            )
+        if "capi" in memory_text or "capi" in question_lower:
+            return (
+                "Senior SRE: You're on the right track — CAPI is the data source for ad attribution. "
+                "Check the transformer.py file. The event_time values being emitted are wrong — "
+                "they look like they're in 1970. Where does event_time get processed?"
+            )
+        return (
+            "Senior SRE: The ad ranking ROAS drop is NOT a ranking algorithm bug. "
+            "The ranking model is working correctly — it just has bad input data. "
+            "Follow the data upstream. Where do conversion events come from?"
+        )
+    if task_id == 3:
+        if "finally" in memory_text or "release" in question_lower or "pool" in question_lower:
+            return (
+                "Senior SRE: Yes — the connection pool is exhausted because connections are "
+                "acquired but never released. In Python async code, you MUST release connections "
+                "in a 'finally' block, otherwise an exception will skip the release call entirely. "
+                "Add: finally: await self.db_pool.release(conn)"
+            )
+        return (
+            "Senior SRE: The load test shows pool exhaustion. "
+            "Under normal load each sync call is short so you don't notice. "
+            "Under high load, the missing resource cleanup compounds. "
+            "Look at how sync_user_messages() handles its DB connection lifecycle."
+        )
+    if task_id == 4:
+        if step > 3 and "migration" not in memory_text:
+            return (
+                "Senior SRE (impatient): Stop chasing symptoms! All three services degraded "
+                "simultaneously at 02:14 UTC — that's when the last deploy landed. "
+                "Check the DB migration logs. The whatsapp_sync service ran a new migration "
+                "at that exact time. Look at db.py migration 003."
+            )
+        if "migration" in memory_text or "003" in question_lower:
+            return (
+                "Senior SRE: Good, you found migration 003. Look at what it does: "
+                "it adds a column 'thread_id' to messages that references message_threads. "
+                "But message_threads also references messages. "
+                "That's a circular FK — PostgreSQL can't resolve the constraint. "
+                "Remove the ALTER TABLE statement from migration 003."
+            )
+        return (
+            "Senior SRE: Three services failing simultaneously at 02:14 UTC is not a coincidence. "
+            "Check the deploy logs and DB migration history for that exact timestamp."
+        )
+    if task_id == 5:
+        if "debug" in memory_text or "pii" in question_lower or "response" in question_lower:
+            return (
+                "Senior SRE: DEBUG_MODE = True should never reach production. "
+                "Find that flag in the ingestor and disable it. "
+                "The security test will verify the response body no longer contains 'debug_data'."
+            )
+        if step > 2:
+            return (
+                "Senior SRE: The unit tests pass — that's the trap. This is a security bug "
+                "invisible to unit tests. Run the security test suite instead: "
+                "run_tests('capi_pipeline', 'security'). "
+                "Read the DEBUG logs for the ingestor — you'll see the response sizes."
+            )
+        return (
+            "Senior SRE: Something is leaking data in the CAPI ingestor. "
+            "Response sizes are 70x larger than expected. "
+            "What conditions cause a larger response body?"
+        )
+    return "Senior SRE: Check the logs and follow the data flow upstream."
+# ---------------------------------------------------------------------------
+# Tool Dispatcher
+# ---------------------------------------------------------------------------
class ToolDispatcher:
    """Route agent tool calls to handlers operating on a single episode.

    Every handler receives the raw ``params`` dict supplied by the agent and
    returns ``(reward_delta, done, output)``. Only ``write_incident_report``
    returns ``done=True``. Parameters come from untrusted agent JSON, so
    numeric ones are parsed defensively instead of raising.
    """

    # Root-cause (service, filename) for each task id. Shared by the
    # "opened the right file" bonus and the "read the right logs" bonus.
    _ROOT_CAUSE = {
        1: ("ad_ranking", "ranker.py"),
        2: ("capi_pipeline", "transformer.py"),
        3: ("whatsapp_sync", "handler.py"),
        4: ("whatsapp_sync", "db.py"),
        5: ("capi_pipeline", "ingestor.py"),
    }

    # Step budget per task id used for the SLA bonus; unknown tasks get 20.
    _SLA_STEPS = {1: 15, 2: 20, 3: 20, 4: 25, 5: 20}
    _DEFAULT_SLA_STEPS = 20

    # Extra reward delta charged on top of the step penalty per test suite.
    _SUITE_COST = {"unit": 0, "integration": -0.1, "load": -0.2, "security": -0.1}

    # Filename fragments that mark a file as part of a test suite.
    _PROTECTED_MARKERS = ("tests/", "test_", "_test.py", "conftest.py")

    def __init__(self, episode: "EpisodeManager"):
        """Bind the dispatcher to the episode whose state the tools act on."""
        self.ep = episode

    @staticmethod
    def _to_int(value: Any, default: Any = None) -> Any:
        """Return ``int(value)``, or ``default`` when the value is not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    def dispatch(self, tool: str, params: Dict[str, Any]) -> Tuple[float, bool, Any]:
        """Route to the correct tool. Returns (reward_delta, done, output).

        An unknown tool name still costs a step and yields an error string.
        A missing (``None``) params object is treated as an empty dict.
        """
        handlers = {
            "view_file":             self._view_file,
            "edit_line":             self._edit_line,
            "run_tests":             self._run_tests,
            "check_dependency":      self._check_dependency,
            "read_logs":             self._read_logs,
            "git_blame":             self._git_blame,
            "rollback":              self._rollback,
            "query_metrics_history": self._query_metrics_history,
            "ask_senior_sre":        self._ask_senior_sre,
            "write_incident_report": self._write_incident_report,
        }
        fn = handlers.get(tool)
        if fn is None:
            r = self.ep.reward.step_reward(tool)
            return r, False, f"Unknown tool: {tool}"
        return fn(params or {})

    # ------------------------------------------------------------------
    # 1. view_file
    # ------------------------------------------------------------------
    def _view_file(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return a file from the virtual file system with line numbers.

        Opening the task's root-cause file adds the ``file_found`` progress
        reward and records a memory note.
        """
        service  = p.get("service", "")
        filename = p.get("filename", "")
        found, content = self.ep.vfs.read_file(service, filename)
        if not found:
            # On a miss, ``content`` carries the error message from the VFS.
            r = self.ep.reward.step_reward("view_file")
            return r, False, {"error": content}
        lines = content.splitlines()
        numbered = "\n".join(f"{i+1:4d}  {line}" for i, line in enumerate(lines))
        r = self.ep.reward.step_reward("view_file")
        # Intermediate reward: opening the right file
        if (service, filename) == self._ROOT_CAUSE.get(self.ep._task_id):
            r += self.ep.reward.progress_reward("file_found")
            self.ep.add_memory(f"opened root-cause file {service}/{filename}")
        return r, False, {
            "service":      service,
            "filename":     filename,
            "total_lines":  len(lines),
            "content":      numbered,
        }

    # ------------------------------------------------------------------
    # 2. edit_line
    # ------------------------------------------------------------------
    def _edit_line(self, p: Dict) -> Tuple[float, bool, Any]:
        """Replace one line of a source file in the virtual file system.

        Edits to test files are refused, and a non-numeric ``line_number``
        is reported as a tool error rather than raising.
        """
        service  = p.get("service", "")
        filename = p.get("filename", "")
        new_code = p.get("new_code", "")
        # Anti-cheat: SREs cannot modify test suites during an incident.
        # Prevents reward hacking (e.g. deleting asserts to make tests pass).
        if any(guard in filename for guard in self._PROTECTED_MARKERS):
            # Charged via the syntax_error flag, i.e. the heavier penalty.
            r = self.ep.reward.step_reward("edit_line", syntax_error=True)
            return r, False, {
                "error": (
                    "Error: SREs cannot modify test suites during an incident. "
                    "Fix the source code, not the tests."
                )
            }
        raw_line = p.get("line_number", 0)
        line_number = self._to_int(raw_line)
        if line_number is None:
            r = self.ep.reward.step_reward("edit_line")
            return r, False, {"error": f"line_number must be an integer, got {raw_line!r}"}
        success, msg = self.ep.vfs.edit_line(
            service, filename, line_number, new_code, self.ep._step
        )
        if not success:
            r = self.ep.reward.step_reward("edit_line")
            return r, False, {"error": msg}
        # Syntax check: look for obvious Python syntax errors in the new line
        syntax_error = _has_syntax_error(new_code)
        r = self.ep.reward.step_reward("edit_line", syntax_error=syntax_error)
        self.ep.add_memory(
            f"edited {service}/{filename} line {line_number}: "
            f"{new_code[:60]!r}"
        )
        msg_out = f"Line {line_number} updated."
        if syntax_error:
            msg_out += " WARNING: possible syntax error detected in replacement line."
        return r, False, {"result": msg_out, "syntax_warning": syntax_error}

    # ------------------------------------------------------------------
    # 3. run_tests
    # ------------------------------------------------------------------
    def _run_tests(self, p: Dict) -> Tuple[float, bool, Any]:
        """Run the grader for the current task and report pass/fail.

        The grader is selected by the episode's task id and the requested
        suite; ``service`` is only used for the memory note and for marking
        metrics as fixed when the run passes.
        """
        service = p.get("service", "")
        suite   = p.get("suite", "unit")
        # Suite cost (extra step penalties); unknown suites cost nothing extra.
        extra_cost = self._SUITE_COST.get(suite, 0)
        passed, output, _partial = self.ep.grader.run(self.ep._task_id, suite)
        r = self.ep.reward.step_reward("run_tests") + extra_cost
        self.ep._last_terminal = output
        self.ep.add_memory(f"ran {suite} tests for {service}: {'PASS' if passed else 'FAIL'}")
        if passed:
            self.ep.metrics.mark_fixed(service)
            r += self.ep.reward.progress_reward("error_drop")
        return r, False, {"passed": passed, "suite": suite, "output": output}

    # ------------------------------------------------------------------
    # 4. check_dependency
    # ------------------------------------------------------------------
    def _check_dependency(self, p: Dict) -> Tuple[float, bool, Any]:
        """Report whether either service depends directly on the other.

        Only the "a depends on b" direction earns the ``service_id``
        progress reward and a memory note.
        """
        # Imported here rather than at module top.
        # NOTE(review): presumably avoids a circular import - confirm.
        from app.engine.manager import DEPENDENCY_GRAPH
        a = p.get("service_a", "")
        b = p.get("service_b", "")
        deps_a = DEPENDENCY_GRAPH.get(a, [])
        deps_b = DEPENDENCY_GRAPH.get(b, [])
        r = self.ep.reward.step_reward("check_dependency")
        relationship = "no direct dependency"
        if b in deps_a:
            relationship = f"{a} depends on {b} (data flows: {b} → {a})"
            self.ep.add_memory(f"confirmed: {a} depends on {b}")
            r += self.ep.reward.progress_reward("service_id")
        elif a in deps_b:
            relationship = f"{b} depends on {a} (data flows: {a} → {b})"
        return r, False, {
            "service_a":      a,
            "service_b":      b,
            "relationship":   relationship,
            f"{a}_depends_on": deps_a,
            f"{b}_depends_on": deps_b,
        }

    # ------------------------------------------------------------------
    # 5. read_logs
    # ------------------------------------------------------------------
    def _read_logs(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return the canned logs for a service at one log level.

        ``last_n_lines`` is honoured: only the final N lines are returned.
        A missing, non-numeric or non-positive value falls back to 20.
        """
        service   = p.get("service", "")
        log_level = p.get("log_level", "ERROR")
        n         = self._to_int(p.get("last_n_lines", 20), 20)
        if n <= 0:
            n = 20
        task_logs = _TASK_LOGS.get(self.ep._task_id, {})
        svc_logs  = task_logs.get(service, {})
        log_text  = svc_logs.get(log_level, f"[{log_level}] No {log_level} logs for {service}")
        # Keep only the most recent n lines, as the tool schema promises.
        log_text  = "\n".join(log_text.split("\n")[-n:])
        r = self.ep.reward.step_reward("read_logs")
        self.ep.add_memory(f"read {log_level} logs for {service}")
        # Partial reward for reading the right service's debug/error logs
        right_service = self._ROOT_CAUSE.get(self.ep._task_id, (None, None))[0]
        if service == right_service and log_level in ("DEBUG", "ERROR"):
            r += self.ep.reward.progress_reward("service_id")
        return r, False, {"service": service, "log_level": log_level, "logs": log_text}

    # ------------------------------------------------------------------
    # 6. git_blame
    # ------------------------------------------------------------------
    def _git_blame(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return the VFS blame record for one line (defaults to line 1)."""
        service     = p.get("service", "")
        filename    = p.get("filename", "")
        line_number = self._to_int(p.get("line_number", 1), 1)
        blame = self.ep.vfs.git_blame(service, filename, line_number)
        r = self.ep.reward.step_reward("git_blame")
        self.ep.add_memory(f"git blame {service}/{filename}:{line_number}")
        return r, False, {"blame": blame}

    # ------------------------------------------------------------------
    # 7. rollback
    # ------------------------------------------------------------------
    @staticmethod
    def _strip_migration(content: str, version: str) -> str:
        """Remove the dict entry for ``version`` from migration source text.

        Lines are dropped from the one naming the version through the next
        line that starts with ``}``. If the entry's opening brace sits alone
        on the preceding line it is dropped as well, so no dangling ``{`` is
        left behind.
        """
        markers = (f'"version": "{version}"', f"'version': '{version}'")
        kept = []
        skip = False
        for line in content.splitlines():
            if any(marker in line for marker in markers):
                skip = True
                if kept and kept[-1].strip() == "{":
                    kept.pop()
            if skip and line.strip().startswith("}"):
                skip = False
                continue
            if not skip:
                kept.append(line)
        return "\n".join(kept)

    def _rollback(self, p: Dict) -> Tuple[float, bool, Any]:
        """Roll back a migration; only task 4 / whatsapp_sync / "003" is valid.

        A correct rollback strips migration 003 from ``db.py``, marks all
        three services fixed and earns the ``error_drop`` progress reward.
        Any other rollback is charged ``ROLLBACK_PENALTY`` on top of the
        step penalty.
        """
        service = p.get("service", "")
        version = p.get("version", "")
        # Only valid for Task 4 and correct service/version
        is_correct = (
            self.ep._task_id == 4 and
            service == "whatsapp_sync" and
            version == "003"
        )
        if is_correct:
            # Remove the circular FK from the VFS (simulate rollback).
            found, content = self.ep.vfs.read_file("whatsapp_sync", "db.py")
            if found:
                # Only rewrite when the read succeeded; on a miss ``content``
                # is an error message and must not be stored as file content.
                self.ep.vfs._files["whatsapp_sync"]["db.py"] = self._strip_migration(
                    content, "003"
                )
            self.ep.metrics.mark_fixed("whatsapp_sync")
            self.ep.metrics.mark_fixed("ad_ranking")
            self.ep.metrics.mark_fixed("capi_pipeline")
            self.ep.add_memory("rolled back migration 003 — circular FK removed")
            r = self.ep.reward.step_reward("rollback")
            r += self.ep.reward.progress_reward("error_drop")
            return r, False, {
                "result": "Migration 003 rolled back successfully. All three services recovering."
            }
        # Wrong rollback — penalise
        r = self.ep.reward.step_reward("rollback", syntax_error=False)
        r += self.ep.reward.ROLLBACK_PENALTY  # extra penalty via RewardManager field
        return r, False, {
            "error": (
                f"Rollback of {service} v{version} either unnecessary or incorrect. "
                "Verify the root cause before rolling back."
            )
        }

    # ------------------------------------------------------------------
    # 8. query_metrics_history
    # ------------------------------------------------------------------
    def _query_metrics_history(self, p: Dict) -> Tuple[float, bool, Any]:
        """Render the canned time series for ``service``/``metric``.

        Each sample's label is derived from its position in the series (the
        last sample is "T-1h"), and ``hours_back`` only limits how many of
        the most recent samples are shown. Previously the labels were
        computed from ``hours_back`` itself, which produced negative or
        shifted hours for any value other than the default of 6.
        """
        service    = p.get("service", "")
        metric     = p.get("metric", "")
        hours_back = self._to_int(p.get("hours_back", 6), 6)
        if hours_back <= 0:
            hours_back = 6
        key = f"{service}:{metric}"
        history = _METRICS_HISTORY.get(key, [])
        r = self.ep.reward.step_reward("query_metrics_history")
        self.ep.add_memory(f"queried {metric} history for {service}")
        if history:
            total = len(history)
            # Sample i is (total - i) hours old; keep those inside the window.
            table = "\n".join(
                f"  T-{total - i}h: {val}"
                for i, (_, val) in enumerate(history)
                if total - i <= hours_back
            )
            # NOTE(review): the note is attached to every series, including
            # flat ones with no visible spike - confirm this is intended.
            return r, False, {
                "service": service,
                "metric":  metric,
                "history": table,
                "note":    f"Spike visible at T-{total - 3}h (correlates with 02:14 UTC deploy)",
            }
        return r, False, {
            "service": service,
            "metric":  metric,
            "history": "No historical data for this metric combination.",
        }

    # ------------------------------------------------------------------
    # 9. ask_senior_sre
    # ------------------------------------------------------------------
    def _ask_senior_sre(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return a contextual hint; charged at twice the normal step reward."""
        # A null/absent question is treated as an empty one so slicing and
        # lower-casing below cannot fail.
        question = str(p.get("question") or "")
        hint = _senior_sre_hint(
            self.ep._task_id,
            question,
            self.ep._sre_memory,
            self.ep._step,
        )
        # 2-step penalty
        r = self.ep.reward.step_reward("ask_senior_sre") * 2
        self.ep.add_memory(f"asked senior SRE: {question[:60]}")
        return r, False, {"hint": hint}

    # ------------------------------------------------------------------
    # 10. write_incident_report
    # ------------------------------------------------------------------
    def _write_incident_report(self, p: Dict) -> Tuple[float, bool, Any]:
        """Store the post-mortem, compute the terminal reward, end the episode.

        The report is graded for accuracy, the task's tests are re-run to
        check that the fix holds, and the SLA bonus depends on the current
        step being within the task's step budget. Always returns
        ``done=True``.
        """
        from app.models import IncidentReport
        report = IncidentReport(
            root_cause=p.get("root_cause", ""),
            fix_applied=p.get("fix_applied", ""),
            services_affected=p.get("services_affected", []),
            severity_classification=p.get("severity_classification", "P1"),
        )
        self.ep._incident_report = report
        report_accuracy = self.ep.grader.grade_incident_report(self.ep._task_id, report)
        sla = self._SLA_STEPS.get(self.ep._task_id, self._DEFAULT_SLA_STEPS)
        within_sla = self.ep._step <= sla
        # Check if tests actually passed
        passed, _, _ = self.ep.grader.run(self.ep._task_id)
        r = self.ep.reward.step_reward("write_incident_report")
        r += self.ep.reward.terminal_reward(
            tests_passed=passed,
            report_accuracy=report_accuracy,
            fixed_within_sla=within_sla,
            # No separate regression check exists: a passing run doubles as it.
            no_regressions=passed,
            task_id=self.ep._task_id,
        )
        score = self.ep.reward.normalized_score()
        summary = (
            f"Incident {self.ep._incident_id} closed.\n"
            f"Report accuracy:  {report_accuracy:.0%}\n"
            f"Tests passed:     {passed}\n"
            f"Within SLA:       {within_sla}\n"
            f"Normalized score: {score:.3f}"
        )
        # Update difficulty controller
        self.ep.dc.update(self.ep._task_id, score)
        return r, True, {"summary": summary, "report_accuracy": report_accuracy}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _has_syntax_error(line: str) -> bool:
+    """Quick heuristic check for obvious Python syntax mistakes in a single line."""
+    stripped = line.strip()
+    # Unmatched brackets
+    for open_, close_ in [("(", ")"), ("[", "]"), ("{", "}")]:
+        if stripped.count(open_) != stripped.count(close_):
+            return True
+    # Ends with lone colon inside dict/call (not a block statement)
+    # Detect obvious incomplete assignments
+    if re.search(r"=\s*$", stripped):
+        return True
+    return False

openenv.yaml ADDED Viewed

	@@ -0,0 +1,110 @@

+name: meta-sre
+version: "1.0.0"
+description: >
+  OpenEnv environment for training LLM agents to act as Senior SREs.
+  Simulates real Meta production incidents across 3 interconnected services
+  with 5 difficulty levels, 10 engineering tools, and a self-improving
+  difficulty controller (Theme 4: Self-Improvement).
+author: Meta-SRE Hackathon Team (Bhavya + Anvit)
+license: MIT
+endpoints:
+  base_url: http://localhost:8000   # local default; the Docker image in this commit serves uvicorn on port 7860
+  reset:  POST /reset
+  step:   POST /step
+  state:  GET  /state
+  grade:  GET  /grade
+  tools:  GET  /tools
+observation_space:
+  type: object
+  fields:
+    - step:              integer
+    - incident_id:       string
+    - system_metrics:    object   # {service: ServiceMetrics}
+    - active_alerts:     array    # List[Alert]
+    - open_file:         object   # FileView | null
+    - terminal_output:   string
+    - git_diff:          string   # null if no edits yet
+    - dependency_graph:  object
+    - sre_memory:        array    # agent's working notes
+    - budget_remaining:  integer  # steps before SLA breach
+action_space:
+  type: tool_call
+  tools:
+    - view_file
+    - edit_line
+    - run_tests
+    - check_dependency
+    - read_logs
+    - git_blame
+    - rollback
+    - query_metrics_history
+    - ask_senior_sre
+    - write_incident_report
+reward:
+  step_penalty:         -0.1
+  syntax_error_penalty: -0.5
+  rollback_penalty:     -1.0
+  senior_sre_penalty:   -0.2
+  terminal_tests_pass:  +1.0
+  terminal_report_max:  +0.5
+  terminal_sla_bonus:   +0.3
+  terminal_no_regress:  +0.2
+  security_patch_bonus: +0.5    # Task 5 only
+  max_possible:          3.0
+tasks:
+  - id: 1
+    difficulty: easy
+    sla_budget: 15
+    description: Single service AttributeError — hallucinated dict method
+  - id: 2
+    difficulty: medium
+    sla_budget: 20
+    description: Silent timestamp corruption in CAPI → ROAS degradation
+  - id: 3
+    difficulty: medium-hard
+    sla_budget: 20
+    description: DB connection pool exhaustion under load
+  - id: 4
+    difficulty: hard
+    sla_budget: 25
+    description: Circular FK migration cascading to 3 services (red herrings)
+  - id: 5
+    difficulty: hard
+    sla_budget: 20
+    description: PII data exposure via DEBUG_MODE=True (security incident)
+self_improvement:
+  enabled: true
+  controller: DifficultyController
+  description: >
+    After each episode the DifficultyController analyses which bug categories
+    the agent failed on and weights future task selection toward those weaknesses.
+    Bug categories: async_bugs, data_corruption, security_bugs,
+                    cascading_failures, red_herrings.
+usage_example: |
+  import requests
+  BASE = "http://localhost:8000"
+  obs   = requests.post(f"{BASE}/reset", json={"task_id": 1}).json()
+  done  = False
+  while not done:
+      action = your_agent.decide(obs)          # returns {"tool": ..., "params": ...}
+      result = requests.post(f"{BASE}/step", json=action).json()
+      obs    = result["observation"]
+      done   = result["done"]
+  score = requests.get(f"{BASE}/grade").json()["normalized_score"]
+  print(f"Score: {score:.3f}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+# Meta-SRE – Python dependencies
+# Core server
+fastapi>=0.110.0
+uvicorn[standard]>=0.29.0
+pydantic>=2.0.0
+# Training (install separately in Colab — GPU required)
+# unsloth[colab-new]  @ git+https://github.com/unslothai/unsloth.git
+# trl>=0.9.0          # GRPOTrainer + GRPOConfig require >=0.9.0
+# datasets>=2.18.0
+# transformers>=4.39.0
+# accelerate>=0.28.0
+# bitsandbytes>=0.43.0
+# peft>=0.10.0
+# Data / evaluation
+numpy>=1.26.0
+matplotlib>=3.8.0
+seaborn>=0.13.0
+# Dev / testing
+pytest>=8.0.0
+httpx>=0.27.0        # for FastAPI TestClient

training/__init__.py ADDED Viewed

File without changes

training/generator.py ADDED Viewed

	@@ -0,0 +1,399 @@

+"""
+Layer 4 – Perfect-Play Bot & JSONL Dataset Generator.
+Runs all 5 tasks optimally to generate training episodes.
+Outputs: training/dataset/training_data.jsonl
+Usage:
+    python -m training.generator --episodes 40 --output training/dataset/training_data.jsonl
+"""
+from __future__ import annotations
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import json
+import random
+import argparse
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+from app.engine.manager import EpisodeManager, TASK_DEFINITIONS
+from app.engine.observability import DifficultyController
+# ---------------------------------------------------------------------------
+# Perfect-Play Scripts
+# Each script is a list of (tool, params) tuples that solve the task optimally.
+# Randomisation (see _vary) is applied to variable names / details for dataset diversity.
+# ---------------------------------------------------------------------------
+def _vary(base: str, variants: List[str]) -> str:
+    return random.choice([base] + variants)
+def perfect_play_task1(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
+    """Hallucinated attribute: ad.get_clicks() → ad.get('clicks', 0)"""
+    return [
+        ("read_logs",  {"service": "ad_ranking", "log_level": "ERROR", "last_n_lines": 20}),
+        ("view_file",  {"service": "ad_ranking", "filename": "ranker.py"}),
+        ("git_blame",  {"service": "ad_ranking", "filename": "ranker.py", "line_number": 22}),
+        ("edit_line",  {
+            "service":     "ad_ranking",
+            "filename":    "ranker.py",
+            "line_number": 22,
+            "new_code":    _vary(
+                "            click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)",
+                ["            click_rate = ad['clicks'] / max(ad.get('impressions', 1), 1)"]
+            ),
+        }),
+        ("run_tests",  {"service": "ad_ranking", "suite": "unit"}),
+        ("write_incident_report", {
+            "root_cause":              "AttributeError: dict has no attribute get_clicks() — Junior AI generated method call instead of dict accessor",
+            "fix_applied":             "Replaced ad.get_clicks() with ad.get('clicks', 0) on ranker.py line 22",
+            "services_affected":       ["ad_ranking"],
+            "severity_classification": "P0",
+        }),
+    ]
+def perfect_play_task2(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
+    """Silent timestamp corruption: threshold 1e9 → 1e12"""
+    return [
+        ("read_logs",             {"service": "ad_ranking",    "log_level": "WARN", "last_n_lines": 20}),
+        ("check_dependency",      {"service_a": "ad_ranking",  "service_b": "capi_pipeline"}),
+        ("query_metrics_history", {"service": "capi_pipeline", "metric": "error_rate",    "hours_back": 6}),
+        ("read_logs",             {"service": "capi_pipeline", "log_level": "DEBUG", "last_n_lines": 20}),
+        ("view_file",             {"service": "capi_pipeline", "filename": "transformer.py"}),
+        ("edit_line",             {
+            "service":     "capi_pipeline",
+            "filename":    "transformer.py",
+            "line_number": 43,
+            "new_code":    "        if ts > 1_000_000_000_000:",
+        }),
+        ("run_tests",             {"service": "capi_pipeline", "suite": "integration"}),
+        ("write_incident_report", {
+            "root_cause":              "Timestamp normalisation threshold in capi_pipeline/transformer.py was 1e9 instead of 1e12 — unix-second timestamps treated as milliseconds, resulting in events attributed to 1970",
+            "fix_applied":             "Changed _normalize_timestamp threshold from 1_000_000_000 to 1_000_000_000_000 on transformer.py line 40",
+            "services_affected":       ["capi_pipeline", "ad_ranking"],
+            "severity_classification": "P1",
+        }),
+    ]
+def perfect_play_task3(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
+    """Connection pool exhaustion: add finally: await db_pool.release(conn)"""
+    return [
+        ("read_logs",             {"service": "whatsapp_sync", "log_level": "ERROR", "last_n_lines": 20}),
+        ("query_metrics_history", {"service": "whatsapp_sync", "metric": "request_queue", "hours_back": 4}),
+        ("view_file",             {"service": "whatsapp_sync", "filename": "handler.py"}),
+        ("git_blame",             {"service": "whatsapp_sync", "filename": "handler.py", "line_number": 35}),
+        ("run_tests",             {"service": "whatsapp_sync", "suite": "unit"}),
+        ("edit_line",             {
+            "service":     "whatsapp_sync",
+            "filename":    "handler.py",
+            "line_number": 35,
+            "new_code":    "            raise",
+        }),
+        ("edit_line",             {
+            "service":     "whatsapp_sync",
+            "filename":    "handler.py",
+            "line_number": 36,
+            "new_code":    "        finally:",
+        }),
+        ("edit_line",             {
+            "service":     "whatsapp_sync",
+            "filename":    "handler.py",
+            "line_number": 37,
+            "new_code":    "            await self.db_pool.release(conn)",
+        }),
+        ("run_tests",             {"service": "whatsapp_sync", "suite": "load"}),
+        ("write_incident_report", {
+            "root_cause":              "DB connection pool exhaustion in whatsapp_sync — sync_user_messages() acquires a connection but has no finally block to release it on exception, causing pool depletion under concurrent load",
+            "fix_applied":             "Added finally: await self.db_pool.release(conn) to sync_user_messages() in handler.py",
+            "services_affected":       ["whatsapp_sync"],
+            "severity_classification": "P1",
+        }),
+    ]
+def perfect_play_task4(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
+    """Circular FK in migration 003 cascading to all services"""
+    return [
+        ("read_logs",             {"service": "whatsapp_sync",  "log_level": "ERROR", "last_n_lines": 30}),
+        ("query_metrics_history", {"service": "capi_pipeline",  "metric": "p99_latency_ms", "hours_back": 6}),
+        ("view_file",             {"service": "whatsapp_sync",  "filename": "db.py"}),
+        ("git_blame",             {"service": "whatsapp_sync",  "filename": "db.py", "line_number": 45}),
+        ("run_tests",             {"service": "whatsapp_sync",  "suite": "unit"}),
+        ("rollback",              {"service": "whatsapp_sync",  "version": "003"}),
+        ("run_tests",             {"service": "whatsapp_sync",  "suite": "integration"}),
+        ("write_incident_report", {
+            "root_cause":              "Circular foreign key in migration 003: message_threads.parent_message_id references messages, and the ALTER TABLE added messages.thread_id referencing message_threads — PostgreSQL FK resolution failure cascaded to all DB pool consumers",
+            "fix_applied":             "Rolled back migration 003 to remove circular FK constraint",
+            "services_affected":       ["whatsapp_sync"],
+            "severity_classification": "P0",
+        }),
+    ]
+def perfect_play_task5(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
+    """PII data leak: DEBUG_MODE = True → False"""
+    return [
+        ("read_logs",  {"service": "capi_pipeline", "log_level": "DEBUG", "last_n_lines": 20}),
+        ("run_tests",  {"service": "capi_pipeline", "suite": "unit"}),
+        ("view_file",  {"service": "capi_pipeline", "filename": "ingestor.py"}),
+        ("git_blame",  {"service": "capi_pipeline", "filename": "ingestor.py", "line_number": 7}),
+        ("edit_line",  {
+            "service":     "capi_pipeline",
+            "filename":    "ingestor.py",
+            "line_number": 7,
+            "new_code":    "DEBUG_MODE = False  # FIXED: must be False in production",
+        }),
+        ("run_tests",  {"service": "capi_pipeline", "suite": "security"}),
+        ("write_incident_report", {
+            "root_cause":              "PII data exposure: DEBUG_MODE=True in production caused /ingest to return raw user PII (emails, phone numbers) in HTTP response body — invisible to unit tests, caught by security suite",
+            "fix_applied":             "Set DEBUG_MODE = False in capi_pipeline/ingestor.py line 7",
+            "services_affected":       ["capi_pipeline"],
+            "severity_classification": "P0",
+        }),
+    ]
+# Registry: task id -> function that builds that task's perfect-play action list.
+# run_episode() looks the script up here by task_id.
+PERFECT_PLAY_SCRIPTS = {
+    1: perfect_play_task1,
+    2: perfect_play_task2,
+    3: perfect_play_task3,
+    4: perfect_play_task4,
+    5: perfect_play_task5,
+}
+# ---------------------------------------------------------------------------
+# Observation → prompt string formatter
+# ---------------------------------------------------------------------------
+def obs_to_prompt(obs: dict) -> str:
+    """Format the observation dict as the LLM system+user prompt."""
+    metrics_summary = []
+    for svc, m in obs.get("system_metrics", {}).items():
+        if isinstance(m, dict):
+            metrics_summary.append(
+                f"  {svc}: CPU={m.get('cpu_percent',0):.0f}% "
+                f"MEM={m.get('memory_mb',0):.0f}MB "
+                f"ERR={m.get('error_rate',0):.1f}/s "
+                f"STATUS={m.get('status','?')}"
+            )
+    alerts_summary = []
+    for a in obs.get("active_alerts", []):
+        if isinstance(a, dict):
+            alerts_summary.append(
+                f"  [{a.get('severity','?')}] {a.get('service','?')}: {a.get('message','')}"
+            )
+    return (
+        f"INCIDENT: {obs.get('incident_id','')}\n"
+        f"TASK: {obs.get('task_description','')}\n"
+        f"STEP: {obs.get('step',0)} | BUDGET: {obs.get('budget_remaining',0)} steps remaining\n\n"
+        f"SYSTEM METRICS:\n" + "\n".join(metrics_summary) + "\n\n"
+        f"ACTIVE ALERTS:\n" + ("\n".join(alerts_summary) or "  None") + "\n\n"
+        f"TERMINAL:\n{obs.get('terminal_output','')}\n\n"
+        f"SRE MEMORY:\n" + ("\n".join(f"  {m}" for m in obs.get("sre_memory", [])) or "  (empty)") + "\n"
+    )
+def action_to_response(tool: str, params: Dict) -> str:
+    """Format agent action as the assistant turn in the conversation."""
+    return json.dumps({"tool": tool, "params": params}, indent=2)
+# ---------------------------------------------------------------------------
+# Episode runner
+# ---------------------------------------------------------------------------
+def run_episode(task_id: int, ep: EpisodeManager) -> List[Dict]:
+    """Run one perfect-play episode. Returns conversation turns for JSONL."""
+    obs = ep.reset(task_id=task_id)
+    script_fn = PERFECT_PLAY_SCRIPTS[task_id]
+    actions = script_fn(ep)
+    turns = []
+    obs_dict = obs.model_dump()
+    system_prompt = (
+        "You are a Senior Site Reliability Engineer (SRE) at Meta. "
+        "You are debugging a live production incident. "
+        "Use the available tools methodically: read logs first, then inspect code, "
+        "make surgical single-line edits, verify with tests, and close with an incident report. "
+        "Never rewrite entire files. Always run tests after editing."
+    )
+    # Initial observation as first user turn
+    turns.append({
+        "role": "system",
+        "content": system_prompt,
+    })
+    turns.append({
+        "role": "user",
+        "content": obs_to_prompt(obs_dict),
+    })
+    for tool, params in actions:
+        # Assistant decides action
+        turns.append({
+            "role":    "assistant",
+            "content": action_to_response(tool, params),
+        })
+        # Execute in environment
+        try:
+            result = ep.step(tool=tool, params=params)
+            obs_dict = result.observation.model_dump()
+            # Next user turn = new observation
+            turns.append({
+                "role":    "user",
+                "content": obs_to_prompt(obs_dict),
+            })
+            if result.done:
+                break
+        except RuntimeError:
+            break
+    return turns
+# ---------------------------------------------------------------------------
+# Dataset generator
+# ---------------------------------------------------------------------------
+def generate_dataset(
+    episodes_per_task: int = 40,
+    output_path: str = "training/dataset/training_data.jsonl",
+    seed: int = 42,
+) -> None:
+    random.seed(seed)
+    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+    ep = EpisodeManager(difficulty_controller=DifficultyController())
+    total = 0
+    task_counts = {t: 0 for t in range(1, 6)}
+    with open(output_path, "w", encoding="utf-8") as f:
+        for episode_idx in range(episodes_per_task * 5):
+            task_id = (episode_idx % 5) + 1
+            try:
+                turns = run_episode(task_id, ep)
+                result = ep.get_episode_result()
+                record = {
+                    "episode_id":       f"ep_{episode_idx:04d}",
+                    "task_id":          task_id,
+                    "normalized_score": result.normalized_score,
+                    "steps_taken":      result.steps_taken,
+                    "messages":         turns,
+                }
+                f.write(json.dumps(record) + "\n")
+                total += 1
+                task_counts[task_id] += 1
+                if episode_idx % 10 == 0:
+                    print(
+                        f"[{episode_idx:4d}/{episodes_per_task*5}] "
+                        f"task={task_id} score={result.normalized_score:.3f} "
+                        f"steps={result.steps_taken}"
+                    )
+            except Exception as e:
+                print(f"WARNING: episode {episode_idx} task {task_id} failed: {e}")
+    print(f"\nDataset written to {output_path}")
+    print(f"Total episodes: {total}")
+    for t, c in task_counts.items():
+        print(f"  Task {t}: {c} episodes")
+# ---------------------------------------------------------------------------
+# Baseline evaluator (for "before training" comparison)
+# ---------------------------------------------------------------------------
+def run_baseline_naive(task_id: int) -> float:
+    """
+    Simulate a naive LLM that immediately tries to rewrite a whole file.
+    Returns normalized score (expected ~0.18).
+    """
+    ep = EpisodeManager()
+    ep.reset(task_id=task_id)
+    # Naive agent: immediately tries to edit line 1 with garbage
+    ep.step("edit_line", {
+        "service":     "ad_ranking",
+        "filename":    "ranker.py",
+        "line_number": 1,
+        "new_code":    "# rewriting entire file... (hallucination)",
+    })
+    # Then writes incident report without fixing anything
+    ep.step("write_incident_report", {
+        "root_cause":              "unknown error in the code",
+        "fix_applied":             "rewrote the file",
+        "services_affected":       ["ad_ranking"],
+        "severity_classification": "P1",
+    })
+    return ep.reward.normalized_score()
+def evaluate_model(
+    model_name: str,
+    call_fn,        # callable(prompt: str) -> str  (returns JSON action)
+    n_tasks: int = 5,
+) -> Dict[str, Any]:
+    """
+    Evaluate any model against the environment.
+    call_fn receives the obs prompt string, returns a JSON string with {tool, params}.
+    """
+    import json as _json
+    ep = EpisodeManager()
+    scores = {}
+    for task_id in range(1, n_tasks + 1):
+        obs = ep.reset(task_id=task_id)
+        done = False
+        while not done and ep._step < 30:
+            prompt = obs_to_prompt(obs.dict())
+            try:
+                response = call_fn(prompt)
+                action = _json.loads(response)
+                result = ep.step(action["tool"], action.get("params", {}))
+                obs = result.observation
+                done = result.done
+            except Exception as e:
+                print(f"Model error on task {task_id}: {e}")
+                break
+        scores[f"task_{task_id}"] = ep.reward.normalized_score()
+    avg = sum(scores.values()) / len(scores)
+    scores["average"] = round(avg, 4)
+    print(f"\n{model_name} evaluation results:")
+    for k, v in scores.items():
+        print(f"  {k}: {v:.3f}")
+    return scores
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Meta-SRE dataset generator")
+    parser.add_argument("--episodes", type=int, default=40,
+                        help="Episodes per task (default: 40 → 200 total)")
+    parser.add_argument("--output",   type=str,
+                        default="training/dataset/training_data.jsonl")
+    parser.add_argument("--seed",     type=int, default=42)
+    args = parser.parse_args()
+    generate_dataset(
+        episodes_per_task=args.episodes,
+        output_path=args.output,
+        seed=args.seed,
+    )