Commit ·
ad6248e
0
Parent(s):
Deploy Meta-SRE OpenEnv benchmark FastAPI server
Browse files
FastAPI server implementing the full OpenEnv standard API:
/reset, /step, /grade, /state, /tools, /health, /env/* aliases.
Runs 5 production incident simulation tasks with self-improving difficulty.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- .gitignore +14 -0
- Dockerfile +13 -0
- README.md +70 -0
- app/__init__.py +0 -0
- app/engine/__init__.py +0 -0
- app/engine/manager.py +598 -0
- app/engine/observability.py +470 -0
- app/engine/sandbox.py +1040 -0
- app/main.py +166 -0
- app/models.py +101 -0
- app/services/__init__.py +0 -0
- app/services/ad_ranking/__init__.py +0 -0
- app/services/capi_pipeline/__init__.py +0 -0
- app/services/whatsapp_sync/__init__.py +0 -0
- app/tools/__init__.py +0 -0
- app/tools/definitions.py +761 -0
- openenv.yaml +110 -0
- requirements.txt +25 -0
- training/__init__.py +0 -0
- training/generator.py +399 -0
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
.env
|
| 10 |
+
venv/
|
| 11 |
+
training/dataset/
|
| 12 |
+
training/train_unsloth.py
|
| 13 |
+
*.jsonl
|
| 14 |
+
*.png
|
Dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 7 |
+
|
| 8 |
+
COPY app/ ./app/
|
| 9 |
+
COPY training/ ./training/
|
| 10 |
+
|
| 11 |
+
EXPOSE 7860
|
| 12 |
+
|
| 13 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Meta-SRE
|
| 3 |
+
emoji: 🔧
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
short_description: OpenEnv benchmark – train LLMs to debug Meta production incidents
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Meta-SRE OpenEnv Benchmark
|
| 13 |
+
|
| 14 |
+
A live simulation environment for training and evaluating LLM agents as Senior Site Reliability Engineers at Meta.
|
| 15 |
+
|
| 16 |
+
## Connect with openenv_client
|
| 17 |
+
|
| 18 |
+
```python
|
| 19 |
+
import openenv_client
|
| 20 |
+
|
| 21 |
+
env = openenv_client.connect("huggingface.co/spaces/Anvit25/Meta-SRE")
|
| 22 |
+
obs = env.reset(task_id=1)
|
| 23 |
+
|
| 24 |
+
done = False
|
| 25 |
+
while not done:
|
| 26 |
+
action = your_agent.decide(obs) # {"tool": ..., "params": {...}}
|
| 27 |
+
obs, reward, done, info = env.step(action)
|
| 28 |
+
|
| 29 |
+
score = env.grade()
|
| 30 |
+
print(f"Score: {score['normalized_score']:.3f}")
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Direct API
|
| 34 |
+
|
| 35 |
+
```python
|
| 36 |
+
import requests
|
| 37 |
+
|
| 38 |
+
BASE = "https://anvit25-meta-sre.hf.space"
|
| 39 |
+
|
| 40 |
+
obs = requests.post(f"{BASE}/reset", json={"task_id": 1}).json()
|
| 41 |
+
done = False
|
| 42 |
+
|
| 43 |
+
while not done:
|
| 44 |
+
action = your_agent.decide(obs)
|
| 45 |
+
result = requests.post(f"{BASE}/step", json=action).json()
|
| 46 |
+
obs = result["observation"]
|
| 47 |
+
done = result["done"]
|
| 48 |
+
|
| 49 |
+
score = requests.get(f"{BASE}/grade").json()["normalized_score"]
|
| 50 |
+
print(f"Score: {score:.3f}")
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
## Tasks
|
| 54 |
+
|
| 55 |
+
| ID | Difficulty | Description |
|
| 56 |
+
|----|-----------|-------------|
|
| 57 |
+
| 1 | Easy | AttributeError — hallucinated dict method in ad_ranking |
|
| 58 |
+
| 2 | Medium | Silent timestamp corruption (CAPI → ROAS degradation) |
|
| 59 |
+
| 3 | Medium-Hard | DB connection pool exhaustion under load |
|
| 60 |
+
| 4 | Hard | Circular FK migration cascading across services |
|
| 61 |
+
| 5 | Hard | PII data exposure via DEBUG_MODE=True |
|
| 62 |
+
|
| 63 |
+
## Endpoints
|
| 64 |
+
|
| 65 |
+
- `POST /reset` — start episode (`{"task_id": 1-5}`)
|
| 66 |
+
- `POST /step` — take action (`{"tool": "...", "params": {...}}`)
|
| 67 |
+
- `GET /state` — current observation
|
| 68 |
+
- `GET /grade` — episode score
|
| 69 |
+
- `GET /tools` — available tools list
|
| 70 |
+
- `GET /health` — health check
|
app/__init__.py
ADDED
|
File without changes
|
app/engine/__init__.py
ADDED
|
File without changes
|
app/engine/manager.py
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 3 – Task Grader, Reward Manager, and Episode Orchestrator.
|
| 3 |
+
|
| 4 |
+
TaskGrader : checks whether the current VFS state passes the hidden tests.
|
| 5 |
+
RewardManager: computes per-step and terminal rewards.
|
| 6 |
+
EpisodeManager: ties Layers 1-2-3 together and drives the OpenEnv step loop.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
import uuid
|
| 10 |
+
import time
|
| 11 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 12 |
+
|
| 13 |
+
from app.models import (
|
| 14 |
+
Observation, ActionResult, EpisodeResult, IncidentReport,
|
| 15 |
+
Alert, ServiceMetrics,
|
| 16 |
+
)
|
| 17 |
+
from app.engine.sandbox import VirtualFileSystem
|
| 18 |
+
from app.engine.observability import MetricsEngine, DifficultyController
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
# Task definitions
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
|
| 25 |
+
# Static catalogue of the benchmark incidents, keyed by task id (1-5).
# Each entry carries:
#   description       - incident brief shown to the agent
#   sla_budget        - number of steps available before the budget hits zero
#   difficulty        - human-readable difficulty label
#   bug_category      - tag describing the class of bug
#   affected_services - services named in the incident
TASK_DEFINITIONS: Dict[int, Dict] = {
    # Task 1: crashing ad-ranking service.
    1: {
        "description": (
            "INCIDENT: All /rank requests are returning HTTP 500. "
            "The ad-ranking service is crashing on every call. "
            "Find and fix the bug in ad_ranking/ranker.py."
        ),
        "sla_budget": 15,
        "difficulty": "easy",
        "bug_category": "data_corruption",
        "affected_services": ["ad_ranking"],
    },
    # Task 2: silent ROAS degradation, nothing is crashing.
    2: {
        "description": (
            "INCIDENT: ROAS (Return on Ad Spend) has dropped 68% vs last week. "
            "No services are crashing. Ad-ranking allocation decisions appear to be "
            "based on conversion data from 1970. Trace the root cause."
        ),
        "sla_budget": 20,
        "difficulty": "medium",
        "bug_category": "data_corruption",
        "affected_services": ["capi_pipeline", "ad_ranking"],
    },
    # Task 3: connection pool exhaustion under peak load.
    3: {
        "description": (
            "INCIDENT: WhatsApp message sync works fine under normal load but "
            "hangs under peak traffic (>50 concurrent users). DB connection pool "
            "is exhausted. Fix the resource leak."
        ),
        "sla_budget": 20,
        "difficulty": "medium-hard",
        "bug_category": "async_bugs",
        "affected_services": ["whatsapp_sync"],
    },
    # Task 4: several services degraded at once after a deploy.
    4: {
        "description": (
            "INCIDENT: Three services degraded simultaneously after the 02:14 UTC deploy. "
            "Multiple P1 alerts are firing. Find the single root cause and fix it — "
            "do NOT chase individual service symptoms."
        ),
        "sla_budget": 25,
        "difficulty": "hard",
        "bug_category": "red_herrings",
        "affected_services": ["whatsapp_sync", "ad_ranking", "capi_pipeline"],
    },
    # Task 5: data exposure in the CAPI ingestor.
    5: {
        "description": (
            "INCIDENT: Security scan flagged unusual /ingest response sizes. "
            "Standard unit tests all pass. Find and close the data-exposure vulnerability "
            "in the CAPI ingestor. Write a P0 incident report."
        ),
        "sla_budget": 20,
        "difficulty": "hard",
        "bug_category": "security_bugs",
        "affected_services": ["capi_pipeline"],
    },
}

# Service name -> list of service names it is mapped to in the graph that is
# attached to every observation.
DEPENDENCY_GRAPH: Dict[str, List[str]] = {
    "ad_ranking": ["capi_pipeline"],
    "capi_pipeline": [],
    "whatsapp_sync": ["capi_pipeline"],
}
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
# Hidden graders — one per task
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
|
| 94 |
+
class TaskGrader:
    """
    Checks the VFS content against hidden test criteria.

    Every ``_grade_taskN`` helper is a static substring check on one file read
    from the virtual file system; no agent code is executed. Each helper
    returns ``(passed, test_output_string, partial_score 0-1)``.
    """

    # Rubric used by grade_incident_report(): keywords expected in the
    # root-cause text, services expected in the report, expected severity.
    _REPORT_RUBRIC: Dict[int, Dict[str, Any]] = {
        1: {
            "root_cause_keywords": ["get_clicks", "attributeerror", "dict", "attribute"],
            "expected_services": ["ad_ranking"],
            "severity": "P0",
        },
        2: {
            "root_cause_keywords": ["timestamp", "1000", "normalize", "capi", "transformer"],
            "expected_services": ["capi_pipeline", "ad_ranking"],
            "severity": "P1",
        },
        3: {
            "root_cause_keywords": ["connection", "pool", "release", "finally", "async"],
            "expected_services": ["whatsapp_sync"],
            "severity": "P1",
        },
        4: {
            "root_cause_keywords": ["migration", "003", "foreign key", "circular", "fk"],
            "expected_services": ["whatsapp_sync"],
            "severity": "P0",
        },
        5: {
            "root_cause_keywords": ["debug", "pii", "exposure", "ingest", "security"],
            "expected_services": ["capi_pipeline"],
            "severity": "P0",
        },
    }

    def __init__(self, vfs: VirtualFileSystem):
        # NOTE(review): vfs.read_file(service, filename) is assumed to return a
        # 2-tuple whose second element is the file text — confirm in sandbox.py.
        self.vfs = vfs

    def run(self, task_id: int, suite: str = "unit") -> Tuple[bool, str, float]:
        """
        Grade the current VFS state for ``task_id`` using test suite ``suite``.

        Returns ``(False, "Unknown task", 0.0)`` for an unrecognised task id.
        """
        graders = {
            1: self._grade_task1,
            2: self._grade_task2,
            3: self._grade_task3,
            4: self._grade_task4,
            5: self._grade_task5,
        }
        fn = graders.get(task_id)
        if fn is None:
            return False, "Unknown task", 0.0
        return fn(suite)

    # ------------------------------------------------------------------
    # Task 1 – fix ad.get_clicks() → ad.get('clicks', 0)
    # ------------------------------------------------------------------
    def _grade_task1(self, suite: str) -> Tuple[bool, str, float]:
        """Grade ad_ranking/ranker.py; ``suite`` is ignored for this task."""
        _, content = self.vfs.read_file("ad_ranking", "ranker.py")
        has_bug = "ad.get_clicks()" in content
        # Accept both quoting styles: ad.get("clicks", 0) is as valid a fix as
        # ad.get('clicks', 0), and must not be scored as a wrong accessor.
        accepted_accessors = (
            "ad.get('clicks'",
            'ad.get("clicks"',
            "ad['clicks']",
            'ad["clicks"]',
        )
        has_fix = any(accessor in content for accessor in accepted_accessors)

        if has_bug:
            return False, (
                "FAIL [unit] test_score_ads:\n"
                " AttributeError: 'dict' object has no attribute 'get_clicks'\n"
                " Line 22 still contains ad.get_clicks()\n"
                " 1 test failed, 0 passed"
            ), 0.0

        if has_fix:
            return True, (
                "PASS [unit] test_score_ads: OK\n"
                "PASS [unit] test_rank_returns_sorted_list: OK\n"
                "PASS [unit] test_fetch_candidate_ads: OK\n"
                "3 tests passed in 0.04 s"
            ), 1.0

        # Buggy call removed, but no recognised accessor took its place.
        return False, (
            "FAIL [unit] test_score_ads:\n"
            " Fix applied but ad click-rate accessor is incorrect.\n"
            " Expected: ad.get('clicks', 0) or ad['clicks']\n"
            " 1 test failed"
        ), 0.2

    # ------------------------------------------------------------------
    # Task 2 – fix timestamp threshold 1_000_000_000 → 1_000_000_000_000
    # ------------------------------------------------------------------
    def _grade_task2(self, suite: str) -> Tuple[bool, str, float]:
        """Grade capi_pipeline/transformer.py; unknown suites run "integration"."""
        _, content = self.vfs.read_file("capi_pipeline", "transformer.py")
        # Judge code only: full-line comments are dropped so that neither a
        # commented-out copy of the buggy line nor a comment mentioning the
        # correct threshold can influence the verdict.
        code_lines = [
            line for line in content.splitlines()
            if not line.strip().startswith("#")
        ]
        code_only = "\n".join(code_lines)
        has_bug = "1_000_000_000:" in code_only or "1000000000:" in code_only
        has_fix = not has_bug and (
            "1_000_000_000_000" in code_only or
            "1000000000000" in code_only or
            "1e12" in code_only or
            "10**12" in code_only
        )

        if suite == "unit" and not has_bug:
            # The unit suite does not exercise timestamp edge cases, so it
            # passes (for partial credit only) once the buggy line is gone.
            return True, (
                "PASS [unit] test_transform_purchase: OK\n"
                "PASS [unit] test_batch_transform: OK\n"
                "2 tests passed"
            ), 0.4

        if suite == "integration":
            if has_fix:
                return True, (
                    "PASS [integration] test_timestamp_normalisation: OK\n"
                    " event_time 1700000000 → 1700000000 ✓\n"
                    " event_time 1700000000000 → 1700000000 ✓\n"
                    "PASS [integration] test_roas_attribution_accuracy: OK\n"
                    " ROAS attribution error: 0.2% (threshold: 5%)\n"
                    "2 tests passed"
                ), 1.0
            return False, (
                "FAIL [integration] test_timestamp_normalisation:\n"
                " event_time 1700000000 → 1700000 (expected: 1700000000)\n"
                " Timestamps are being divided by 1000 incorrectly.\n"
                " Root cause: threshold condition in _normalize_timestamp()\n"
                "1 test failed"
            ), 0.0

        # Default: run integration test
        return self._grade_task2("integration")

    # ------------------------------------------------------------------
    # Task 3 – add finally: await self.db_pool.release(conn)
    # ------------------------------------------------------------------
    def _grade_task3(self, suite: str) -> Tuple[bool, str, float]:
        """Grade whatsapp_sync/handler.py; unknown suites run "load"."""
        _, content = self.vfs.read_file("whatsapp_sync", "handler.py")
        has_finally = "finally:" in content
        # "release(conn)" also matches the longer "db_pool.release(conn)".
        has_release = "release(conn)" in content

        if suite == "unit":
            if not has_finally:
                # Reported as not passed even though the simulated output
                # lines say PASS: the leak is only visible under load.
                return False, (
                    "PASS [unit] test_sync_messages_basic: OK\n"
                    "PASS [unit] test_process_queue_empty: OK\n"
                    "WARNING: Unit tests pass but connection leak not detectable without load test\n"
                    "Run: run_tests('whatsapp_sync', 'load')"
                ), 0.3

            return True, (
                "PASS [unit] test_sync_messages_basic: OK\n"
                "PASS [unit] test_connection_released_on_success: OK\n"
                "PASS [unit] test_connection_released_on_exception: OK\n"
                "3 tests passed"
            ), 0.6

        if suite == "load":
            if has_finally and has_release:
                return True, (
                    "PASS [load] test_100_concurrent_syncs:\n"
                    " Peak connections: 18/100 (nominal)\n"
                    " All 100 requests completed\n"
                    " Memory stable at 210 MB\n"
                    "PASS [load] test_connection_pool_not_exhausted: OK\n"
                    "2 load tests passed"
                ), 1.0
            return False, (
                "FAIL [load] test_100_concurrent_syncs:\n"
                " TooManyConnectionsError after 23 concurrent requests\n"
                " Connection pool exhausted — connections not being released\n"
                " Hint: Check sync_user_messages() for missing finally block\n"
                "1 load test failed"
            ), 0.0

        return self._grade_task3("load")

    # ------------------------------------------------------------------
    # Task 4 – rollback migration 003 (remove circular FK)
    # ------------------------------------------------------------------
    def _grade_task4(self, suite: str) -> Tuple[bool, str, float]:
        """Grade whatsapp_sync/db.py; unknown suites run "integration"."""
        _, content = self.vfs.read_file("whatsapp_sync", "db.py")
        # The cycle is considered present while both FK directions are in the file.
        has_circular_fk = (
            "REFERENCES message_threads" in content and
            "REFERENCES messages" in content
        )

        if suite == "unit":
            if has_circular_fk:
                return False, (
                    "FAIL [unit] test_migration_003:\n"
                    " ForeignKeyViolationError: circular FK detected\n"
                    " messages.thread_id → message_threads.id\n"
                    " message_threads.parent_message_id → messages.id\n"
                    " Fix: remove ALTER TABLE messages ADD COLUMN thread_id ...\n"
                    "1 test failed"
                ), 0.0
            # With the cycle gone, the unit suite falls through to the
            # integration verdict at the bottom of this method.

        if suite == "integration":
            if not has_circular_fk:
                return True, (
                    "PASS [integration] test_all_services_recover: OK\n"
                    " whatsapp_sync: healthy ✓\n"
                    " ad_ranking: healthy ✓\n"
                    " capi_pipeline: healthy ✓\n"
                    "PASS [integration] test_message_thread_creation: OK\n"
                    "2 tests passed"
                ), 1.0
            return False, (
                "FAIL [integration] test_all_services_recover:\n"
                " whatsapp_sync still critical — migration 003 circular FK active\n"
                " Use rollback() tool or edit db.py to remove the circular reference\n"
                "1 test failed"
            ), 0.0

        return self._grade_task4("integration")

    # ------------------------------------------------------------------
    # Task 5 – set DEBUG_MODE = False
    # ------------------------------------------------------------------
    def _grade_task5(self, suite: str) -> Tuple[bool, str, float]:
        """Grade capi_pipeline/ingestor.py; unknown suites run "security"."""
        _, content = self.vfs.read_file("capi_pipeline", "ingestor.py")
        has_debug_true = "DEBUG_MODE = True" in content
        has_debug_false = "DEBUG_MODE = False" in content

        if suite == "unit":
            # The unit suite passes regardless of DEBUG_MODE (low partial score).
            return True, (
                "PASS [unit] test_ingest_valid_payload: OK\n"
                "PASS [unit] test_ingest_empty_payload: OK\n"
                "WARNING: Unit tests do not cover security — run security suite\n"
                "2 tests passed (security NOT verified)"
            ), 0.2

        if suite == "security":
            if has_debug_true:
                return False, (
                    "FAIL [security] test_no_pii_in_response:\n"
                    " Response body contains 'debug_data' key with raw user PII\n"
                    " debug_data.user_emails exposed in HTTP response\n"
                    " debug_data.raw_payload exposed in HTTP response\n"
                    " CVSS score: 7.5 (High) — data exposure\n"
                    " Fix: set DEBUG_MODE = False\n"
                    "1 security test failed"
                ), 0.0
            if has_debug_false:
                return True, (
                    "PASS [security] test_no_pii_in_response: OK\n"
                    " Response body: {'status': 'ok', 'processed': N} — no PII\n"
                    "PASS [security] test_response_size_nominal: OK\n"
                    " Avg response size: 48 bytes (was 14 KB)\n"
                    "PASS [security] test_debug_data_absent: OK\n"
                    "3 security tests passed — vulnerability closed"
                ), 1.0
            # Neither literal assignment is present in the file.
            return False, (
                "FAIL [security] test_no_pii_in_response:\n"
                " DEBUG_MODE value is ambiguous or missing\n"
                " Expected: DEBUG_MODE = False\n"
                "1 security test failed"
            ), 0.1

        return self._grade_task5("security")

    # ------------------------------------------------------------------
    # Incident report grader
    # ------------------------------------------------------------------

    def grade_incident_report(
        self, task_id: int, report: IncidentReport
    ) -> float:
        """
        Score 0.0–1.0 for incident report accuracy.

        Weights: 0.5 for the fraction of rubric keywords found in the
        lower-cased root cause, 0.3 for the fraction of expected services
        reported (case-insensitive), 0.2 for an exact severity match.
        Returns 0.0 for a task id without a rubric.
        """
        cfg = self._REPORT_RUBRIC.get(task_id, {})
        if not cfg:
            return 0.0

        score = 0.0
        root_cause_lower = report.root_cause.lower()
        keywords = cfg.get("root_cause_keywords", [])
        keyword_hits = sum(1 for kw in keywords if kw in root_cause_lower)
        score += min(keyword_hits / max(len(keywords), 1), 1.0) * 0.5

        expected_svcs = set(cfg.get("expected_services", []))
        reported_svcs = {s.lower() for s in report.services_affected}
        svc_score = len(expected_svcs & reported_svcs) / max(len(expected_svcs), 1)
        score += svc_score * 0.3

        if report.severity_classification == cfg.get("severity"):
            score += 0.2

        return round(score, 3)
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
# ---------------------------------------------------------------------------
|
| 385 |
+
# Reward Manager
|
| 386 |
+
# ---------------------------------------------------------------------------
|
| 387 |
+
|
| 388 |
+
class RewardManager:
    """Accumulates per-step, progress and terminal rewards for one episode."""

    # Per-step costs (every step pays STEP_PENALTY; the rest stack on top).
    STEP_PENALTY = -0.1
    SYNTAX_ERROR_PENALTY = -0.5
    ROLLBACK_PENALTY = -1.0
    SENIOR_SRE_PENALTY = -0.2
    SYMPTOM_FIX_PENALTY = -0.3  # for Task 4 — fixing red herring services

    # Intermediate progress bonuses.
    PROGRESS_ERROR_DROP = +0.3  # error_rate drops >50%
    PROGRESS_SERVICE_ID = +0.2  # correct root-cause service identified
    PROGRESS_FILE_FOUND = +0.2  # correct file opened/edited

    # End-of-episode bonuses.
    TERMINAL_TESTS_PASS = +1.0
    TERMINAL_REPORT_MAX = +0.5
    TERMINAL_SLA_BONUS = +0.3
    TERMINAL_NO_REGRESS = +0.2
    TERMINAL_SECURITY_PATCH = +0.5  # Task 5 only

    # Divisor used by normalized_score().
    MAX_POSSIBLE = 3.0

    def __init__(self):
        self._cumulative = 0.0
        self._step_rewards: List[float] = []

    def reset(self):
        """Zero the running total and empty the per-step history in place."""
        self._step_rewards.clear()
        self._cumulative = 0.0

    def _record(self, amount: float) -> float:
        """Add ``amount`` to the total and history; return it rounded to 3 places."""
        self._cumulative += amount
        self._step_rewards.append(amount)
        return round(amount, 3)

    def step_reward(self, action: str, syntax_error: bool = False,
                    symptom_fix: bool = False) -> float:
        """Charge the base step cost plus every surcharge that applies."""
        surcharges = (
            (syntax_error, self.SYNTAX_ERROR_PENALTY),
            (action == "rollback", self.ROLLBACK_PENALTY),
            (action == "ask_senior_sre", self.SENIOR_SRE_PENALTY),
            (symptom_fix, self.SYMPTOM_FIX_PENALTY),
        )
        delta = self.STEP_PENALTY
        for applies, penalty in surcharges:
            if applies:
                delta += penalty
        return self._record(delta)

    def progress_reward(self, reason: str) -> float:
        """Award the bonus registered for ``reason``; unknown reasons earn 0.0."""
        bonuses = {
            "error_drop": self.PROGRESS_ERROR_DROP,
            "service_id": self.PROGRESS_SERVICE_ID,
            "file_found": self.PROGRESS_FILE_FOUND,
        }
        return self._record(bonuses.get(reason, 0.0))

    def terminal_reward(
        self,
        tests_passed: bool,
        report_accuracy: float,
        fixed_within_sla: bool,
        no_regressions: bool,
        task_id: int,
    ) -> float:
        """
        Compute the end-of-episode reward and add it to the running total.

        Unlike the step and progress rewards, the terminal reward is not
        appended to the per-step history.
        """
        payout = 0.0
        if tests_passed:
            payout += self.TERMINAL_TESTS_PASS
        # The report bonus scales linearly with the accuracy passed in.
        payout += report_accuracy * self.TERMINAL_REPORT_MAX
        if fixed_within_sla:
            payout += self.TERMINAL_SLA_BONUS
        if no_regressions:
            payout += self.TERMINAL_NO_REGRESS
        if tests_passed and task_id == 5:
            payout += self.TERMINAL_SECURITY_PATCH
        self._cumulative += payout
        return round(payout, 3)

    def normalized_score(self) -> float:
        """Return the running total divided by MAX_POSSIBLE, clamped to [0, 1]."""
        ratio = self._cumulative / self.MAX_POSSIBLE
        capped = min(ratio, 1.0)
        return round(max(0.0, capped), 4)

    @property
    def total(self) -> float:
        """Running reward total, rounded to 4 places."""
        return round(self._cumulative, 4)
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
# ---------------------------------------------------------------------------
|
| 473 |
+
# Episode Manager – the main orchestrator
|
| 474 |
+
# ---------------------------------------------------------------------------
|
| 475 |
+
|
| 476 |
+
class EpisodeManager:
|
| 477 |
+
"""
|
| 478 |
+
Ties together VFS, MetricsEngine, TaskGrader, and RewardManager.
|
| 479 |
+
Exposes reset() and step() matching the OpenEnv contract.
|
| 480 |
+
"""
|
| 481 |
+
|
| 482 |
+
def __init__(self, difficulty_controller: Optional[DifficultyController] = None):
|
| 483 |
+
self.vfs = VirtualFileSystem()
|
| 484 |
+
self.metrics = MetricsEngine()
|
| 485 |
+
self.grader: Optional[TaskGrader] = None
|
| 486 |
+
self.reward = RewardManager()
|
| 487 |
+
self.dc = difficulty_controller or DifficultyController()
|
| 488 |
+
|
| 489 |
+
self._task_id: int = 0
|
| 490 |
+
self._step: int = 0
|
| 491 |
+
self._done: bool = False
|
| 492 |
+
self._incident_id: str = ""
|
| 493 |
+
self._sre_memory: List[str] = []
|
| 494 |
+
self._tool_call_log: List[Dict] = []
|
| 495 |
+
self._last_terminal: str = ""
|
| 496 |
+
self._incident_report: Optional[IncidentReport] = None
|
| 497 |
+
self._start_time: float = 0.0
|
| 498 |
+
|
| 499 |
+
# ------------------------------------------------------------------
|
| 500 |
+
# OpenEnv: reset
|
| 501 |
+
# ------------------------------------------------------------------
|
| 502 |
+
|
| 503 |
+
def reset(self, task_id: Optional[int] = None) -> Observation:
|
| 504 |
+
self._task_id = task_id or self.dc.next_task_id()
|
| 505 |
+
self._step = 0
|
| 506 |
+
self._done = False
|
| 507 |
+
self._incident_id = f"INC-{self._task_id}-{uuid.uuid4().hex[:6].upper()}"
|
| 508 |
+
self._sre_memory = []
|
| 509 |
+
self._tool_call_log = []
|
| 510 |
+
self._last_terminal = ""
|
| 511 |
+
self._incident_report = None
|
| 512 |
+
self._start_time = time.time()
|
| 513 |
+
|
| 514 |
+
self.vfs.reset(self._task_id)
|
| 515 |
+
self.metrics.reset(self._task_id)
|
| 516 |
+
self.grader = TaskGrader(self.vfs)
|
| 517 |
+
self.reward.reset()
|
| 518 |
+
|
| 519 |
+
return self._build_observation()
|
| 520 |
+
|
| 521 |
+
# ------------------------------------------------------------------
|
| 522 |
+
# OpenEnv: step
|
| 523 |
+
# ------------------------------------------------------------------
|
| 524 |
+
|
| 525 |
+
def step(self, tool: str, params: Dict[str, Any]) -> ActionResult:
|
| 526 |
+
if self._done:
|
| 527 |
+
raise RuntimeError("Episode is done. Call reset() to start a new episode.")
|
| 528 |
+
|
| 529 |
+
self._step += 1
|
| 530 |
+
self._tool_call_log.append({"step": self._step, "tool": tool, "params": params})
|
| 531 |
+
|
| 532 |
+
# Dispatch to tool handler
|
| 533 |
+
from app.tools.definitions import ToolDispatcher
|
| 534 |
+
dispatcher = ToolDispatcher(self)
|
| 535 |
+
reward_delta, done, tool_output = dispatcher.dispatch(tool, params)
|
| 536 |
+
|
| 537 |
+
self._done = done
|
| 538 |
+
obs = self._build_observation()
|
| 539 |
+
|
| 540 |
+
return ActionResult(
|
| 541 |
+
tool=tool,
|
| 542 |
+
output=tool_output,
|
| 543 |
+
reward_delta=reward_delta,
|
| 544 |
+
done=done,
|
| 545 |
+
observation=obs,
|
| 546 |
+
)
|
| 547 |
+
|
| 548 |
+
# ------------------------------------------------------------------
|
| 549 |
+
# Helpers
|
| 550 |
+
# ------------------------------------------------------------------
|
| 551 |
+
|
| 552 |
+
def _build_observation(self) -> Observation:
    """Assemble the observation for the current step.

    The remaining budget is the task's ``sla_budget`` (20 when the task
    definition has none) minus the steps taken so far, floored at zero.
    """
    definition = TASK_DEFINITIONS.get(self._task_id, {})
    current_step = self._step
    remaining = definition.get("sla_budget", 20) - current_step

    telemetry = self.metrics
    # An empty string for the last terminal output is passed on as None so
    # the metrics engine falls back to its own simulated output.
    terminal_text = telemetry_terminal = None
    system_metrics = telemetry.get_metrics(current_step)
    active_alerts = telemetry.get_alerts(current_step)
    terminal_text = telemetry.get_terminal_output(
        current_step, self._last_terminal or None
    )
    git_diff = self.vfs.build_git_diff()

    return Observation(
        step=current_step,
        incident_id=self._incident_id,
        system_metrics=system_metrics,
        active_alerts=active_alerts,
        terminal_output=terminal_text,
        git_diff=git_diff,
        dependency_graph=DEPENDENCY_GRAPH,
        sre_memory=list(self._sre_memory),  # copy: callers get a snapshot
        budget_remaining=max(remaining, 0),
        task_id=self._task_id,
        task_description=definition.get("description", ""),
    )
|
| 571 |
+
|
| 572 |
+
def add_memory(self, entry: str) -> None:
    """Append ``entry`` to the SRE memory, prefixed with the current step."""
    stamped = f"[step {self._step}] {entry}"
    self._sre_memory.append(stamped)
|
| 574 |
+
|
| 575 |
+
def get_episode_result(self) -> EpisodeResult:
    """Summarise the current episode as an ``EpisodeResult``.

    The incident report is graded only when one has been recorded;
    otherwise its accuracy is reported as 0.0.
    """
    # NOTE(review): nothing in this method ever sets this to True, so the
    # result always reports tests_passed=False -- confirm whether the
    # grader was meant to be consulted here.
    tests_passed = False

    report = self._incident_report
    report_accuracy = (
        self.grader.grade_incident_report(self._task_id, report) if report else 0.0
    )

    # NOTE(review): this only compares the step count with the budget
    # (20 when the task definition has none); it does not check that
    # anything was actually fixed.
    sla_budget = TASK_DEFINITIONS.get(self._task_id, {}).get("sla_budget", 20)
    fixed_within_sla = self._step <= sla_budget

    return EpisodeResult(
        incident_id=self._incident_id,
        task_id=self._task_id,
        steps_taken=self._step,
        total_reward=self.reward.total,
        normalized_score=self.reward.normalized_score(),
        tests_passed=tests_passed,
        incident_report_accuracy=report_accuracy,
        fixed_within_sla=fixed_within_sla,
        tool_call_log=list(self._tool_call_log),  # shallow copy of the log
        weakness_tags=self.dc.weakness_tags(),
    )
|
app/engine/observability.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 2 – Metrics Engine & Difficulty Controller.
|
| 3 |
+
|
| 4 |
+
Simulates realistic service metrics that evolve based on:
|
| 5 |
+
• which task is active
|
| 6 |
+
• what step we are on
|
| 7 |
+
• whether any fixes have been applied (VFS state)
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
import random
|
| 11 |
+
from typing import Dict, List, Optional, Tuple
|
| 12 |
+
|
| 13 |
+
from app.models import ServiceMetrics, Alert, DifficultyState
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
# Per-task metric profiles
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
|
| 20 |
+
# Baseline ("nothing is wrong") metric values for each simulated service.
# Float fields are jittered by the engine before being reported; ints and
# strings are passed through unchanged.
HEALTHY_METRICS: Dict[str, dict] = {
    "ad_ranking": {
        "cpu_percent": 12.0,
        "memory_mb": 256.0,
        "error_rate": 0.0,
        "p99_latency_ms": 45.0,
        "request_queue": 3,
        "last_deploy": "2026-04-23 01:00 UTC",
        "status": "healthy",
    },
    "capi_pipeline": {
        "cpu_percent": 8.0,
        "memory_mb": 180.0,
        "error_rate": 0.0,
        "p99_latency_ms": 20.0,
        "request_queue": 0,
        "last_deploy": "2026-04-23 02:14 UTC",
        "status": "healthy",
    },
    "whatsapp_sync": {
        "cpu_percent": 10.0,
        "memory_mb": 200.0,
        "error_rate": 0.0,
        "p99_latency_ms": 35.0,
        "request_queue": 5,
        "last_deploy": "2026-04-22 18:30 UTC",
        "status": "healthy",
    },
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _jitter(val: float, pct: float = 0.05) -> float:
    """Return ``val`` scaled by a random factor in ``[1 - pct, 1 + pct]``.

    The result is rounded to two decimal places.
    """
    factor = 1 + random.uniform(-pct, pct)
    return round(val * factor, 2)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class MetricsEngine:
    """Generates per-step system metrics driven by task state.

    The engine keeps two pieces of state: the active task id and the set of
    services that have been marked fixed. Metrics, alerts and terminal
    output are all derived from those plus the step number passed in.
    A task id without a dedicated builder yields all-healthy metrics, no
    alerts, and an "all systems operational" terminal line.
    """

    # Simulated stack traces / logs shown for each task when the caller
    # supplies no terminal output of its own. Constant, so built once.
    _INCIDENT_LOGS: Dict[int, str] = {
        1: (
            "Traceback (most recent call last):\n"
            " File 'ad_ranking/ranker.py', line 22, in score_ads\n"
            " click_rate = ad.get_clicks() / max(ad.get('impressions', 1), 1)\n"
            "AttributeError: 'dict' object has no attribute 'get_clicks'\n"
            "[CRITICAL] /rank endpoint returning 500 for all requests"
        ),
        2: (
            "[WARNING] ad_ranking: ROAS attribution anomaly detected\n"
            " Expected event_time range: 1700000000 – 1745500000\n"
            " Actual event_time range: 1700 – 1745500 (← timestamps in seconds / 1000!)\n"
            "[INFO] capi_pipeline: All unit tests PASS\n"
            "[INFO] capi_pipeline: Throughput 12,000 events/s — nominal\n"
            "[WARNING] ad_ranking: Conversion window showing data from 1970-01-20"
        ),
        3: (
            "[INFO] whatsapp_sync: process_queue started\n"
            "[ERROR] asyncpg.exceptions.TooManyConnectionsError: "
            "connection pool exhausted (max=100)\n"
            " Traceback: handler.py:sync_user_messages — acquire() blocked\n"
            "[ERROR] Sync request for user 8841923 timed out after 30s\n"
            "[CRITICAL] 487 pending sync requests queued"
        ),
        4: (
            "[ERROR] asyncpg.exceptions.ForeignKeyViolationError:\n"
            " insert into user_preferences violates FK constraint "
            "\"user_preferences_user_id_fkey\"\n"
            " DETAIL: Key (user_id)=(48291) is not present in table \"users\".\n"
            "[ERROR] whatsapp_sync: message thread creation failing\n"
            "[WARNING] ad_ranking: upstream DB pool returning errors\n"
            "[WARNING] capi_pipeline: event association latency +340ms\n"
            " [HINT] Last DB migration was version 003 at 02:14 UTC today"
        ),
        5: (
            "[SECURITY SCAN] capi_pipeline /ingest endpoint\n"
            " Response body contains keys: ['status', 'processed', 'debug_data']\n"
            " debug_data.user_emails contains raw PII hashes + plaintext fields\n"
            " debug_data.raw_payload contains full user submission data\n"
            "[FAIL] Security test suite: test_no_pii_in_response FAILED\n"
            "[INFO] Unit tests: all PASSING — bug invisible to standard tests"
        ),
    }

    def __init__(self):
        self._task_id: int = 0
        self._fixed_services: set = set()

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def reset(self, task_id: int) -> None:
        """Select ``task_id`` as the active task and forget all fixes."""
        self._task_id = task_id
        self._fixed_services.clear()

    def mark_fixed(self, service: str) -> None:
        """Record ``service`` as fixed."""
        self._fixed_services.add(service)

    def mark_unfixed(self, service: str) -> None:
        """Remove ``service`` from the fixed set (no-op if absent)."""
        self._fixed_services.discard(service)

    # ------------------------------------------------------------------
    # Core metric generation
    # ------------------------------------------------------------------

    def get_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
        """Return metrics for every service at ``step`` for the active task."""
        builders = {
            1: self._task1_metrics,
            2: self._task2_metrics,
            3: self._task3_metrics,
            4: self._task4_metrics,
            5: self._task5_metrics,
        }
        fn = builders.get(self._task_id, self._all_healthy)
        return fn(step)

    @staticmethod
    def _healthy_baseline(service: str) -> ServiceMetrics:
        """Build the healthy metrics for ``service``, jittering float fields."""
        return ServiceMetrics(**{
            k: _jitter(v) if isinstance(v, float) else v
            for k, v in HEALTHY_METRICS[service].items()
        })

    def _all_healthy(self, step: int) -> Dict[str, ServiceMetrics]:
        """Return the healthy baseline for every known service."""
        return {svc: self._healthy_baseline(svc) for svc in HEALTHY_METRICS}

    # ------------------------------------------------------------------
    # Task 1 – ad_ranking crashes with AttributeError
    # ------------------------------------------------------------------
    def _task1_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
        """ad_ranking is critical until marked fixed; the others are healthy."""
        fixed = "ad_ranking" in self._fixed_services
        return {
            "ad_ranking": ServiceMetrics(
                cpu_percent=_jitter(5.0 if fixed else 2.0),
                memory_mb=_jitter(256.0),
                error_rate=0.0 if fixed else _jitter(12.0),
                p99_latency_ms=_jitter(45.0 if fixed else 0.0),
                request_queue=3 if fixed else 0,
                last_deploy="2026-04-23 02:14 UTC",
                status="healthy" if fixed else "critical",
            ),
            "capi_pipeline": self._healthy_baseline("capi_pipeline"),
            "whatsapp_sync": self._healthy_baseline("whatsapp_sync"),
        }

    # ------------------------------------------------------------------
    # Task 2 – silent CAPI data corruption → ROAS degradation
    # ------------------------------------------------------------------
    def _task2_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
        """capi_pipeline always looks healthy; ad_ranking shows the damage.

        ad_ranking reports "degraded" until capi_pipeline is marked fixed
        *and* ``step > 2``. The comparison is against the absolute episode
        step, not the number of steps since the fix was applied.
        """
        capi_fixed = "capi_pipeline" in self._fixed_services
        ad_recovering = capi_fixed and step > 2  # needs a few steps to propagate
        return {
            "capi_pipeline": ServiceMetrics(
                cpu_percent=_jitter(8.0),
                memory_mb=_jitter(180.0),
                error_rate=0.0,  # no crash – silent corruption
                p99_latency_ms=_jitter(20.0),
                request_queue=0,
                last_deploy="2026-04-23 02:14 UTC",
                status="healthy",  # deceptive – looks fine
            ),
            "ad_ranking": ServiceMetrics(
                cpu_percent=_jitter(12.0),
                memory_mb=_jitter(256.0),
                error_rate=0.0,
                p99_latency_ms=_jitter(45.0),
                request_queue=3,
                last_deploy="2026-04-22 18:00 UTC",
                # ROAS in custom_data would be degraded but not visible here
                status="healthy" if ad_recovering else "degraded",
            ),
            "whatsapp_sync": self._healthy_baseline("whatsapp_sync"),
        }

    # ------------------------------------------------------------------
    # Task 3 – memory leak in whatsapp_sync under load
    # ------------------------------------------------------------------
    def _task3_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
        """whatsapp_sync worsens with every step until it is marked fixed."""
        fixed = "whatsapp_sync" in self._fixed_services
        # Memory climbs 50 MB per step until fixed (capped at 1800 MB).
        leaked_mb = min(128.0 + (step * 50.0), 1800.0)
        return {
            "ad_ranking": self._healthy_baseline("ad_ranking"),
            "capi_pipeline": self._healthy_baseline("capi_pipeline"),
            "whatsapp_sync": ServiceMetrics(
                cpu_percent=_jitter(10.0 if fixed else min(15 + step * 3, 90)),
                memory_mb=_jitter(256.0 if fixed else leaked_mb),
                # Errors only start once step exceeds 3.
                error_rate=0.0 if fixed else _jitter(0.05 * max(step - 3, 0)),
                p99_latency_ms=_jitter(35.0 if fixed else min(35 + step * 80, 8000)),
                request_queue=5 if fixed else min(5 + step * 20, 500),
                last_deploy="2026-04-22 18:30 UTC",
                status="healthy" if fixed else (
                    "critical" if leaked_mb > 1200 else "degraded"
                ),
            ),
        }

    # ------------------------------------------------------------------
    # Task 4 – bad migration cascades to all three services
    # ------------------------------------------------------------------
    def _task4_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
        """All three services recover together once whatsapp_sync is fixed."""
        migration_rolled_back = "whatsapp_sync" in self._fixed_services
        return {
            "ad_ranking": ServiceMetrics(
                cpu_percent=_jitter(12.0),
                memory_mb=_jitter(256.0),
                error_rate=0.0 if migration_rolled_back else _jitter(3.5),
                p99_latency_ms=_jitter(45.0 if migration_rolled_back else 2200.0),
                request_queue=3 if migration_rolled_back else 150,
                last_deploy="2026-04-23 02:00 UTC",
                status="healthy" if migration_rolled_back else "degraded",
            ),
            "capi_pipeline": ServiceMetrics(
                cpu_percent=_jitter(8.0),
                memory_mb=_jitter(180.0),
                error_rate=0.0 if migration_rolled_back else _jitter(2.1),
                p99_latency_ms=_jitter(20.0 if migration_rolled_back else 1100.0),
                request_queue=0 if migration_rolled_back else 80,
                last_deploy="2026-04-23 02:14 UTC",
                status="healthy" if migration_rolled_back else "degraded",
            ),
            "whatsapp_sync": ServiceMetrics(
                cpu_percent=_jitter(10.0),
                memory_mb=_jitter(200.0),
                error_rate=0.0 if migration_rolled_back else _jitter(8.0),
                p99_latency_ms=_jitter(35.0 if migration_rolled_back else 5000.0),
                request_queue=5 if migration_rolled_back else 400,
                last_deploy="2026-04-23 02:14 UTC",
                status="healthy" if migration_rolled_back else "critical",
            ),
        }

    # ------------------------------------------------------------------
    # Task 5 – PII data-leak (metrics look fine but security tests fail)
    # ------------------------------------------------------------------
    def _task5_metrics(self, step: int) -> Dict[str, ServiceMetrics]:
        """Every service looks healthy regardless of fix state.

        The metrics are the same whether or not capi_pipeline has been
        marked fixed; only the alerts change for this task.
        """
        return {
            "capi_pipeline": ServiceMetrics(
                cpu_percent=_jitter(8.0),
                memory_mb=_jitter(180.0),
                error_rate=0.0,  # no crash – silent security hole
                p99_latency_ms=_jitter(20.0),
                request_queue=0,
                last_deploy="2026-04-23 02:14 UTC",
                status="healthy",  # deliberately deceptive
            ),
            "ad_ranking": self._healthy_baseline("ad_ranking"),
            "whatsapp_sync": self._healthy_baseline("whatsapp_sync"),
        }

    # ------------------------------------------------------------------
    # Alerts
    # ------------------------------------------------------------------

    def get_alerts(self, step: int) -> List[Alert]:
        """Return the alerts active at ``step`` for the active task."""
        alert_map = {
            1: self._task1_alerts,
            2: self._task2_alerts,
            3: self._task3_alerts,
            4: self._task4_alerts,
            5: self._task5_alerts,
        }
        fn = alert_map.get(self._task_id, lambda s: [])
        return fn(step)

    def _task1_alerts(self, step: int) -> List[Alert]:
        """One P0 alert on ad_ranking until it is marked fixed."""
        if "ad_ranking" in self._fixed_services:
            return []
        return [
            Alert(
                alert_id="ALT-001",
                severity="P0",
                service="ad_ranking",
                message=(
                    "AttributeError: 'dict' object has no attribute 'get_clicks' "
                    "in ranker.py score_ads() — all ranking requests failing"
                ),
                triggered_at_step=0,
                is_red_herring=False,
            )
        ]

    def _task2_alerts(self, step: int) -> List[Alert]:
        """A real ROAS alert plus a red herring, until capi_pipeline is fixed."""
        alerts = []
        if "capi_pipeline" not in self._fixed_services:
            alerts.append(Alert(
                alert_id="ALT-002",
                severity="P1",
                service="ad_ranking",
                message="ROAS dropped 68% vs 7-day average — attribution model seeing events from 1970",
                triggered_at_step=0,
                is_red_herring=False,
            ))
            # Red herring – ad_ranking looks degraded but it's CAPI's fault
            alerts.append(Alert(
                alert_id="ALT-003",
                severity="P2",
                service="ad_ranking",
                message="High memory pressure on ad-ranking pod — possible cache thrash",
                triggered_at_step=0,
                is_red_herring=True,
            ))
        return alerts

    def _task3_alerts(self, step: int) -> List[Alert]:
        """Pool-exhaustion alert (P1, then P0 from step 4) plus a latency alert after step 3."""
        if "whatsapp_sync" in self._fixed_services:
            return []
        alerts = [Alert(
            alert_id="ALT-004",
            severity="P1" if step < 4 else "P0",
            service="whatsapp_sync",
            message=f"DB connection pool exhausted ({min(step * 20, 500)}/500 connections in use) — sync requests queuing",
            triggered_at_step=1,
            is_red_herring=False,
        )]
        if step > 3:
            alerts.append(Alert(
                alert_id="ALT-005",
                severity="P1",
                service="whatsapp_sync",
                message="p99 latency > 5 s — SLA breach imminent",
                triggered_at_step=4,
                is_red_herring=False,
            ))
        return alerts

    def _task4_alerts(self, step: int) -> List[Alert]:
        """The root-cause FK alert plus two red herrings, until whatsapp_sync is fixed."""
        if "whatsapp_sync" in self._fixed_services:
            return []
        return [
            Alert(
                alert_id="ALT-006",
                severity="P0",
                service="whatsapp_sync",
                message="FK violation: insert into user_preferences fails — migration 003 circular FK",
                triggered_at_step=0,
                is_red_herring=False,
            ),
            # Red herrings – symptoms of the underlying migration failure
            Alert(
                alert_id="ALT-007",
                severity="P1",
                service="ad_ranking",
                message="High error rate on /rank endpoint — upstream DB errors propagating",
                triggered_at_step=0,
                is_red_herring=True,
            ),
            Alert(
                alert_id="ALT-008",
                severity="P1",
                service="capi_pipeline",
                message="Event ingest latency spike — shared DB pool contention",
                triggered_at_step=0,
                is_red_herring=True,
            ),
        ]

    def _task5_alerts(self, step: int) -> List[Alert]:
        """A P0 security alert plus a red herring, until capi_pipeline is fixed."""
        if "capi_pipeline" in self._fixed_services:
            return []
        return [
            Alert(
                alert_id="ALT-009",
                severity="P0",
                service="capi_pipeline",
                message=(
                    "SECURITY: Unusual response payload size on /ingest (avg 14 KB vs 0.2 KB) "
                    "— possible PII exposure in debug response body"
                ),
                triggered_at_step=0,
                is_red_herring=False,
            ),
            Alert(
                alert_id="ALT-010",
                severity="P2",
                service="capi_pipeline",
                message="Slightly elevated memory on ingestor pod — likely buffer growth",
                triggered_at_step=0,
                is_red_herring=True,
            ),
        ]

    # ------------------------------------------------------------------
    # Terminal output (simulated stack traces / logs)
    # ------------------------------------------------------------------

    def get_terminal_output(self, step: int, last_test_result: Optional[str] = None) -> str:
        """Return the terminal text to show for the active task.

        Args:
            step: Current step; accepted for interface symmetry and not
                used in the selection.
            last_test_result: When non-empty it is returned verbatim,
                taking precedence over the simulated incident log.

        Returns:
            ``last_test_result`` if truthy, otherwise the canned log for
            the active task, or "[INFO] All systems operational" when the
            task has no canned log.
        """
        if last_test_result:
            return last_test_result
        return self._INCIDENT_LOGS.get(self._task_id, "[INFO] All systems operational")
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
# ---------------------------------------------------------------------------
|
| 419 |
+
# Difficulty Controller (Theme 4 – Self-Improvement Loop)
|
| 420 |
+
# ---------------------------------------------------------------------------
|
| 421 |
+
|
| 422 |
+
class DifficultyController:
    """
    After each episode, analyse which bug categories the agent failed on.
    Weight those categories higher so the next generated episode targets
    the agent's current weaknesses.

    ``self.state`` holds one numeric weight per bug category; the category
    names are the values of ``BUG_CATEGORY_MAP``.
    """

    # Task id -> name of the weight attribute on ``self.state``.
    BUG_CATEGORY_MAP: Dict[int, str] = {
        1: "data_corruption",   # hallucinated attribute
        2: "data_corruption",   # silent timestamp corruption
        3: "async_bugs",        # connection leak
        4: "red_herrings",      # cascading failure + red herrings
        5: "security_bugs",     # PII leak
    }

    def __init__(self):
        self.state = DifficultyState()

    def update(self, task_id: int, normalized_score: float) -> None:
        """Adjust the weight of the category ``task_id`` belongs to.

        A score below 0.5 multiplies the weight by 1.25 (capped at 3.0);
        a score above 0.8 multiplies it by 0.9 (floored at 0.3). Scores
        from 0.5 to 0.8 inclusive, and unknown task ids, change nothing.
        """
        category = self.BUG_CATEGORY_MAP.get(task_id)
        if category is None:
            return
        current = getattr(self.state, category)
        if normalized_score < 0.5:
            # Agent struggled – raise difficulty weight
            setattr(self.state, category, min(current * 1.25, 3.0))
        elif normalized_score > 0.8:
            # Agent mastered it – slightly reduce weight
            setattr(self.state, category, max(current * 0.9, 0.3))

    def next_task_id(self) -> int:
        """Sample the next task id, weighted by current category weights.

        Each task in ``BUG_CATEGORY_MAP`` is weighted by its category's
        weight, so tasks sharing a category each carry that full weight.
        """
        # Derived from BUG_CATEGORY_MAP so the two can never drift apart.
        task_ids = list(self.BUG_CATEGORY_MAP)
        weights = [
            getattr(self.state, self.BUG_CATEGORY_MAP[task_id])
            for task_id in task_ids
        ]
        # random.choices takes relative weights; no normalisation needed.
        return random.choices(task_ids, weights=weights, k=1)[0]

    def weakness_tags(self) -> List[str]:
        """Return the names of state fields whose value exceeds 0.7."""
        # Presumably ``state`` is a Pydantic model: prefer v2's model_dump()
        # when it exists and fall back to the v1-style dict() otherwise.
        dump = getattr(self.state, "model_dump", None) or self.state.dict
        return [name for name, weight in dump().items() if weight > 0.7]
|
app/engine/sandbox.py
ADDED
|
@@ -0,0 +1,1040 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 1 – Virtual File System.
|
| 3 |
+
|
| 4 |
+
Stores all service codebases as in-memory strings keyed by
|
| 5 |
+
(service, filename). Every task starts from a clean snapshot of
|
| 6 |
+
its own buggy codebase; edits accumulate on top of that snapshot.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
import copy
|
| 10 |
+
from typing import Dict, List, Optional, Tuple
|
| 11 |
+
|
| 12 |
+
# ---------------------------------------------------------------------------
|
| 13 |
+
# Buggy source code snapshots – one per task
|
| 14 |
+
# Each task mutates only the files it needs; unchanged files are shared via
|
| 15 |
+
# SHARED_FILES and merged in at reset() time.
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
# Baseline (non-buggy) files that every task shares, keyed first by service
# name and then by filename.  Each value is the complete text of one virtual
# source file.  Task-specific snapshots supply only the files they mutate;
# the entries below fill in everything else.
SHARED_FILES: Dict[str, Dict[str, str]] = {
    # --- ad_ranking: scoring helpers and plain data models ---------------
    "ad_ranking": {
        "utils.py": """\
from typing import Dict, List


def normalize_scores(ads: List[Dict]) -> List[Dict]:
    if not ads:
        return ads
    max_score = max(ad['score'] for ad in ads)
    min_score = min(ad['score'] for ad in ads)
    score_range = max_score - min_score or 1.0
    return [
        {**ad, 'normalized_score': (ad['score'] - min_score) / score_range}
        for ad in ads
    ]


def filter_by_budget(ads: List[Dict], daily_budget_cents: int) -> List[Dict]:
    return [ad for ad in ads if ad.get('spend_today_cents', 0) < daily_budget_cents]


def compute_roas(revenue: float, spend: float) -> float:
    return revenue / spend if spend > 0 else 0.0
""",
        "models.py": """\
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Ad:
    ad_id: str
    campaign_id: str
    category: str
    target_age: str
    clicks: int = 0
    impressions: int = 0
    spend_today_cents: int = 0
    active: bool = True
    score: float = 0.0


@dataclass
class UserContext:
    user_id: str
    interest: str
    age_group: str
    country: str
""",
    },
    # --- capi_pipeline: event payload validation --------------------------
    "capi_pipeline": {
        # NOTE: `Tuple` must be imported here.  The return annotation of
        # validate_event is evaluated when the function is defined, so a
        # missing import makes the whole virtual module fail with NameError
        # as soon as it is loaded.
        "validator.py": """\
from typing import Any, Dict, Tuple


REQUIRED_FIELDS = {'event_name', 'event_time', 'event_id'}


def validate_event(event: Dict[str, Any]) -> Tuple[bool, str]:
    missing = REQUIRED_FIELDS - set(event.keys())
    if missing:
        return False, f'Missing fields: {missing}'
    if not isinstance(event.get('event_time'), (int, float)):
        return False, 'event_time must be numeric'
    return True, 'ok'
""",
    },
    # --- whatsapp_sync: message record model ------------------------------
    "whatsapp_sync": {
        "models.py": """\
from dataclasses import dataclass
from typing import Optional


@dataclass
class Message:
    id: int
    user_id: int
    sender_id: int
    content: str
    timestamp: int
    synced: bool = False
    thread_id: Optional[int] = None
""",
    },
}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
# Task-specific buggy snapshots
|
| 108 |
+
# ---------------------------------------------------------------------------
|
| 109 |
+
|
| 110 |
+
TASK_SNAPSHOTS: Dict[int, Dict[str, Dict[str, str]]] = {
|
| 111 |
+
|
| 112 |
+
# ------------------------------------------------------------------
|
| 113 |
+
# Task 1 – Easy: Hallucinated attribute (ad.get_clicks())
|
| 114 |
+
# ------------------------------------------------------------------
|
| 115 |
+
1: {
|
| 116 |
+
"ad_ranking": {
|
| 117 |
+
"ranker.py": """\
|
| 118 |
+
import logging
|
| 119 |
+
from typing import List, Dict
|
| 120 |
+
|
| 121 |
+
logger = logging.getLogger(__name__)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class AdRanker:
|
| 125 |
+
\"\"\"Scores and ranks candidate ads for a user.\"\"\"
|
| 126 |
+
|
| 127 |
+
def __init__(self, api_client):
|
| 128 |
+
self.api = api_client
|
| 129 |
+
self.model_version = "v2.3.1"
|
| 130 |
+
self._cache = {}
|
| 131 |
+
|
| 132 |
+
def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
|
| 133 |
+
ads = self.api.get_all_ads(user_id)
|
| 134 |
+
return [ad for ad in ads if ad.get('active', False)]
|
| 135 |
+
|
| 136 |
+
def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
|
| 137 |
+
scored = []
|
| 138 |
+
for ad in ads:
|
| 139 |
+
click_rate = ad.get_clicks() / max(ad.get('impressions', 1), 1)
|
| 140 |
+
relevance = self._compute_relevance(ad, user_context)
|
| 141 |
+
score = (click_rate * 0.4) + (relevance * 0.6)
|
| 142 |
+
scored.append({**ad, 'score': round(score, 4)})
|
| 143 |
+
return sorted(scored, key=lambda x: x['score'], reverse=True)
|
| 144 |
+
|
| 145 |
+
def _compute_relevance(self, ad: Dict, context: Dict) -> float:
|
| 146 |
+
category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
|
| 147 |
+
age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
|
| 148 |
+
return round((category_match + age_match) / 2.0, 4)
|
| 149 |
+
|
| 150 |
+
def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
|
| 151 |
+
candidates = self.fetch_candidate_ads(user_id)
|
| 152 |
+
if not candidates:
|
| 153 |
+
logger.warning(f"No candidates for user {user_id}")
|
| 154 |
+
return []
|
| 155 |
+
return self.score_ads(candidates, user_context)
|
| 156 |
+
""",
|
| 157 |
+
},
|
| 158 |
+
},
|
| 159 |
+
|
| 160 |
+
# ------------------------------------------------------------------
|
| 161 |
+
# Task 2 – Medium: Silent timestamp corruption in CAPI → bad ROAS
|
| 162 |
+
# ------------------------------------------------------------------
|
| 163 |
+
2: {
|
| 164 |
+
"capi_pipeline": {
|
| 165 |
+
"transformer.py": """\
|
| 166 |
+
import logging
|
| 167 |
+
from typing import Dict, Any, List
|
| 168 |
+
from datetime import datetime
|
| 169 |
+
|
| 170 |
+
logger = logging.getLogger(__name__)
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class EventTransformer:
|
| 174 |
+
\"\"\"Transforms raw CAPI events into normalised format.\"\"\"
|
| 175 |
+
|
| 176 |
+
SUPPORTED_EVENTS = {
|
| 177 |
+
'Purchase', 'AddToCart', 'ViewContent', 'Lead', 'CompleteRegistration'
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
def __init__(self):
|
| 181 |
+
self._processed_count = 0
|
| 182 |
+
|
| 183 |
+
def transform(self, raw_event: Dict[str, Any]) -> Dict[str, Any]:
|
| 184 |
+
if raw_event.get('event_name') not in self.SUPPORTED_EVENTS:
|
| 185 |
+
logger.warning(f"Unknown event type: {raw_event.get('event_name')}")
|
| 186 |
+
return None
|
| 187 |
+
|
| 188 |
+
event_time = self._normalize_timestamp(raw_event.get('event_time', 0))
|
| 189 |
+
|
| 190 |
+
transformed = {
|
| 191 |
+
'event_id': raw_event.get('event_id'),
|
| 192 |
+
'event_name': raw_event.get('event_name'),
|
| 193 |
+
'event_time': event_time,
|
| 194 |
+
'user_data': self._hash_user_data(raw_event.get('user_data', {})),
|
| 195 |
+
'custom_data': raw_event.get('custom_data', {}),
|
| 196 |
+
'processed_at': int(datetime.utcnow().timestamp()),
|
| 197 |
+
}
|
| 198 |
+
self._processed_count += 1
|
| 199 |
+
return transformed
|
| 200 |
+
|
| 201 |
+
def _normalize_timestamp(self, ts: Any) -> int:
|
| 202 |
+
\"\"\"Normalise event timestamp to Unix seconds.\"\"\"
|
| 203 |
+
ts = int(ts)
|
| 204 |
+
# BUG: threshold is 1_000_000_000 (10 digits) instead of
|
| 205 |
+
# 1_000_000_000_000 (13 digits for milliseconds).
|
| 206 |
+
# A normal unix-second timestamp like 1_700_000_000 passes the
|
| 207 |
+
# condition and gets divided by 1000 → year ~1970+20 days.
|
| 208 |
+
if ts > 1_000_000_000:
|
| 209 |
+
return ts // 1000
|
| 210 |
+
return ts
|
| 211 |
+
|
| 212 |
+
def _hash_user_data(self, user_data: Dict) -> Dict:
|
| 213 |
+
import hashlib
|
| 214 |
+
hashed = {}
|
| 215 |
+
for key, val in user_data.items():
|
| 216 |
+
if key in ('email', 'phone', 'fn', 'ln'):
|
| 217 |
+
hashed[key] = hashlib.sha256(
|
| 218 |
+
str(val).lower().encode()
|
| 219 |
+
).hexdigest()
|
| 220 |
+
else:
|
| 221 |
+
hashed[key] = val
|
| 222 |
+
return hashed
|
| 223 |
+
|
| 224 |
+
def batch_transform(self, events: List[Dict]) -> List[Dict]:
|
| 225 |
+
return [t for e in events if (t := self.transform(e)) is not None]
|
| 226 |
+
""",
|
| 227 |
+
"ingestor.py": """\
|
| 228 |
+
import logging
|
| 229 |
+
from typing import Dict, Any
|
| 230 |
+
from .transformer import EventTransformer
|
| 231 |
+
|
| 232 |
+
logger = logging.getLogger(__name__)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
class EventIngestor:
|
| 236 |
+
\"\"\"Ingests and validates CAPI event payloads.\"\"\"
|
| 237 |
+
|
| 238 |
+
def __init__(self, transformer: EventTransformer):
|
| 239 |
+
self.transformer = transformer
|
| 240 |
+
self._event_buffer = []
|
| 241 |
+
|
| 242 |
+
def ingest(self, raw_payload: Dict[str, Any]) -> Dict[str, Any]:
|
| 243 |
+
try:
|
| 244 |
+
events = raw_payload.get('data', [])
|
| 245 |
+
if not events:
|
| 246 |
+
return {'status': 'error', 'message': 'No events in payload'}
|
| 247 |
+
transformed = self.transformer.batch_transform(events)
|
| 248 |
+
self._event_buffer.extend(transformed)
|
| 249 |
+
return {'status': 'ok', 'processed': len(transformed)}
|
| 250 |
+
except Exception as e:
|
| 251 |
+
logger.error(f"Ingest failed: {e}", exc_info=True)
|
| 252 |
+
return {'status': 'error', 'message': str(e)}
|
| 253 |
+
|
| 254 |
+
def flush(self) -> int:
|
| 255 |
+
count = len(self._event_buffer)
|
| 256 |
+
self._event_buffer.clear()
|
| 257 |
+
logger.info(f"Flushed {count} events")
|
| 258 |
+
return count
|
| 259 |
+
""",
|
| 260 |
+
},
|
| 261 |
+
"ad_ranking": {
|
| 262 |
+
"ranker.py": """\
|
| 263 |
+
import logging
|
| 264 |
+
from typing import List, Dict
|
| 265 |
+
|
| 266 |
+
logger = logging.getLogger(__name__)
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
class AdRanker:
|
| 270 |
+
\"\"\"Scores and ranks candidate ads for a user.\"\"\"
|
| 271 |
+
|
| 272 |
+
def __init__(self, api_client):
|
| 273 |
+
self.api = api_client
|
| 274 |
+
self.model_version = "v2.3.1"
|
| 275 |
+
|
| 276 |
+
def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
|
| 277 |
+
ads = self.api.get_all_ads(user_id)
|
| 278 |
+
return [ad for ad in ads if ad.get('active', False)]
|
| 279 |
+
|
| 280 |
+
def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
|
| 281 |
+
scored = []
|
| 282 |
+
for ad in ads:
|
| 283 |
+
click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)
|
| 284 |
+
relevance = self._compute_relevance(ad, user_context)
|
| 285 |
+
score = (click_rate * 0.4) + (relevance * 0.6)
|
| 286 |
+
scored.append({**ad, 'score': round(score, 4)})
|
| 287 |
+
return sorted(scored, key=lambda x: x['score'], reverse=True)
|
| 288 |
+
|
| 289 |
+
def _compute_relevance(self, ad: Dict, context: Dict) -> float:
|
| 290 |
+
category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
|
| 291 |
+
age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
|
| 292 |
+
return round((category_match + age_match) / 2.0, 4)
|
| 293 |
+
|
| 294 |
+
def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
|
| 295 |
+
candidates = self.fetch_candidate_ads(user_id)
|
| 296 |
+
if not candidates:
|
| 297 |
+
logger.warning(f"No candidates for user {user_id}")
|
| 298 |
+
return []
|
| 299 |
+
return self.score_ads(candidates, user_context)
|
| 300 |
+
""",
|
| 301 |
+
},
|
| 302 |
+
},
|
| 303 |
+
|
| 304 |
+
# ------------------------------------------------------------------
|
| 305 |
+
# Task 3 – Medium-Hard: DB connection leak in WhatsApp sync handler
|
| 306 |
+
# ------------------------------------------------------------------
|
| 307 |
+
3: {
|
| 308 |
+
"whatsapp_sync": {
|
| 309 |
+
"handler.py": """\
|
| 310 |
+
import asyncio
|
| 311 |
+
import logging
|
| 312 |
+
from typing import List, Dict
|
| 313 |
+
|
| 314 |
+
logger = logging.getLogger(__name__)
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
class MessageSyncHandler:
|
| 318 |
+
\"\"\"Handles real-time WhatsApp message synchronisation.\"\"\"
|
| 319 |
+
|
| 320 |
+
def __init__(self, db_pool, message_queue):
|
| 321 |
+
self.db_pool = db_pool
|
| 322 |
+
self.queue = message_queue
|
| 323 |
+
self._sync_count = 0
|
| 324 |
+
|
| 325 |
+
async def sync_user_messages(self, user_id: str) -> List[Dict]:
|
| 326 |
+
\"\"\"Fetch and mark-as-synced all pending messages for a user.\"\"\"
|
| 327 |
+
conn = await self.db_pool.acquire()
|
| 328 |
+
try:
|
| 329 |
+
messages = await conn.fetch(
|
| 330 |
+
"SELECT id, content, sender_id, timestamp "
|
| 331 |
+
"FROM messages WHERE user_id = $1 AND synced = FALSE "
|
| 332 |
+
"ORDER BY timestamp",
|
| 333 |
+
user_id,
|
| 334 |
+
)
|
| 335 |
+
processed = []
|
| 336 |
+
for msg in messages:
|
| 337 |
+
await conn.execute(
|
| 338 |
+
"UPDATE messages SET synced = TRUE WHERE id = $1",
|
| 339 |
+
msg['id'],
|
| 340 |
+
)
|
| 341 |
+
processed.append(dict(msg))
|
| 342 |
+
self._sync_count += len(processed)
|
| 343 |
+
return processed
|
| 344 |
+
except Exception as e:
|
| 345 |
+
logger.error(f"Sync failed for user {user_id}: {e}")
|
| 346 |
+
raise
|
| 347 |
+
# BUG: missing `finally: await self.db_pool.release(conn)`
|
| 348 |
+
# Under load the pool exhausts → all sync requests hang indefinitely.
|
| 349 |
+
|
| 350 |
+
async def process_queue(self, batch_size: int = 50) -> int:
|
| 351 |
+
processed = 0
|
| 352 |
+
while processed < batch_size:
|
| 353 |
+
try:
|
| 354 |
+
user_id = await asyncio.wait_for(
|
| 355 |
+
self.queue.get(), timeout=1.0
|
| 356 |
+
)
|
| 357 |
+
await self.sync_user_messages(user_id)
|
| 358 |
+
processed += 1
|
| 359 |
+
except asyncio.TimeoutError:
|
| 360 |
+
break
|
| 361 |
+
return processed
|
| 362 |
+
""",
|
| 363 |
+
"db.py": """\
|
| 364 |
+
import logging
|
| 365 |
+
from typing import Dict, List
|
| 366 |
+
|
| 367 |
+
logger = logging.getLogger(__name__)
|
| 368 |
+
|
| 369 |
+
MIGRATIONS: List[Dict] = [
|
| 370 |
+
{
|
| 371 |
+
"version": "001",
|
| 372 |
+
"description": "Create messages table",
|
| 373 |
+
"up": (
|
| 374 |
+
"CREATE TABLE messages ("
|
| 375 |
+
" id SERIAL PRIMARY KEY,"
|
| 376 |
+
" user_id INTEGER NOT NULL,"
|
| 377 |
+
" content TEXT,"
|
| 378 |
+
" sender_id INTEGER,"
|
| 379 |
+
" timestamp BIGINT,"
|
| 380 |
+
" synced BOOLEAN DEFAULT FALSE"
|
| 381 |
+
");"
|
| 382 |
+
),
|
| 383 |
+
},
|
| 384 |
+
]
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
class MigrationRunner:
|
| 388 |
+
def __init__(self, db_conn):
|
| 389 |
+
self.conn = db_conn
|
| 390 |
+
self._applied: List[str] = []
|
| 391 |
+
|
| 392 |
+
async def apply(self, migration: Dict) -> bool:
|
| 393 |
+
await self.conn.execute(migration['up'])
|
| 394 |
+
self._applied.append(migration['version'])
|
| 395 |
+
logger.info(f"Applied migration {migration['version']}")
|
| 396 |
+
return True
|
| 397 |
+
""",
|
| 398 |
+
},
|
| 399 |
+
},
|
| 400 |
+
|
| 401 |
+
# ------------------------------------------------------------------
|
| 402 |
+
# Task 4 – Hard: Red-herring cascade from a bad DB migration (003)
|
| 403 |
+
# ------------------------------------------------------------------
|
| 404 |
+
4: {
|
| 405 |
+
"whatsapp_sync": {
|
| 406 |
+
"db.py": """\
|
| 407 |
+
import logging
|
| 408 |
+
from typing import Dict, List
|
| 409 |
+
|
| 410 |
+
logger = logging.getLogger(__name__)
|
| 411 |
+
|
| 412 |
+
# Migration 003 introduces a circular FK:
|
| 413 |
+
# message_threads.parent_message_id → messages.id
|
| 414 |
+
# messages.thread_id → message_threads.id
|
| 415 |
+
# PostgreSQL refuses the self-referential constraint during ALTER TABLE,
|
| 416 |
+
# causing FK violation errors that cascade to all consumers of both tables.
|
| 417 |
+
|
| 418 |
+
MIGRATIONS: List[Dict] = [
|
| 419 |
+
{
|
| 420 |
+
"version": "001",
|
| 421 |
+
"description": "Create messages table",
|
| 422 |
+
"up": (
|
| 423 |
+
"CREATE TABLE IF NOT EXISTS messages ("
|
| 424 |
+
" id SERIAL PRIMARY KEY,"
|
| 425 |
+
" user_id INTEGER NOT NULL,"
|
| 426 |
+
" content TEXT,"
|
| 427 |
+
" sender_id INTEGER,"
|
| 428 |
+
" timestamp BIGINT,"
|
| 429 |
+
" synced BOOLEAN DEFAULT FALSE"
|
| 430 |
+
");"
|
| 431 |
+
),
|
| 432 |
+
},
|
| 433 |
+
{
|
| 434 |
+
"version": "002",
|
| 435 |
+
"description": "Add user preferences",
|
| 436 |
+
"up": (
|
| 437 |
+
"CREATE TABLE IF NOT EXISTS user_preferences ("
|
| 438 |
+
" id SERIAL PRIMARY KEY,"
|
| 439 |
+
" user_id INTEGER NOT NULL,"
|
| 440 |
+
" notification_enabled BOOLEAN DEFAULT TRUE,"
|
| 441 |
+
" sync_frequency INTEGER DEFAULT 30"
|
| 442 |
+
");"
|
| 443 |
+
),
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"version": "003",
|
| 447 |
+
"description": "Add message threads with back-reference",
|
| 448 |
+
"up": (
|
| 449 |
+
"CREATE TABLE IF NOT EXISTS message_threads ("
|
| 450 |
+
" id SERIAL PRIMARY KEY,"
|
| 451 |
+
" parent_message_id INTEGER REFERENCES messages(id) ON DELETE CASCADE,"
|
| 452 |
+
" participant_ids INTEGER[] NOT NULL,"
|
| 453 |
+
" created_at BIGINT"
|
| 454 |
+
");"
|
| 455 |
+
"ALTER TABLE messages"
|
| 456 |
+
" ADD COLUMN thread_id INTEGER REFERENCES message_threads(id);"
|
| 457 |
+
),
|
| 458 |
+
# BUG: circular FK — messages → message_threads → messages
|
| 459 |
+
# Fix: remove the ALTER TABLE line (messages should NOT reference threads)
|
| 460 |
+
},
|
| 461 |
+
]
|
| 462 |
+
|
| 463 |
+
|
| 464 |
+
class MigrationRunner:
|
| 465 |
+
def __init__(self, db_conn):
|
| 466 |
+
self.conn = db_conn
|
| 467 |
+
self._applied: List[str] = []
|
| 468 |
+
|
| 469 |
+
async def apply(self, migration: Dict) -> bool:
|
| 470 |
+
await self.conn.execute(migration['up'])
|
| 471 |
+
self._applied.append(migration['version'])
|
| 472 |
+
logger.info(f"Applied migration {migration['version']}: {migration['description']}")
|
| 473 |
+
return True
|
| 474 |
+
|
| 475 |
+
async def rollback_version(self, version: str) -> bool:
|
| 476 |
+
logger.warning(f"Rolling back migration {version}")
|
| 477 |
+
self._applied = [v for v in self._applied if v != version]
|
| 478 |
+
return True
|
| 479 |
+
|
| 480 |
+
async def run_all(self):
|
| 481 |
+
for migration in MIGRATIONS:
|
| 482 |
+
await self.apply(migration)
|
| 483 |
+
""",
|
| 484 |
+
"handler.py": """\
|
| 485 |
+
import asyncio
|
| 486 |
+
import logging
|
| 487 |
+
from typing import List, Dict
|
| 488 |
+
|
| 489 |
+
logger = logging.getLogger(__name__)
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
class MessageSyncHandler:
|
| 493 |
+
def __init__(self, db_pool, message_queue):
|
| 494 |
+
self.db_pool = db_pool
|
| 495 |
+
self.queue = message_queue
|
| 496 |
+
self._sync_count = 0
|
| 497 |
+
|
| 498 |
+
async def sync_user_messages(self, user_id: str) -> List[Dict]:
|
| 499 |
+
conn = await self.db_pool.acquire()
|
| 500 |
+
try:
|
| 501 |
+
messages = await conn.fetch(
|
| 502 |
+
"SELECT id, content, sender_id, timestamp "
|
| 503 |
+
"FROM messages WHERE user_id = $1 AND synced = FALSE "
|
| 504 |
+
"ORDER BY timestamp",
|
| 505 |
+
user_id,
|
| 506 |
+
)
|
| 507 |
+
processed = []
|
| 508 |
+
for msg in messages:
|
| 509 |
+
await conn.execute(
|
| 510 |
+
"UPDATE messages SET synced = TRUE WHERE id = $1",
|
| 511 |
+
msg['id'],
|
| 512 |
+
)
|
| 513 |
+
processed.append(dict(msg))
|
| 514 |
+
self._sync_count += len(processed)
|
| 515 |
+
return processed
|
| 516 |
+
except Exception as e:
|
| 517 |
+
logger.error(f"Sync failed for user {user_id}: {e}")
|
| 518 |
+
raise
|
| 519 |
+
finally:
|
| 520 |
+
await self.db_pool.release(conn)
|
| 521 |
+
|
| 522 |
+
async def process_queue(self, batch_size: int = 50) -> int:
|
| 523 |
+
processed = 0
|
| 524 |
+
while processed < batch_size:
|
| 525 |
+
try:
|
| 526 |
+
user_id = await asyncio.wait_for(
|
| 527 |
+
self.queue.get(), timeout=1.0
|
| 528 |
+
)
|
| 529 |
+
await self.sync_user_messages(user_id)
|
| 530 |
+
processed += 1
|
| 531 |
+
except asyncio.TimeoutError:
|
| 532 |
+
break
|
| 533 |
+
return processed
|
| 534 |
+
""",
|
| 535 |
+
},
|
| 536 |
+
"capi_pipeline": {
|
| 537 |
+
"ingestor.py": """\
|
| 538 |
+
import logging
|
| 539 |
+
from typing import Dict, Any
|
| 540 |
+
from .transformer import EventTransformer
|
| 541 |
+
|
| 542 |
+
logger = logging.getLogger(__name__)
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
class EventIngestor:
|
| 546 |
+
def __init__(self, transformer: EventTransformer):
|
| 547 |
+
self.transformer = transformer
|
| 548 |
+
self._event_buffer = []
|
| 549 |
+
|
| 550 |
+
def ingest(self, raw_payload: Dict[str, Any]) -> Dict[str, Any]:
|
| 551 |
+
try:
|
| 552 |
+
events = raw_payload.get('data', [])
|
| 553 |
+
if not events:
|
| 554 |
+
return {'status': 'error', 'message': 'No events in payload'}
|
| 555 |
+
transformed = self.transformer.batch_transform(events)
|
| 556 |
+
self._event_buffer.extend(transformed)
|
| 557 |
+
return {'status': 'ok', 'processed': len(transformed)}
|
| 558 |
+
except Exception as e:
|
| 559 |
+
logger.error(f"Ingest failed: {e}", exc_info=True)
|
| 560 |
+
return {'status': 'error', 'message': str(e)}
|
| 561 |
+
|
| 562 |
+
def flush(self) -> int:
|
| 563 |
+
count = len(self._event_buffer)
|
| 564 |
+
self._event_buffer.clear()
|
| 565 |
+
return count
|
| 566 |
+
""",
|
| 567 |
+
"transformer.py": """\
|
| 568 |
+
import logging
|
| 569 |
+
from typing import Dict, Any, List
|
| 570 |
+
from datetime import datetime
|
| 571 |
+
|
| 572 |
+
logger = logging.getLogger(__name__)
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
class EventTransformer:
|
| 576 |
+
SUPPORTED_EVENTS = {
|
| 577 |
+
'Purchase', 'AddToCart', 'ViewContent', 'Lead', 'CompleteRegistration'
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
def __init__(self):
|
| 581 |
+
self._processed_count = 0
|
| 582 |
+
|
| 583 |
+
def transform(self, raw_event: Dict[str, Any]) -> Dict[str, Any]:
|
| 584 |
+
if raw_event.get('event_name') not in self.SUPPORTED_EVENTS:
|
| 585 |
+
return None
|
| 586 |
+
event_time = self._normalize_timestamp(raw_event.get('event_time', 0))
|
| 587 |
+
transformed = {
|
| 588 |
+
'event_id': raw_event.get('event_id'),
|
| 589 |
+
'event_name': raw_event.get('event_name'),
|
| 590 |
+
'event_time': event_time,
|
| 591 |
+
'user_data': raw_event.get('user_data', {}),
|
| 592 |
+
'custom_data': raw_event.get('custom_data', {}),
|
| 593 |
+
'processed_at': int(datetime.utcnow().timestamp()),
|
| 594 |
+
}
|
| 595 |
+
self._processed_count += 1
|
| 596 |
+
return transformed
|
| 597 |
+
|
| 598 |
+
def _normalize_timestamp(self, ts: Any) -> int:
|
| 599 |
+
ts = int(ts)
|
| 600 |
+
if ts > 1_000_000_000_000:
|
| 601 |
+
return ts // 1000
|
| 602 |
+
return ts
|
| 603 |
+
|
| 604 |
+
def batch_transform(self, events: List[Dict]) -> List[Dict]:
|
| 605 |
+
return [t for e in events if (t := self.transform(e)) is not None]
|
| 606 |
+
""",
|
| 607 |
+
},
|
| 608 |
+
"ad_ranking": {
|
| 609 |
+
"ranker.py": """\
|
| 610 |
+
import logging
|
| 611 |
+
from typing import List, Dict
|
| 612 |
+
|
| 613 |
+
logger = logging.getLogger(__name__)
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
class AdRanker:
|
| 617 |
+
def __init__(self, api_client):
|
| 618 |
+
self.api = api_client
|
| 619 |
+
self.model_version = "v2.3.1"
|
| 620 |
+
|
| 621 |
+
def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
|
| 622 |
+
ads = self.api.get_all_ads(user_id)
|
| 623 |
+
return [ad for ad in ads if ad.get('active', False)]
|
| 624 |
+
|
| 625 |
+
def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
|
| 626 |
+
scored = []
|
| 627 |
+
for ad in ads:
|
| 628 |
+
click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)
|
| 629 |
+
relevance = self._compute_relevance(ad, user_context)
|
| 630 |
+
score = (click_rate * 0.4) + (relevance * 0.6)
|
| 631 |
+
scored.append({**ad, 'score': round(score, 4)})
|
| 632 |
+
return sorted(scored, key=lambda x: x['score'], reverse=True)
|
| 633 |
+
|
| 634 |
+
def _compute_relevance(self, ad: Dict, context: Dict) -> float:
|
| 635 |
+
category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
|
| 636 |
+
age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
|
| 637 |
+
return round((category_match + age_match) / 2.0, 4)
|
| 638 |
+
|
| 639 |
+
def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
|
| 640 |
+
candidates = self.fetch_candidate_ads(user_id)
|
| 641 |
+
if not candidates:
|
| 642 |
+
return []
|
| 643 |
+
return self.score_ads(candidates, user_context)
|
| 644 |
+
""",
|
| 645 |
+
},
|
| 646 |
+
},
|
| 647 |
+
|
| 648 |
+
# ------------------------------------------------------------------
|
| 649 |
+
# Task 5 – Hard: PII data-leak via DEBUG_MODE=True in production
|
| 650 |
+
# ------------------------------------------------------------------
|
| 651 |
+
5: {
|
| 652 |
+
"capi_pipeline": {
|
| 653 |
+
"ingestor.py": """\
|
| 654 |
+
import logging
|
| 655 |
+
from typing import Dict, Any
|
| 656 |
+
from .transformer import EventTransformer
|
| 657 |
+
|
| 658 |
+
logger = logging.getLogger(__name__)
|
| 659 |
+
|
| 660 |
+
DEBUG_MODE = True # BUG: must be False in production – leaks raw user PII
|
| 661 |
+
|
| 662 |
+
|
| 663 |
+
class EventIngestor:
|
| 664 |
+
\"\"\"Ingests and validates CAPI event payloads.\"\"\"
|
| 665 |
+
|
| 666 |
+
def __init__(self, transformer: EventTransformer):
|
| 667 |
+
self.transformer = transformer
|
| 668 |
+
self._event_buffer = []
|
| 669 |
+
|
| 670 |
+
def ingest(self, raw_payload: Dict[str, Any]) -> Dict[str, Any]:
|
| 671 |
+
try:
|
| 672 |
+
events = raw_payload.get('data', [])
|
| 673 |
+
if not events:
|
| 674 |
+
return {'status': 'error', 'message': 'No events in payload'}
|
| 675 |
+
|
| 676 |
+
transformed = self.transformer.batch_transform(events)
|
| 677 |
+
self._event_buffer.extend(transformed)
|
| 678 |
+
|
| 679 |
+
if DEBUG_MODE:
|
| 680 |
+
# SECURITY BUG: exposes raw PII (emails, phone numbers) in the
|
| 681 |
+
# HTTP response – visible in CDN logs, browser network tabs, etc.
|
| 682 |
+
return {
|
| 683 |
+
'status': 'ok',
|
| 684 |
+
'processed': len(transformed),
|
| 685 |
+
'debug_data': {
|
| 686 |
+
'raw_payload': raw_payload,
|
| 687 |
+
'user_emails': [e.get('user_data', {}) for e in events],
|
| 688 |
+
'buffer_state': self._event_buffer,
|
| 689 |
+
},
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
return {'status': 'ok', 'processed': len(transformed)}
|
| 693 |
+
|
| 694 |
+
except Exception as e:
|
| 695 |
+
logger.error(f"Ingest failed: {e}", exc_info=True)
|
| 696 |
+
return {'status': 'error', 'message': str(e)}
|
| 697 |
+
|
| 698 |
+
def flush(self) -> int:
|
| 699 |
+
count = len(self._event_buffer)
|
| 700 |
+
self._event_buffer.clear()
|
| 701 |
+
logger.info(f"Flushed {count} events")
|
| 702 |
+
return count
|
| 703 |
+
""",
|
| 704 |
+
"transformer.py": """\
|
| 705 |
+
import logging
|
| 706 |
+
from typing import Dict, Any, List
|
| 707 |
+
from datetime import datetime
|
| 708 |
+
|
| 709 |
+
logger = logging.getLogger(__name__)
|
| 710 |
+
|
| 711 |
+
|
| 712 |
+
class EventTransformer:
|
| 713 |
+
SUPPORTED_EVENTS = {
|
| 714 |
+
'Purchase', 'AddToCart', 'ViewContent', 'Lead', 'CompleteRegistration'
|
| 715 |
+
}
|
| 716 |
+
|
| 717 |
+
def __init__(self):
|
| 718 |
+
self._processed_count = 0
|
| 719 |
+
|
| 720 |
+
def transform(self, raw_event: Dict[str, Any]) -> Dict[str, Any]:
|
| 721 |
+
if raw_event.get('event_name') not in self.SUPPORTED_EVENTS:
|
| 722 |
+
return None
|
| 723 |
+
event_time = self._normalize_timestamp(raw_event.get('event_time', 0))
|
| 724 |
+
transformed = {
|
| 725 |
+
'event_id': raw_event.get('event_id'),
|
| 726 |
+
'event_name': raw_event.get('event_name'),
|
| 727 |
+
'event_time': event_time,
|
| 728 |
+
'user_data': self._hash_user_data(raw_event.get('user_data', {})),
|
| 729 |
+
'custom_data': raw_event.get('custom_data', {}),
|
| 730 |
+
'processed_at': int(datetime.utcnow().timestamp()),
|
| 731 |
+
}
|
| 732 |
+
self._processed_count += 1
|
| 733 |
+
return transformed
|
| 734 |
+
|
| 735 |
+
def _normalize_timestamp(self, ts: Any) -> int:
|
| 736 |
+
ts = int(ts)
|
| 737 |
+
if ts > 1_000_000_000_000:
|
| 738 |
+
return ts // 1000
|
| 739 |
+
return ts
|
| 740 |
+
|
| 741 |
+
def _hash_user_data(self, user_data: Dict) -> Dict:
|
| 742 |
+
import hashlib
|
| 743 |
+
hashed = {}
|
| 744 |
+
for key, val in user_data.items():
|
| 745 |
+
if key in ('email', 'phone', 'fn', 'ln'):
|
| 746 |
+
hashed[key] = hashlib.sha256(
|
| 747 |
+
str(val).lower().encode()
|
| 748 |
+
).hexdigest()
|
| 749 |
+
else:
|
| 750 |
+
hashed[key] = val
|
| 751 |
+
return hashed
|
| 752 |
+
|
| 753 |
+
def batch_transform(self, events: List[Dict]) -> List[Dict]:
|
| 754 |
+
return [t for e in events if (t := self.transform(e)) is not None]
|
| 755 |
+
""",
|
| 756 |
+
},
|
| 757 |
+
"ad_ranking": {
|
| 758 |
+
"ranker.py": """\
|
| 759 |
+
import logging
|
| 760 |
+
from typing import List, Dict
|
| 761 |
+
|
| 762 |
+
logger = logging.getLogger(__name__)
|
| 763 |
+
|
| 764 |
+
|
| 765 |
+
class AdRanker:
|
| 766 |
+
def __init__(self, api_client):
|
| 767 |
+
self.api = api_client
|
| 768 |
+
self.model_version = "v2.3.1"
|
| 769 |
+
|
| 770 |
+
def fetch_candidate_ads(self, user_id: str) -> List[Dict]:
|
| 771 |
+
ads = self.api.get_all_ads(user_id)
|
| 772 |
+
return [ad for ad in ads if ad.get('active', False)]
|
| 773 |
+
|
| 774 |
+
def score_ads(self, ads: List[Dict], user_context: Dict) -> List[Dict]:
|
| 775 |
+
scored = []
|
| 776 |
+
for ad in ads:
|
| 777 |
+
click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)
|
| 778 |
+
relevance = self._compute_relevance(ad, user_context)
|
| 779 |
+
score = (click_rate * 0.4) + (relevance * 0.6)
|
| 780 |
+
scored.append({**ad, 'score': round(score, 4)})
|
| 781 |
+
return sorted(scored, key=lambda x: x['score'], reverse=True)
|
| 782 |
+
|
| 783 |
+
def _compute_relevance(self, ad: Dict, context: Dict) -> float:
|
| 784 |
+
category_match = 1.0 if ad.get('category') == context.get('interest') else 0.3
|
| 785 |
+
age_match = 1.0 if ad.get('target_age') == context.get('age_group') else 0.5
|
| 786 |
+
return round((category_match + age_match) / 2.0, 4)
|
| 787 |
+
|
| 788 |
+
def rank(self, user_id: str, user_context: Dict) -> List[Dict]:
|
| 789 |
+
candidates = self.fetch_candidate_ads(user_id)
|
| 790 |
+
if not candidates:
|
| 791 |
+
return []
|
| 792 |
+
return self.score_ads(candidates, user_context)
|
| 793 |
+
""",
|
| 794 |
+
},
|
| 795 |
+
"whatsapp_sync": {
|
| 796 |
+
"handler.py": """\
|
| 797 |
+
import asyncio
|
| 798 |
+
import logging
|
| 799 |
+
from typing import List, Dict
|
| 800 |
+
|
| 801 |
+
logger = logging.getLogger(__name__)
|
| 802 |
+
|
| 803 |
+
|
| 804 |
+
class MessageSyncHandler:
|
| 805 |
+
def __init__(self, db_pool, message_queue):
|
| 806 |
+
self.db_pool = db_pool
|
| 807 |
+
self.queue = message_queue
|
| 808 |
+
self._sync_count = 0
|
| 809 |
+
|
| 810 |
+
async def sync_user_messages(self, user_id: str) -> List[Dict]:
|
| 811 |
+
conn = await self.db_pool.acquire()
|
| 812 |
+
try:
|
| 813 |
+
messages = await conn.fetch(
|
| 814 |
+
"SELECT id, content, sender_id, timestamp "
|
| 815 |
+
"FROM messages WHERE user_id = $1 AND synced = FALSE "
|
| 816 |
+
"ORDER BY timestamp",
|
| 817 |
+
user_id,
|
| 818 |
+
)
|
| 819 |
+
processed = []
|
| 820 |
+
for msg in messages:
|
| 821 |
+
await conn.execute(
|
| 822 |
+
"UPDATE messages SET synced = TRUE WHERE id = $1",
|
| 823 |
+
msg['id'],
|
| 824 |
+
)
|
| 825 |
+
processed.append(dict(msg))
|
| 826 |
+
self._sync_count += len(processed)
|
| 827 |
+
return processed
|
| 828 |
+
except Exception as e:
|
| 829 |
+
logger.error(f"Sync failed for user {user_id}: {e}")
|
| 830 |
+
raise
|
| 831 |
+
finally:
|
| 832 |
+
await self.db_pool.release(conn)
|
| 833 |
+
|
| 834 |
+
async def process_queue(self, batch_size: int = 50) -> int:
|
| 835 |
+
processed = 0
|
| 836 |
+
while processed < batch_size:
|
| 837 |
+
try:
|
| 838 |
+
user_id = await asyncio.wait_for(
|
| 839 |
+
self.queue.get(), timeout=1.0
|
| 840 |
+
)
|
| 841 |
+
await self.sync_user_messages(user_id)
|
| 842 |
+
processed += 1
|
| 843 |
+
except asyncio.TimeoutError:
|
| 844 |
+
break
|
| 845 |
+
return processed
|
| 846 |
+
""",
|
| 847 |
+
"db.py": """\
|
| 848 |
+
import logging
|
| 849 |
+
from typing import Dict, List
|
| 850 |
+
|
| 851 |
+
logger = logging.getLogger(__name__)
|
| 852 |
+
|
| 853 |
+
MIGRATIONS: List[Dict] = [
|
| 854 |
+
{
|
| 855 |
+
"version": "001",
|
| 856 |
+
"description": "Create messages table",
|
| 857 |
+
"up": (
|
| 858 |
+
"CREATE TABLE IF NOT EXISTS messages ("
|
| 859 |
+
" id SERIAL PRIMARY KEY,"
|
| 860 |
+
" user_id INTEGER NOT NULL,"
|
| 861 |
+
" content TEXT,"
|
| 862 |
+
" sender_id INTEGER,"
|
| 863 |
+
" timestamp BIGINT,"
|
| 864 |
+
" synced BOOLEAN DEFAULT FALSE"
|
| 865 |
+
");"
|
| 866 |
+
),
|
| 867 |
+
},
|
| 868 |
+
]
|
| 869 |
+
|
| 870 |
+
|
| 871 |
+
class MigrationRunner:
|
| 872 |
+
def __init__(self, db_conn):
|
| 873 |
+
self.conn = db_conn
|
| 874 |
+
self._applied: List[str] = []
|
| 875 |
+
|
| 876 |
+
async def apply(self, migration: Dict) -> bool:
|
| 877 |
+
await self.conn.execute(migration['up'])
|
| 878 |
+
self._applied.append(migration['version'])
|
| 879 |
+
return True
|
| 880 |
+
""",
|
| 881 |
+
},
|
| 882 |
+
},
|
| 883 |
+
}
|
| 884 |
+
|
| 885 |
+
|
| 886 |
+
# ---------------------------------------------------------------------------
|
| 887 |
+
# VirtualFileSystem
|
| 888 |
+
# ---------------------------------------------------------------------------
|
| 889 |
+
|
| 890 |
+
class EditRecord:
    """A single line replacement performed on the virtual file system.

    ``line_idx`` is stored 0-based; ``to_dict`` reports it to callers as a
    1-based ``line_number``.
    """

    __slots__ = ("step", "service", "filename", "line_idx", "old_code", "new_code")

    def __init__(self, step, service, filename, line_idx, old_code, new_code):
        # The positional arguments arrive in exactly the order of __slots__.
        values = (step, service, filename, line_idx, old_code, new_code)
        for attr, value in zip(EditRecord.__slots__, values):
            setattr(self, attr, value)

    def to_dict(self):
        """Return the record as a plain dict, with the line index made 1-based."""
        payload = {}
        for attr in EditRecord.__slots__:
            if attr == "line_idx":
                # Externally lines are numbered from 1.
                payload["line_number"] = self.line_idx + 1
            else:
                payload[attr] = getattr(self, attr)
        return payload
|
| 910 |
+
|
| 911 |
+
|
| 912 |
+
class VirtualFileSystem:
    """In-memory multi-service file system with history tracking.

    Files are held in ``self._files`` as ``{service: {filename: content}}``.
    Every successful ``edit_line`` call appends an ``EditRecord`` to
    ``self._history``; the history, blame and diff views are derived from
    that list.
    """

    def __init__(self):
        self._files: Dict[str, Dict[str, str]] = {}
        self._history: List[EditRecord] = []
        self._task_id: int = 0

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def reset(self, task_id: int) -> None:
        """Load the buggy snapshot for a specific task.

        Clears the edit history, then rebuilds the file table from
        ``SHARED_FILES`` overlaid with ``TASK_SNAPSHOTS[task_id]``. An unknown
        ``task_id`` yields the shared files only.
        """
        self._task_id = task_id
        self._history.clear()

        snapshot = TASK_SNAPSHOTS.get(task_id, {})
        # Copy each per-service mapping so later edits never touch the
        # module-level templates.
        merged: Dict[str, Dict[str, str]] = {
            service: dict(files) for service, files in SHARED_FILES.items()
        }
        # Task-specific files win over shared ones with the same name.
        for service, files in snapshot.items():
            merged.setdefault(service, {}).update(files)

        self._files = merged

    # ------------------------------------------------------------------
    # Read
    # ------------------------------------------------------------------

    def list_files(self, service: str) -> List[str]:
        """Return the sorted filenames of ``service`` (empty if unknown)."""
        return sorted(self._files.get(service, {}))

    def list_services(self) -> List[str]:
        """Return the sorted names of all loaded services."""
        return sorted(self._files)

    def read_file(self, service: str, filename: str) -> Tuple[bool, str]:
        """Return ``(found, content)``; on a miss, ``content`` is an error message."""
        content = self._files.get(service, {}).get(filename)
        if content is None:
            return False, f"File not found: {service}/{filename}"
        return True, content

    def get_file_lines(self, service: str, filename: str) -> Optional[List[str]]:
        """Return the file split into lines, or ``None`` when it does not exist."""
        found, content = self.read_file(service, filename)
        if not found:
            return None
        return content.splitlines()

    # ------------------------------------------------------------------
    # Write
    # ------------------------------------------------------------------

    def edit_line(
        self,
        service: str,
        filename: str,
        line_number: int,  # 1-based
        new_code: str,
        step: int = 0,
    ) -> Tuple[bool, str]:
        """Replace a single line (1-based). Returns ``(success, message)``.

        On success the edit is recorded in the history and the message is
        ``"ok"``. A missing file or an out-of-range line number returns
        ``(False, <reason>)`` and leaves both file and history untouched.
        """
        found, content = self.read_file(service, filename)
        if not found:
            return False, f"File not found: {service}/{filename}"

        lines = content.splitlines()
        idx = line_number - 1
        if not (0 <= idx < len(lines)):
            return False, f"Line {line_number} out of range (file has {len(lines)} lines)"

        old_code = lines[idx]
        lines[idx] = new_code
        updated = "\n".join(lines)
        # splitlines() discards the final line terminator; put it back so an
        # edit does not silently strip the file's trailing newline.
        if content.endswith("\n"):
            updated += "\n"
        self._files[service][filename] = updated

        self._history.append(
            EditRecord(step, service, filename, idx, old_code, new_code)
        )
        return True, "ok"

    # ------------------------------------------------------------------
    # History / blame
    # ------------------------------------------------------------------

    def get_edit_history(
        self,
        service: Optional[str] = None,
        filename: Optional[str] = None,
    ) -> List[dict]:
        """Return recorded edits as dicts, optionally filtered.

        A falsy ``service`` or ``filename`` disables that filter.
        """
        return [
            record.to_dict()
            for record in self._history
            if (not service or record.service == service)
            and (not filename or record.filename == filename)
        ]

    def git_blame(self, service: str, filename: str, line_number: int) -> str:
        """Describe who last changed a line.

        Returns the most recent recorded edit for that exact line, or a fixed
        "Junior AI code-gen bot" attribution when no edit was recorded for it.
        """
        idx = line_number - 1
        # Walk the history newest-first and stop at the first hit.
        last_edit = next(
            (
                r for r in reversed(self._history)
                if r.service == service and r.filename == filename and r.line_idx == idx
            ),
            None,
        )
        if last_edit is not None:
            return (
                f"Step {last_edit.step}: agent changed line {line_number} in "
                f"{service}/{filename}\n"
                f" - {last_edit.old_code!r}\n"
                f" + {last_edit.new_code!r}"
            )
        return (
            f"Line {line_number} in {service}/{filename} was last modified by: "
            "Junior AI code-gen bot (commit a3f91b2, 2026-04-23 02:14 UTC)"
        )

    def build_git_diff(self) -> Optional[str]:
        """Render all recorded edits as a diff-like text, or ``None`` if there are none."""
        if not self._history:
            return None
        chunks = [f"--- Task {self._task_id} working diff ---"]
        chunks.extend(
            f"@@ {r.service}/{r.filename} line {r.line_idx + 1} @@\n"
            f"-{r.old_code}\n"
            f"+{r.new_code}"
            for r in self._history
        )
        return "\n".join(chunks)
|
app/main.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Meta-SRE FastAPI Server – OpenEnv Standard API.
|
| 3 |
+
|
| 4 |
+
Implements the OpenEnv contract exactly:
|
| 5 |
+
POST /reset → Observation
|
| 6 |
+
POST /step → (observation, reward, done, info)
|
| 7 |
+
GET /state → Observation
|
| 8 |
+
GET /grade → EpisodeResult
|
| 9 |
+
GET /tools → tool specs (JSON Schema per tool)
|
| 10 |
+
GET /tasks → task definitions
|
| 11 |
+
GET /health → liveness probe
|
| 12 |
+
|
| 13 |
+
The /env/* routes are strict OpenEnv aliases used by openenv_client.connect().
|
| 14 |
+
"""
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
from fastapi import FastAPI, HTTPException
|
| 18 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
+
from pydantic import BaseModel
|
| 20 |
+
from typing import Any, Dict, Optional
|
| 21 |
+
|
| 22 |
+
from app.engine.manager import EpisodeManager, TASK_DEFINITIONS, DifficultyController
|
| 23 |
+
from app.models import Observation, ActionResult, EpisodeResult
|
| 24 |
+
from app.tools.definitions import TOOL_SPECS
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
# App setup
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
|
| 31 |
+
# The ASGI application; every route below is registered on this object.
app = FastAPI(
    title="Meta-SRE",
    description=(
        "OpenEnv environment: train LLM agents to act as Senior Site Reliability Engineers "
        "debugging realistic Meta production incidents."
    ),
    version="1.0.0",
)

# CORS is left fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Process-wide state: the server keeps one episode manager, so every client
# talks to the same episode.
_dc = DifficultyController()
_episode = EpisodeManager(difficulty_controller=_dc)
# Set to True by POST /reset; the other episode routes refuse to run before that.
_started = False
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
# Request / response models
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
|
| 57 |
+
class ResetRequest(BaseModel):
    # Body of POST /reset and POST /env/reset.
    # task_id selects a task (1-5); when omitted (None) the difficulty
    # controller picks one.
    task_id: Optional[int] = None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class StepRequest(BaseModel):
    # Body of POST /step and POST /env/step: one tool invocation.
    # Name of the tool to run.
    tool: str
    # Keyword arguments forwarded to the tool; defaults to no arguments.
    params: Dict[str, Any] = {}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
# Routes
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
|
| 70 |
+
@app.get("/health")
def health():
    # Liveness probe: a fixed payload that does not touch the episode state.
    payload = {"status": "ok", "version": "1.0.0"}
    return payload
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@app.get("/tools")
def list_tools():
    # Expose the tool specs together with how many there are.
    specs = TOOL_SPECS
    return {"tools": specs, "count": len(specs)}
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@app.get("/tasks")
def list_tasks():
    # Return the task definitions exported by the engine, wrapped under "tasks".
    return dict(tasks=TASK_DEFINITIONS)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@app.post("/reset", response_model=Observation)
def reset(req: ResetRequest = ResetRequest()):
    # Start a new episode and return its first observation.
    # req.task_id is forwarded as-is; None lets the episode manager choose.
    global _started
    obs = _episode.reset(task_id=req.task_id)
    # Mark the server as started only once the manager has accepted the
    # reset. If reset() raises, the flag keeps its previous value, so /step,
    # /state and /grade still answer "call /reset first" rather than
    # operating on an episode that was never initialised.
    _started = True
    return obs
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.post("/step", response_model=ActionResult)
def step(req: StepRequest):
    # Run one tool call against the current episode.
    # Answers HTTP 400 when no episode was started, or when the episode
    # manager raises RuntimeError (its message becomes the error detail).
    if not _started:
        raise HTTPException(
            status_code=400,
            detail="Episode not started. Call POST /reset first."
        )
    try:
        outcome = _episode.step(tool=req.tool, params=req.params)
    except RuntimeError as e:
        raise HTTPException(status_code=400, detail=str(e))
    return outcome
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@app.get("/state", response_model=Observation)
def get_state():
    # Return the current observation without advancing the episode counter
    # in this handler; HTTP 400 until POST /reset has been called.
    if _started:
        return _episode._build_observation()
    raise HTTPException(
        status_code=400,
        detail="Episode not started. Call POST /reset first."
    )
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@app.get("/grade", response_model=EpisodeResult)
def grade():
    # Return the episode manager's result for the current episode;
    # HTTP 400 until POST /reset has been called.
    if _started:
        return _episode.get_episode_result()
    raise HTTPException(
        status_code=400,
        detail="Episode not started. Call POST /reset first."
    )
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
# OpenEnv compatibility shim (env.reset / env.step / env.grade)
|
| 130 |
+
# ---------------------------------------------------------------------------
|
| 131 |
+
|
| 132 |
+
@app.post("/env/reset")
def env_reset(req: ResetRequest = ResetRequest()):
    """OpenEnv spec alias for /reset."""
    # Delegate to the primary handler so both routes share one code path.
    first_observation = reset(req)
    return first_observation
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
@app.post("/env/step")
def env_step(req: StepRequest):
    """
    OpenEnv standard step — returns the canonical 4-tuple:
    (observation, reward, done, info)
    This is what openenv_client.connect().step() unpacks.
    """
    # Reuse the /step handler (including its HTTP 400 handling), then reshape
    # its ActionResult into observation / reward / done / info.
    result: ActionResult = step(req)
    info = {
        "tool": result.tool,
        "output": result.output,
        "episode_id": _episode._incident_id,
        "step": _episode._step,
        # Clamp at zero so a negative budget is never reported.
        "budget_remaining": max(0, _episode._build_observation().budget_remaining),
    }
    return {
        "observation": result.observation,
        "reward": result.reward_delta,
        "done": result.done,
        "info": info,
    }
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ---------------------------------------------------------------------------
|
| 161 |
+
# Dev entry point
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
|
| 164 |
+
if __name__ == "__main__":
    # Development entry point: uvicorn is imported only when the module is
    # executed directly, and the server runs with auto-reload on port 8000.
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
|
app/models.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from typing import Dict, List, Optional, Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ServiceMetrics(BaseModel):
    # Metric snapshot for one service.
    cpu_percent: float
    memory_mb: float
    # Errors per second.
    error_rate: float
    p99_latency_ms: float
    request_queue: int
    last_deploy: str
    # One of: healthy | degraded | critical | down (not enforced by the type).
    status: str
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Alert(BaseModel):
    # One alert raised against a service.
    alert_id: str
    # One of: P0 | P1 | P2 (not enforced by the type).
    severity: str
    service: str
    message: str
    # Episode step at which the alert fired.
    triggered_at_step: int
    # Flags alerts marked as red herrings; defaults to False.
    is_red_herring: bool = False
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class FileView(BaseModel):
    # A file identified by service and filename, with its text and line count.
    service: str
    filename: str
    content: str
    total_lines: int
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class EditRecord(BaseModel):
    # One line replacement: where it happened and the text before and after.
    step: int
    service: str
    filename: str
    line_number: int
    old_code: str
    new_code: str
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class TestResult(BaseModel):
    # Outcome of one test-suite run.
    # One of: unit | integration | load | security (not enforced by the type).
    suite: str
    passed: bool
    output: str
    # Error messages; each instance gets its own empty list by default.
    errors: List[str] = Field(default_factory=list)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Observation(BaseModel):
    # Observation payload returned by the /reset and /state routes and
    # embedded in ActionResult.
    step: int
    incident_id: str
    # Metrics keyed by name (presumably the service name; confirm with the producer).
    system_metrics: Dict[str, ServiceMetrics]
    active_alerts: List[Alert]
    # Optional file view; None when absent.
    open_file: Optional[FileView] = None
    terminal_output: str
    # Optional diff text; None when absent.
    git_diff: Optional[str] = None
    dependency_graph: Dict[str, List[str]]
    # Defaults to a fresh empty list per instance.
    sre_memory: List[str] = Field(default_factory=list)
    budget_remaining: int
    task_id: int
    task_description: str
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class ActionRequest(BaseModel):
    # A tool invocation: the tool name plus its keyword arguments.
    tool: str
    # Defaults to a fresh empty dict per instance.
    params: Dict[str, Any] = Field(default_factory=dict)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class ActionResult(BaseModel):
    # Result of one tool call.
    tool: str
    # Tool output; its shape is not constrained by the model.
    output: Any
    # Reward change attributed to this call.
    reward_delta: float
    # True when the episode has ended.
    done: bool
    # Observation accompanying the result.
    observation: Observation
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class EpisodeResult(BaseModel):
    # Summary of an episode, as served by the /grade route.
    incident_id: str
    task_id: int
    steps_taken: int
    total_reward: float
    # Expected range 0.0 – 1.0 (not enforced by the type).
    normalized_score: float
    tests_passed: bool
    incident_report_accuracy: float
    fixed_within_sla: bool
    tool_call_log: List[Dict[str, Any]]
    # Consumed by the DifficultyController.
    weakness_tags: List[str]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class IncidentReport(BaseModel):
    # Incident report fields: cause, fix, affected services and severity.
    root_cause: str
    fix_applied: str
    services_affected: List[str]
    # One of: P0 | P1 | P2 (not enforced by the type).
    severity_classification: str
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class DifficultyState(BaseModel):
    # Five float values, one per bug category, each starting at 0.5.
    # NOTE(review): presumably per-category difficulty levels maintained by
    # the DifficultyController; confirm against app.engine.manager.
    async_bugs: float = 0.5
    data_corruption: float = 0.5
    security_bugs: float = 0.5
    cascading_failures: float = 0.5
    red_herrings: float = 0.5
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/ad_ranking/__init__.py
ADDED
|
File without changes
|
app/services/capi_pipeline/__init__.py
ADDED
|
File without changes
|
app/services/whatsapp_sync/__init__.py
ADDED
|
File without changes
|
app/tools/__init__.py
ADDED
|
File without changes
|
app/tools/definitions.py
ADDED
|
@@ -0,0 +1,761 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
All 10 agent tools — implemented as plain Python functions wrapped in ToolDispatcher.
|
| 3 |
+
|
| 4 |
+
Each tool returns (reward_delta: float, done: bool, output: Any).
|
| 5 |
+
The EpisodeManager calls ToolDispatcher.dispatch(tool, params).
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
import re
|
| 9 |
+
from typing import TYPE_CHECKING, Any, Dict, Tuple
|
| 10 |
+
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from app.engine.manager import EpisodeManager
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ---------------------------------------------------------------------------
|
| 16 |
+
# Schema exposed to the LLM (OpenEnv tool_spec format)
|
| 17 |
+
# ---------------------------------------------------------------------------
|
| 18 |
+
|
| 19 |
+
def _tool_spec(name, description, properties, required):
    """Assemble one tool entry: name, description and an object-typed parameter schema."""
    return {
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


# The ten tools exposed to the agent, in the order they are advertised.
TOOL_SPECS = [
    _tool_spec(
        "view_file",
        "Read the contents of a file in a service codebase. "
        "Use this BEFORE editing to understand the code.",
        {
            "service": {"type": "string", "enum": ["ad_ranking", "capi_pipeline", "whatsapp_sync"]},
            "filename": {"type": "string", "description": "e.g. ranker.py"},
        },
        ["service", "filename"],
    ),
    _tool_spec(
        "edit_line",
        "Replace a single line in a file. SURGICAL edits only — "
        "do NOT rewrite whole functions. One line at a time.",
        {
            "service": {"type": "string"},
            "filename": {"type": "string"},
            "line_number": {"type": "integer", "description": "1-based line number"},
            "new_code": {"type": "string", "description": "Replacement line (preserve indentation)"},
        },
        ["service", "filename", "line_number", "new_code"],
    ),
    _tool_spec(
        "run_tests",
        "Execute a test suite for a service. "
        "suite options: 'unit' (fast, 1 step), "
        "'integration' (2 steps), 'load' (3 steps), 'security' (2 steps).",
        {
            "service": {"type": "string"},
            "suite": {
                "type": "string",
                "enum": ["unit", "integration", "load", "security"],
                "default": "unit",
            },
        },
        ["service"],
    ),
    _tool_spec(
        "check_dependency",
        "Show the data-flow relationship between two services.",
        {
            "service_a": {"type": "string"},
            "service_b": {"type": "string"},
        },
        ["service_a", "service_b"],
    ),
    _tool_spec(
        "read_logs",
        "Pull recent logs for a service filtered by log level.",
        {
            "service": {"type": "string"},
            "log_level": {
                "type": "string",
                "enum": ["ERROR", "WARN", "INFO", "DEBUG"],
                "default": "ERROR",
            },
            "last_n_lines": {"type": "integer", "default": 20},
        },
        ["service"],
    ),
    _tool_spec(
        "git_blame",
        "Find who/what last changed a specific line — reveals AI-generated code.",
        {
            "service": {"type": "string"},
            "filename": {"type": "string"},
            "line_number": {"type": "integer"},
        },
        ["service", "filename", "line_number"],
    ),
    _tool_spec(
        "rollback",
        "Roll back a service's database migration by version string. "
        "HIGH COST — use only when a bad migration is the root cause.",
        {
            "service": {"type": "string"},
            "version": {"type": "string", "description": "Migration version, e.g. '003'"},
        },
        ["service", "version"],
    ),
    _tool_spec(
        "query_metrics_history",
        "Show how a metric changed over time — reveals when the problem started.",
        {
            "service": {"type": "string"},
            "metric": {
                "type": "string",
                "enum": ["cpu_percent", "memory_mb", "error_rate",
                         "p99_latency_ms", "request_queue"],
            },
            "hours_back": {"type": "integer", "default": 6},
        },
        ["service", "metric"],
    ),
    _tool_spec(
        "ask_senior_sre",
        "Ask the on-call Senior SRE for a hint. "
        "Costs 2 reward steps. Use when genuinely stuck.",
        {
            "question": {"type": "string"},
        },
        ["question"],
    ),
    _tool_spec(
        "write_incident_report",
        "Close the incident by submitting a post-mortem report. "
        "MUST be called after fixing the bug to end the episode.",
        {
            "root_cause": {"type": "string"},
            "fix_applied": {"type": "string"},
            "services_affected": {"type": "array", "items": {"type": "string"}},
            "severity_classification": {"type": "string", "enum": ["P0", "P1", "P2"]},
        },
        ["root_cause", "fix_applied", "services_affected", "severity_classification"],
    ),
]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ---------------------------------------------------------------------------
|
| 173 |
+
# Per-task contextual log data
|
| 174 |
+
# ---------------------------------------------------------------------------
|
| 175 |
+
|
| 176 |
+
# Canned log excerpts returned by the `read_logs` tool.
# Layout: task_id -> service name -> log level -> pre-rendered log text.
# A (service, level) pair without an entry makes ToolDispatcher._read_logs fall
# back to a generic "No <level> logs for <service>" message.
_TASK_LOGS: Dict[int, Dict[str, Dict[str, str]]] = {
    # Task 1: AttributeError in ad_ranking (method call on a plain dict).
    1: {
        "ad_ranking": {
            "ERROR": (
                "[2026-04-24 03:14:21] ERROR ad_ranking.ranker: "
                "AttributeError: 'dict' object has no attribute 'get_clicks'\n"
                " File ranker.py, line 22, in score_ads\n"
                " click_rate = ad.get_clicks() / max(ad.get('impressions', 1), 1)\n"
                "[2026-04-24 03:14:22] ERROR ad_ranking.ranker: same error (x487 in last 60s)"
            ),
            "DEBUG": (
                "[2026-04-24 03:14:20] DEBUG ad_ranking.ranker: fetch_candidate_ads returned 12 ads\n"
                "[2026-04-24 03:14:21] DEBUG ad_ranking.ranker: entering score_ads with 12 ads\n"
                "[2026-04-24 03:14:21] DEBUG ad_ranking.ranker: processing ad_id=ad_001 — CRASH"
            ),
        },
    },
    # Task 2: timestamp corruption in capi_pipeline that surfaces downstream
    # in ad_ranking attribution.
    2: {
        "capi_pipeline": {
            "WARN": (
                "[2026-04-24 03:00:05] WARN capi_pipeline.transformer: "
                "event_time 1700000000 converted to 1700000 — check threshold\n"
                "[2026-04-24 03:00:05] WARN capi_pipeline.transformer: "
                "event_time 1745392000 converted to 1745392 — data from 1970-01-20"
            ),
            "DEBUG": (
                "[2026-04-24 02:14:03] DEBUG capi_pipeline.transformer: "
                "_normalize_timestamp called with ts=1700000000\n"
                "[2026-04-24 02:14:03] DEBUG capi_pipeline.transformer: "
                "ts > 1_000_000_000 → True, returning ts // 1000 = 1700000\n"
                "[2026-04-24 02:14:03] DEBUG capi_pipeline.transformer: "
                "EXPECTED: ts > 1_000_000_000_000 for millisecond timestamps"
            ),
            # The ERROR channel deliberately carries an INFO line: the
            # pipeline reports no errors for this task.
            "ERROR": "[2026-04-24 03:00:00] INFO capi_pipeline: No errors — pipeline healthy",
        },
        "ad_ranking": {
            "WARN": (
                "[2026-04-24 03:01:00] WARN ad_ranking.ranker: "
                "ROAS attribution window: events from 1970-01-20 (expected: 2023+)\n"
                "[2026-04-24 03:01:01] WARN ad_ranking.attribution: "
                "Conversion events all timestamped <86400 (one day in 1970)"
            ),
        },
    },
    # Task 3: DB connection pool exhaustion in whatsapp_sync.
    3: {
        "whatsapp_sync": {
            "ERROR": (
                "[2026-04-24 03:10:00] ERROR whatsapp_sync.handler: "
                "asyncpg.exceptions.TooManyConnectionsError: pool exhausted\n"
                "[2026-04-24 03:10:02] ERROR whatsapp_sync.handler: "
                "sync_user_messages acquire() blocked for user_id=8841923\n"
                "[2026-04-24 03:10:05] ERROR whatsapp_sync.handler: "
                "490/500 connections allocated — 0 available"
            ),
            "DEBUG": (
                "[2026-04-24 03:09:00] DEBUG whatsapp_sync.handler: "
                "sync_user_messages — db_pool.acquire() called\n"
                "[2026-04-24 03:09:00] DEBUG whatsapp_sync.handler: "
                "sync_user_messages — conn acquired, fetching messages\n"
                "[2026-04-24 03:09:00] DEBUG whatsapp_sync.handler: "
                "sync_user_messages — messages fetched, returning\n"
                "NOTE: No 'release' log line — connection never returned to pool"
            ),
        },
    },
    # Task 4: circular-FK migration in whatsapp_sync; the ad_ranking and
    # capi_pipeline entries are symptoms that point back to it.
    4: {
        "whatsapp_sync": {
            "ERROR": (
                "[2026-04-24 02:14:31] ERROR whatsapp_sync.db: "
                "asyncpg.ForeignKeyViolationError: "
                "insert into user_preferences violates FK constraint\n"
                "[2026-04-24 02:14:31] ERROR whatsapp_sync.db: "
                "migration 003 failed — circular FK: messages ↔ message_threads\n"
                "[2026-04-24 02:14:31] ERROR whatsapp_sync.db: "
                "ALTER TABLE messages failed — message_threads.id referenced before table commit"
            ),
        },
        "ad_ranking": {
            "ERROR": (
                "[2026-04-24 02:15:00] ERROR ad_ranking: "
                "DB pool returning FK violation errors from upstream\n"
                "[2026-04-24 02:15:01] WARN ad_ranking: "
                "This is a SYMPTOM — root cause is in whatsapp_sync migration"
            ),
        },
        "capi_pipeline": {
            "WARN": (
                "[2026-04-24 02:15:00] WARN capi_pipeline: "
                "Event association latency +340ms — DB pool contention\n"
                "[2026-04-24 02:15:00] WARN capi_pipeline: "
                "This is a SYMPTOM — root cause is in whatsapp_sync migration"
            ),
        },
    },
    # Task 5: DEBUG_MODE leaking raw payloads (PII) from the CAPI ingestor.
    5: {
        "capi_pipeline": {
            "DEBUG": (
                "[2026-04-24 02:00:00] DEBUG capi_pipeline.ingestor: "
                "DEBUG_MODE=True — including raw payload in response\n"
                "[2026-04-24 02:00:00] DEBUG capi_pipeline.ingestor: "
                "Response size: 14,382 bytes (expected ~48 bytes)\n"
                "[2026-04-24 02:00:01] DEBUG capi_pipeline.ingestor: "
                "debug_data.user_emails contains plaintext email fields"
            ),
            # The ERROR channel carries an INFO line here as well: nothing
            # is reported as failing for this task.
            "ERROR": "[2026-04-24 02:00:00] INFO capi_pipeline: No errors — unit tests all pass",
        },
    },
}
|
| 284 |
+
|
| 285 |
+
# Canned metric series returned by the `query_metrics_history` tool.
# Keys are "<service>:<metric>"; each value is a list of (sample_index, value)
# pairs in chronological order.  ToolDispatcher._query_metrics_history only
# reads the value of each pair and derives the time label from the position.
_METRICS_HISTORY: Dict[str, list] = {
    # Jumps from 0.0 to ~12 at sample 3.
    "ad_ranking:error_rate": [
        (0, 0.0), (1, 0.0), (2, 0.0), (3, 12.3), (4, 12.1), (5, 11.9),
    ],
    # Flat at 0.0 for every sample.
    "capi_pipeline:error_rate": [
        (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0),
    ],
    # Grows at every sample.
    "whatsapp_sync:memory_mb": [
        (0, 200), (1, 250), (2, 350), (3, 500), (4, 800), (5, 1200),
    ],
    # Grows at every sample.
    "whatsapp_sync:request_queue": [
        (0, 5), (1, 45), (2, 130), (3, 280), (4, 420), (5, 490),
    ],
    # Jumps from ~20 to ~1100 at sample 3.
    "capi_pipeline:p99_latency_ms": [
        (0, 20), (1, 20), (2, 21), (3, 1100), (4, 1080), (5, 1090),
    ],
}
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# ---------------------------------------------------------------------------
|
| 305 |
+
# Senior SRE hints — contextually aware
|
| 306 |
+
# ---------------------------------------------------------------------------
|
| 307 |
+
|
| 308 |
+
def _senior_sre_hint(task_id: int, question: str, sre_memory: list, step: int) -> str:
|
| 309 |
+
question_lower = question.lower()
|
| 310 |
+
memory_text = " ".join(sre_memory).lower()
|
| 311 |
+
|
| 312 |
+
if task_id == 1:
|
| 313 |
+
if "get_clicks" in memory_text or "attributeerror" in question_lower:
|
| 314 |
+
return (
|
| 315 |
+
"Senior SRE: The AttributeError is very specific — 'dict' has no method "
|
| 316 |
+
"'get_clicks'. The ad objects coming from the API are plain Python dicts. "
|
| 317 |
+
"You need dict accessor syntax, not method call syntax. "
|
| 318 |
+
"Check line 22 of ranker.py."
|
| 319 |
+
)
|
| 320 |
+
return (
|
| 321 |
+
"Senior SRE: Look at the stack trace carefully. "
|
| 322 |
+
"The error is on the line that computes click_rate. "
|
| 323 |
+
"How are you accessing the 'clicks' field on the ad object?"
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
if task_id == 2:
|
| 327 |
+
if "timestamp" in memory_text or "1970" in question_lower or "normalize" in question_lower:
|
| 328 |
+
return (
|
| 329 |
+
"Senior SRE: The timestamp normalisation logic has an off-by-three-orders-of-magnitude "
|
| 330 |
+
"bug. A Unix second timestamp is ~10 digits. A Unix millisecond timestamp is ~13 digits. "
|
| 331 |
+
"The condition in _normalize_timestamp() uses the wrong threshold. "
|
| 332 |
+
"What number has 13 digits?"
|
| 333 |
+
)
|
| 334 |
+
if "capi" in memory_text or "capi" in question_lower:
|
| 335 |
+
return (
|
| 336 |
+
"Senior SRE: You're on the right track — CAPI is the data source for ad attribution. "
|
| 337 |
+
"Check the transformer.py file. The event_time values being emitted are wrong — "
|
| 338 |
+
"they look like they're in 1970. Where does event_time get processed?"
|
| 339 |
+
)
|
| 340 |
+
return (
|
| 341 |
+
"Senior SRE: The ad ranking ROAS drop is NOT a ranking algorithm bug. "
|
| 342 |
+
"The ranking model is working correctly — it just has bad input data. "
|
| 343 |
+
"Follow the data upstream. Where do conversion events come from?"
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
if task_id == 3:
|
| 347 |
+
if "finally" in memory_text or "release" in question_lower or "pool" in question_lower:
|
| 348 |
+
return (
|
| 349 |
+
"Senior SRE: Yes — the connection pool is exhausted because connections are "
|
| 350 |
+
"acquired but never released. In Python async code, you MUST release connections "
|
| 351 |
+
"in a 'finally' block, otherwise an exception will skip the release call entirely. "
|
| 352 |
+
"Add: finally: await self.db_pool.release(conn)"
|
| 353 |
+
)
|
| 354 |
+
return (
|
| 355 |
+
"Senior SRE: The load test shows pool exhaustion. "
|
| 356 |
+
"Under normal load each sync call is short so you don't notice. "
|
| 357 |
+
"Under high load, the missing resource cleanup compounds. "
|
| 358 |
+
"Look at how sync_user_messages() handles its DB connection lifecycle."
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
if task_id == 4:
|
| 362 |
+
if step > 3 and "migration" not in memory_text:
|
| 363 |
+
return (
|
| 364 |
+
"Senior SRE (impatient): Stop chasing symptoms! All three services degraded "
|
| 365 |
+
"simultaneously at 02:14 UTC — that's when the last deploy landed. "
|
| 366 |
+
"Check the DB migration logs. The whatsapp_sync service ran a new migration "
|
| 367 |
+
"at that exact time. Look at db.py migration 003."
|
| 368 |
+
)
|
| 369 |
+
if "migration" in memory_text or "003" in question_lower:
|
| 370 |
+
return (
|
| 371 |
+
"Senior SRE: Good, you found migration 003. Look at what it does: "
|
| 372 |
+
"it adds a column 'thread_id' to messages that references message_threads. "
|
| 373 |
+
"But message_threads also references messages. "
|
| 374 |
+
"That's a circular FK — PostgreSQL can't resolve the constraint. "
|
| 375 |
+
"Remove the ALTER TABLE statement from migration 003."
|
| 376 |
+
)
|
| 377 |
+
return (
|
| 378 |
+
"Senior SRE: Three services failing simultaneously at 02:14 UTC is not a coincidence. "
|
| 379 |
+
"Check the deploy logs and DB migration history for that exact timestamp."
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
if task_id == 5:
|
| 383 |
+
if "debug" in memory_text or "pii" in question_lower or "response" in question_lower:
|
| 384 |
+
return (
|
| 385 |
+
"Senior SRE: DEBUG_MODE = True should never reach production. "
|
| 386 |
+
"Find that flag in the ingestor and disable it. "
|
| 387 |
+
"The security test will verify the response body no longer contains 'debug_data'."
|
| 388 |
+
)
|
| 389 |
+
if step > 2:
|
| 390 |
+
return (
|
| 391 |
+
"Senior SRE: The unit tests pass — that's the trap. This is a security bug "
|
| 392 |
+
"invisible to unit tests. Run the security test suite instead: "
|
| 393 |
+
"run_tests('capi_pipeline', 'security'). "
|
| 394 |
+
"Read the DEBUG logs for the ingestor — you'll see the response sizes."
|
| 395 |
+
)
|
| 396 |
+
return (
|
| 397 |
+
"Senior SRE: Something is leaking data in the CAPI ingestor. "
|
| 398 |
+
"Response sizes are 70x larger than expected. "
|
| 399 |
+
"What conditions cause a larger response body?"
|
| 400 |
+
)
|
| 401 |
+
|
| 402 |
+
return "Senior SRE: Check the logs and follow the data flow upstream."
|
| 403 |
+
|
| 404 |
+
|
| 405 |
+
# ---------------------------------------------------------------------------
|
| 406 |
+
# Tool Dispatcher
|
| 407 |
+
# ---------------------------------------------------------------------------
|
| 408 |
+
|
| 409 |
+
class ToolDispatcher:
    """Route agent tool calls to their handlers for one episode.

    Every handler receives the raw ``params`` dict of the tool call and
    returns ``(reward_delta, done, output)``.  ``write_incident_report`` is
    the only handler that can return ``done=True``.
    """

    # Names accepted by ``dispatch``; each maps to the method ``_<name>``.
    _TOOL_NAMES = frozenset({
        "view_file",
        "edit_line",
        "run_tests",
        "check_dependency",
        "read_logs",
        "git_blame",
        "rollback",
        "query_metrics_history",
        "ask_senior_sre",
        "write_incident_report",
    })

    # (service, filename) that holds the root cause of each task.
    _ROOT_CAUSE_FILES = {
        1: ("ad_ranking", "ranker.py"),
        2: ("capi_pipeline", "transformer.py"),
        3: ("whatsapp_sync", "handler.py"),
        4: ("whatsapp_sync", "db.py"),
        5: ("capi_pipeline", "ingestor.py"),
    }

    # Step budget per task used for the "fixed within SLA" terminal bonus.
    _SLA_BUDGETS = {1: 15, 2: 20, 3: 20, 4: 25, 5: 20}
    _DEFAULT_SLA = 20

    # Extra reward cost added on top of the step reward per test suite.
    _SUITE_COSTS = {"unit": 0, "integration": -0.1, "load": -0.2, "security": -0.1}

    # Filename fragments that identify test files, which may not be edited.
    _PROTECTED_MARKERS = ("tests/", "test_", "_test.py", "conftest.py")

    # Position in each metrics series that the "spike" note refers to.
    _SPIKE_SAMPLE_INDEX = 3

    def __init__(self, episode: "EpisodeManager"):
        self.ep = episode

    def dispatch(self, tool: str, params: Dict[str, Any]) -> Tuple[float, bool, Any]:
        """Route to the correct tool. Returns (reward_delta, done, output)."""
        if tool not in self._TOOL_NAMES:
            r = self.ep.reward.step_reward(tool)
            return r, False, f"Unknown tool: {tool}"
        # A missing params object is treated as "no parameters" so handlers
        # can rely on ``p.get`` instead of crashing on ``None``.
        return getattr(self, f"_{tool}")(params if params is not None else {})

    @staticmethod
    def _as_int(value: Any, default: int) -> int:
        """Coerce ``value`` to int, falling back to ``default`` when it is not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    # ------------------------------------------------------------------
    # 1. view_file
    # ------------------------------------------------------------------
    def _view_file(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return a file from the episode VFS with 1-based line numbers.

        Opening the task's root-cause file adds the ``file_found`` progress
        reward and records a memory note.
        """
        service = p.get("service", "")
        filename = p.get("filename", "")
        found, content = self.ep.vfs.read_file(service, filename)

        if not found:
            # On failure ``content`` carries the VFS error message.
            r = self.ep.reward.step_reward("view_file")
            return r, False, {"error": content}

        lines = content.splitlines()
        numbered = "\n".join(f"{i+1:4d} {line}" for i, line in enumerate(lines))

        r = self.ep.reward.step_reward("view_file")
        # Intermediate reward: opening the right file
        if (service, filename) == self._ROOT_CAUSE_FILES.get(self.ep._task_id):
            r += self.ep.reward.progress_reward("file_found")
            self.ep.add_memory(f"opened root-cause file {service}/{filename}")

        return r, False, {
            "service": service,
            "filename": filename,
            "total_lines": len(lines),
            "content": numbered,
        }

    # ------------------------------------------------------------------
    # 2. edit_line
    # ------------------------------------------------------------------
    def _edit_line(self, p: Dict) -> Tuple[float, bool, Any]:
        """Replace one line of a VFS file and flag obvious syntax mistakes.

        Edits to test files are refused and charged like a syntax error.
        """
        service = p.get("service", "")
        filename = p.get("filename", "")
        # A non-numeric line number falls back to 0 (the same value used when
        # the parameter is absent) and is left for the VFS to reject.
        line_number = self._as_int(p.get("line_number", 0), 0)
        new_code = p.get("new_code") or ""

        # Anti-cheat: SREs cannot modify test suites during an incident.
        # Prevents reward hacking (e.g. deleting asserts to make tests pass).
        if any(marker in filename for marker in self._PROTECTED_MARKERS):
            r = self.ep.reward.step_reward("edit_line", syntax_error=True)
            return r, False, {
                "error": (
                    "Error: SREs cannot modify test suites during an incident. "
                    "Fix the source code, not the tests."
                )
            }

        success, msg = self.ep.vfs.edit_line(
            service, filename, line_number, new_code, self.ep._step
        )
        if not success:
            r = self.ep.reward.step_reward("edit_line")
            return r, False, {"error": msg}

        # Syntax check: look for obvious Python syntax errors in the new line
        syntax_error = _has_syntax_error(new_code)
        r = self.ep.reward.step_reward("edit_line", syntax_error=syntax_error)

        self.ep.add_memory(
            f"edited {service}/{filename} line {line_number}: "
            f"{new_code[:60]!r}"
        )

        msg_out = f"Line {line_number} updated."
        if syntax_error:
            msg_out += " WARNING: possible syntax error detected in replacement line."

        return r, False, {"result": msg_out, "syntax_warning": syntax_error}

    # ------------------------------------------------------------------
    # 3. run_tests
    # ------------------------------------------------------------------
    def _run_tests(self, p: Dict) -> Tuple[float, bool, Any]:
        """Run a grader suite for the current task and report the result.

        The grader is invoked with the task id and suite only; ``service`` is
        used for the memory note and for marking metrics as fixed on a pass.
        """
        service = p.get("service", "")
        suite = p.get("suite", "unit")

        # Suite cost (extra step penalties); unknown suites cost nothing extra.
        extra_cost = self._SUITE_COSTS.get(suite, 0)

        passed, output, partial = self.ep.grader.run(self.ep._task_id, suite)
        r = self.ep.reward.step_reward("run_tests") + extra_cost

        self.ep._last_terminal = output
        self.ep.add_memory(f"ran {suite} tests for {service}: {'PASS' if passed else 'FAIL'}")

        if passed:
            self.ep.metrics.mark_fixed(service)
            r += self.ep.reward.progress_reward("error_drop")

        return r, False, {"passed": passed, "suite": suite, "output": output}

    # ------------------------------------------------------------------
    # 4. check_dependency
    # ------------------------------------------------------------------
    def _check_dependency(self, p: Dict) -> Tuple[float, bool, Any]:
        """Describe the dependency relationship between two services.

        Only the ``service_a`` depends on ``service_b`` direction earns the
        ``service_id`` progress reward and a memory note.
        """
        from app.engine.manager import DEPENDENCY_GRAPH
        a = p.get("service_a", "")
        b = p.get("service_b", "")
        deps_a = DEPENDENCY_GRAPH.get(a, [])
        deps_b = DEPENDENCY_GRAPH.get(b, [])

        r = self.ep.reward.step_reward("check_dependency")

        relationship = "no direct dependency"
        if b in deps_a:
            relationship = f"{a} depends on {b} (data flows: {b} → {a})"
            self.ep.add_memory(f"confirmed: {a} depends on {b}")
            r += self.ep.reward.progress_reward("service_id")
        elif a in deps_b:
            relationship = f"{b} depends on {a} (data flows: {a} → {b})"
        return r, False, {
            "service_a": a,
            "service_b": b,
            "relationship": relationship,
            f"{a}_depends_on": deps_a,
            f"{b}_depends_on": deps_b,
        }

    # ------------------------------------------------------------------
    # 5. read_logs
    # ------------------------------------------------------------------
    def _read_logs(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return canned logs for a service at a log level.

        ``last_n_lines`` (default 20) limits the output to the most recent
        lines; non-positive or non-numeric values return the full text.
        """
        service = p.get("service", "")
        log_level = p.get("log_level", "ERROR")
        n = self._as_int(p.get("last_n_lines", 20), 20)

        task_logs = _TASK_LOGS.get(self.ep._task_id, {})
        svc_logs = task_logs.get(service, {})
        log_text = svc_logs.get(log_level, f"[{log_level}] No {log_level} logs for {service}")

        # Honour the requested tail length; the text is only rebuilt when it
        # is actually longer than the limit.
        log_lines = log_text.splitlines()
        if 0 < n < len(log_lines):
            log_text = "\n".join(log_lines[-n:])

        r = self.ep.reward.step_reward("read_logs")
        self.ep.add_memory(f"read {log_level} logs for {service}")

        # Partial reward for reading the right service's debug/error logs
        root_cause = self._ROOT_CAUSE_FILES.get(self.ep._task_id)
        right_service = root_cause[0] if root_cause else None
        if service == right_service and log_level in ("DEBUG", "ERROR"):
            r += self.ep.reward.progress_reward("service_id")

        return r, False, {"service": service, "log_level": log_level, "logs": log_text}

    # ------------------------------------------------------------------
    # 6. git_blame
    # ------------------------------------------------------------------
    def _git_blame(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return the VFS blame record for one line of a file."""
        service = p.get("service", "")
        filename = p.get("filename", "")
        line_number = self._as_int(p.get("line_number", 1), 1)

        blame = self.ep.vfs.git_blame(service, filename, line_number)
        r = self.ep.reward.step_reward("git_blame")
        self.ep.add_memory(f"git blame {service}/{filename}:{line_number}")
        return r, False, {"blame": blame}

    # ------------------------------------------------------------------
    # 7. rollback
    # ------------------------------------------------------------------
    @staticmethod
    def _strip_migration_003(content: str) -> str:
        """Drop the lines from the ``version 003`` marker through the next line starting with ``}``."""
        # NOTE(review): a line holding only the opening "{" before the marker
        # is kept, which would leave an unbalanced brace if the migration
        # entry spans several lines — confirm against the real db.py layout.
        kept = []
        skipping = False
        for line in content.splitlines():
            if '"version": "003"' in line or "'version': '003'" in line:
                skipping = True
            if skipping and line.strip().startswith("}"):
                skipping = False
                continue
            if not skipping:
                kept.append(line)
        return "\n".join(kept)

    def _rollback(self, p: Dict) -> Tuple[float, bool, Any]:
        """Roll back a service version.

        Only rolling back ``whatsapp_sync`` version ``003`` during task 4 is
        accepted; every other rollback incurs ``ROLLBACK_PENALTY``.
        """
        service = p.get("service", "")
        version = p.get("version", "")

        # Only valid for Task 4 and correct service/version
        is_correct = (
            self.ep._task_id == 4 and
            service == "whatsapp_sync" and
            version == "003"
        )

        if is_correct:
            found, content = self.ep.vfs.read_file("whatsapp_sync", "db.py")
            if not found:
                # ``content`` is the VFS error message here; never write it
                # back into the file store as if it were source code.
                r = self.ep.reward.step_reward("rollback")
                return r, False, {"error": content}

            # Remove the circular FK from the VFS (simulate rollback).
            # This writes the VFS file table directly rather than going
            # through ``edit_line``.
            self.ep.vfs._files["whatsapp_sync"]["db.py"] = self._strip_migration_003(content)
            self.ep.metrics.mark_fixed("whatsapp_sync")
            self.ep.metrics.mark_fixed("ad_ranking")
            self.ep.metrics.mark_fixed("capi_pipeline")
            self.ep.add_memory("rolled back migration 003 — circular FK removed")
            r = self.ep.reward.step_reward("rollback")
            r += self.ep.reward.progress_reward("error_drop")
            return r, False, {
                "result": "Migration 003 rolled back successfully. All three services recovering."
            }

        # Wrong rollback — penalise
        r = self.ep.reward.step_reward("rollback", syntax_error=False)
        r += self.ep.reward.ROLLBACK_PENALTY  # extra penalty via RewardManager field
        return r, False, {
            "error": (
                f"Rollback of {service} v{version} either unnecessary or incorrect. "
                "Verify the root cause before rolling back."
            )
        }

    # ------------------------------------------------------------------
    # 8. query_metrics_history
    # ------------------------------------------------------------------
    def _query_metrics_history(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return the most recent samples of a canned metric series.

        ``hours_back`` (default 6, minimum 1) selects how many of the latest
        samples are shown.  Rows are labelled by their distance from the end
        of the series, so the newest sample is always ``T-1h`` and labels can
        never become negative.
        """
        service = p.get("service", "")
        metric = p.get("metric", "")
        hours_back = max(1, self._as_int(p.get("hours_back", 6), 6))

        key = f"{service}:{metric}"
        history = _METRICS_HISTORY.get(key, [])

        r = self.ep.reward.step_reward("query_metrics_history")
        self.ep.add_memory(f"queried {metric} history for {service}")

        if not history:
            return r, False, {
                "service": service,
                "metric": metric,
                "history": "No historical data for this metric combination.",
            }

        total = len(history)
        first = max(0, total - hours_back)
        table = "\n".join(
            f" T-{total - idx}h: {history[idx][1]}" for idx in range(first, total)
        )

        # Only claim a spike when the reference sample is inside the window
        # and its value actually rises above the preceding sample.
        spike = self._SPIKE_SAMPLE_INDEX
        has_spike = (
            0 < spike < total
            and first <= spike
            and history[spike][1] > history[spike - 1][1]
        )
        if has_spike:
            note = f"Spike visible at T-{total - spike}h (correlates with 02:14 UTC deploy)"
        else:
            note = "No spike visible in the requested window."

        return r, False, {
            "service": service,
            "metric": metric,
            "history": table,
            "note": note,
        }

    # ------------------------------------------------------------------
    # 9. ask_senior_sre
    # ------------------------------------------------------------------
    def _ask_senior_sre(self, p: Dict) -> Tuple[float, bool, Any]:
        """Return a contextual hint; costs twice the normal step reward."""
        question = p.get("question") or ""

        hint = _senior_sre_hint(
            self.ep._task_id,
            question,
            self.ep._sre_memory,
            self.ep._step,
        )

        # 2-step penalty
        r = self.ep.reward.step_reward("ask_senior_sre") * 2
        self.ep.add_memory(f"asked senior SRE: {question[:60]}")
        return r, False, {"hint": hint}

    # ------------------------------------------------------------------
    # 10. write_incident_report
    # ------------------------------------------------------------------
    def _write_incident_report(self, p: Dict) -> Tuple[float, bool, Any]:
        """File the incident report, compute the terminal reward and end the episode.

        A report that fails model validation is rejected with an error and
        does not end the episode, so the agent can resubmit.
        """
        from app.models import IncidentReport

        try:
            report = IncidentReport(
                root_cause=p.get("root_cause", ""),
                fix_applied=p.get("fix_applied", ""),
                services_affected=p.get("services_affected", []),
                severity_classification=p.get("severity_classification", "P1"),
            )
        except (ValueError, TypeError) as exc:
            # pydantic's ValidationError derives from ValueError.
            r = self.ep.reward.step_reward("write_incident_report")
            return r, False, {"error": f"Invalid incident report: {exc}"}
        self.ep._incident_report = report

        report_accuracy = self.ep.grader.grade_incident_report(self.ep._task_id, report)
        sla = self._SLA_BUDGETS.get(self.ep._task_id, self._DEFAULT_SLA)
        within_sla = self.ep._step <= sla

        # Check if tests actually passed
        passed, _, _ = self.ep.grader.run(self.ep._task_id)
        # The same grader result doubles as the regression signal.
        no_regressions = passed

        r = self.ep.reward.step_reward("write_incident_report")
        r += self.ep.reward.terminal_reward(
            tests_passed=passed,
            report_accuracy=report_accuracy,
            fixed_within_sla=within_sla,
            no_regressions=no_regressions,
            task_id=self.ep._task_id,
        )

        summary = (
            f"Incident {self.ep._incident_id} closed.\n"
            f"Report accuracy: {report_accuracy:.0%}\n"
            f"Tests passed: {passed}\n"
            f"Within SLA: {within_sla}\n"
            f"Normalized score: {self.ep.reward.normalized_score():.3f}"
        )

        # Update difficulty controller
        self.ep.dc.update(self.ep._task_id, self.ep.reward.normalized_score())

        return r, True, {"summary": summary, "report_accuracy": report_accuracy}
|
| 744 |
+
|
| 745 |
+
|
| 746 |
+
# ---------------------------------------------------------------------------
|
| 747 |
+
# Helpers
|
| 748 |
+
# ---------------------------------------------------------------------------
|
| 749 |
+
|
| 750 |
+
def _has_syntax_error(line: str) -> bool:
|
| 751 |
+
"""Quick heuristic check for obvious Python syntax mistakes in a single line."""
|
| 752 |
+
stripped = line.strip()
|
| 753 |
+
# Unmatched brackets
|
| 754 |
+
for open_, close_ in [("(", ")"), ("[", "]"), ("{", "}")]:
|
| 755 |
+
if stripped.count(open_) != stripped.count(close_):
|
| 756 |
+
return True
|
| 757 |
+
# Ends with lone colon inside dict/call (not a block statement)
|
| 758 |
+
# Detect obvious incomplete assignments
|
| 759 |
+
if re.search(r"=\s*$", stripped):
|
| 760 |
+
return True
|
| 761 |
+
return False
|
openenv.yaml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: meta-sre
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
OpenEnv environment for training LLM agents to act as Senior SREs.
|
| 5 |
+
Simulates real Meta production incidents across 3 interconnected services
|
| 6 |
+
with 5 difficulty levels, 10 engineering tools, and a self-improving
|
| 7 |
+
difficulty controller (Theme 4: Self-Improvement).
|
| 8 |
+
|
| 9 |
+
author: Meta-SRE Hackathon Team (Bhavya + Anvit)
|
| 10 |
+
license: MIT
|
| 11 |
+
|
| 12 |
+
endpoints:
|
| 13 |
+
base_url: http://localhost:8000  # NOTE: the bundled Dockerfile serves on port 7860 — use that port when targeting the container
|
| 14 |
+
reset: POST /reset
|
| 15 |
+
step: POST /step
|
| 16 |
+
state: GET /state
|
| 17 |
+
grade: GET /grade
|
| 18 |
+
tools: GET /tools
|
| 19 |
+
|
| 20 |
+
observation_space:
|
| 21 |
+
type: object
|
| 22 |
+
fields:
|
| 23 |
+
- step: integer
|
| 24 |
+
- incident_id: string
|
| 25 |
+
- system_metrics: object # {service: ServiceMetrics}
|
| 26 |
+
- active_alerts: array # List[Alert]
|
| 27 |
+
- open_file: object # FileView | null
|
| 28 |
+
- terminal_output: string
|
| 29 |
+
- git_diff: string # null if no edits yet
|
| 30 |
+
- dependency_graph: object
|
| 31 |
+
- sre_memory: array # agent's working notes
|
| 32 |
+
- budget_remaining: integer # steps before SLA breach
|
| 33 |
+
|
| 34 |
+
action_space:
|
| 35 |
+
type: tool_call
|
| 36 |
+
tools:
|
| 37 |
+
- view_file
|
| 38 |
+
- edit_line
|
| 39 |
+
- run_tests
|
| 40 |
+
- check_dependency
|
| 41 |
+
- read_logs
|
| 42 |
+
- git_blame
|
| 43 |
+
- rollback
|
| 44 |
+
- query_metrics_history
|
| 45 |
+
- ask_senior_sre
|
| 46 |
+
- write_incident_report
|
| 47 |
+
|
| 48 |
+
reward:
|
| 49 |
+
step_penalty: -0.1
|
| 50 |
+
syntax_error_penalty: -0.5
|
| 51 |
+
rollback_penalty: -1.0
|
| 52 |
+
senior_sre_penalty: -0.2
|
| 53 |
+
terminal_tests_pass: +1.0
|
| 54 |
+
terminal_report_max: +0.5
|
| 55 |
+
terminal_sla_bonus: +0.3
|
| 56 |
+
terminal_no_regress: +0.2
|
| 57 |
+
security_patch_bonus: +0.5 # Task 5 only
|
| 58 |
+
max_possible: 3.0
|
| 59 |
+
|
| 60 |
+
tasks:
|
| 61 |
+
- id: 1
|
| 62 |
+
difficulty: easy
|
| 63 |
+
sla_budget: 15
|
| 64 |
+
description: Single service AttributeError — hallucinated dict method
|
| 65 |
+
|
| 66 |
+
- id: 2
|
| 67 |
+
difficulty: medium
|
| 68 |
+
sla_budget: 20
|
| 69 |
+
description: Silent timestamp corruption in CAPI → ROAS degradation
|
| 70 |
+
|
| 71 |
+
- id: 3
|
| 72 |
+
difficulty: medium-hard
|
| 73 |
+
sla_budget: 20
|
| 74 |
+
description: DB connection pool exhaustion under load
|
| 75 |
+
|
| 76 |
+
- id: 4
|
| 77 |
+
difficulty: hard
|
| 78 |
+
sla_budget: 25
|
| 79 |
+
description: Circular FK migration cascading to 3 services (red herrings)
|
| 80 |
+
|
| 81 |
+
- id: 5
|
| 82 |
+
difficulty: hard
|
| 83 |
+
sla_budget: 20
|
| 84 |
+
description: PII data exposure via DEBUG_MODE=True (security incident)
|
| 85 |
+
|
| 86 |
+
self_improvement:
|
| 87 |
+
enabled: true
|
| 88 |
+
controller: DifficultyController
|
| 89 |
+
description: >
|
| 90 |
+
After each episode the DifficultyController analyses which bug categories
|
| 91 |
+
the agent failed on and weights future task selection toward those weaknesses.
|
| 92 |
+
Bug categories: async_bugs, data_corruption, security_bugs,
|
| 93 |
+
cascading_failures, red_herrings.
|
| 94 |
+
|
| 95 |
+
usage_example: |
|
| 96 |
+
import requests
|
| 97 |
+
|
| 98 |
+
BASE = "http://localhost:8000"
|
| 99 |
+
|
| 100 |
+
obs = requests.post(f"{BASE}/reset", json={"task_id": 1}).json()
|
| 101 |
+
done = False
|
| 102 |
+
|
| 103 |
+
while not done:
|
| 104 |
+
action = your_agent.decide(obs) # returns {"tool": ..., "params": ...}
|
| 105 |
+
result = requests.post(f"{BASE}/step", json=action).json()
|
| 106 |
+
obs = result["observation"]
|
| 107 |
+
done = result["done"]
|
| 108 |
+
|
| 109 |
+
score = requests.get(f"{BASE}/grade").json()["normalized_score"]
|
| 110 |
+
print(f"Score: {score:.3f}")
|
requirements.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Meta-SRE – Python dependencies
|
| 2 |
+
|
| 3 |
+
# Core server
|
| 4 |
+
fastapi>=0.110.0
|
| 5 |
+
uvicorn[standard]>=0.29.0
|
| 6 |
+
pydantic>=2.0.0
|
| 7 |
+
|
| 8 |
+
# Training (install separately in Colab — GPU required)
|
| 9 |
+
# unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
|
| 10 |
+
# trl>=0.9.0 # GRPOTrainer + GRPOConfig require >=0.9.0
|
| 11 |
+
# datasets>=2.18.0
|
| 12 |
+
# transformers>=4.39.0
|
| 13 |
+
# accelerate>=0.28.0
|
| 14 |
+
# bitsandbytes>=0.43.0
|
| 15 |
+
# peft>=0.10.0
|
| 16 |
+
|
| 17 |
+
# Data / evaluation
|
| 18 |
+
numpy>=1.26.0
|
| 19 |
+
|
| 20 |
+
matplotlib>=3.8.0
|
| 21 |
+
seaborn>=0.13.0
|
| 22 |
+
|
| 23 |
+
# Dev / testing
|
| 24 |
+
pytest>=8.0.0
|
| 25 |
+
httpx>=0.27.0 # for FastAPI TestClient
|
training/__init__.py
ADDED
|
File without changes
|
training/generator.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 4 – Perfect-Play Bot & JSONL Dataset Generator.
|
| 3 |
+
|
| 4 |
+
Runs all 5 tasks optimally to generate training episodes.
|
| 5 |
+
Outputs: training/dataset/training_data.jsonl
|
| 6 |
+
|
| 7 |
+
Usage:
|
| 8 |
+
python -m training.generator --episodes 40 --output training/dataset/training_data.jsonl
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
import sys, os
|
| 12 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import random
|
| 16 |
+
import argparse
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Any, Dict, List, Tuple
|
| 19 |
+
|
| 20 |
+
from app.engine.manager import EpisodeManager, TASK_DEFINITIONS
|
| 21 |
+
from app.engine.observability import DifficultyController
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Perfect-Play Scripts
|
| 26 |
+
# Each script is a list of (tool, params) tuples that solve the task optimally.
|
| 27 |
+
# Randomisation is applied to variable names / details for dataset diversity.
|
| 28 |
+
# ---------------------------------------------------------------------------
|
| 29 |
+
|
| 30 |
+
def _vary(base: str, variants: List[str]) -> str:
|
| 31 |
+
return random.choice([base] + variants)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def perfect_play_task1(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
    """Scripted solution for task 1: ad.get_clicks() → ad.get('clicks', 0).

    Returns the ordered ``(tool, params)`` steps: read logs, view and blame
    ranker.py, patch line 22, run the unit suite, file the incident report.
    ``ep`` is accepted for signature parity with the other scripts and is not
    read here.
    """
    service = "ad_ranking"
    source_file = "ranker.py"
    buggy_line = 22

    # One of two alternative replacement lines, picked at random for variety.
    replacement = _vary(
        " click_rate = ad.get('clicks', 0) / max(ad.get('impressions', 1), 1)",
        [" click_rate = ad['clicks'] / max(ad.get('impressions', 1), 1)"],
    )

    patch = {
        "service": service,
        "filename": source_file,
        "line_number": buggy_line,
        "new_code": replacement,
    }
    report = {
        "root_cause": "AttributeError: dict has no attribute get_clicks() — Junior AI generated method call instead of dict accessor",
        "fix_applied": "Replaced ad.get_clicks() with ad.get('clicks', 0) on ranker.py line 22",
        "services_affected": [service],
        "severity_classification": "P0",
    }

    return [
        ("read_logs", {"service": service, "log_level": "ERROR", "last_n_lines": 20}),
        ("view_file", {"service": service, "filename": source_file}),
        ("git_blame", {"service": service, "filename": source_file, "line_number": buggy_line}),
        ("edit_line", patch),
        ("run_tests", {"service": service, "suite": "unit"}),
        ("write_incident_report", report),
    ]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def perfect_play_task2(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
    """Scripted solution for task 2: silent timestamp corruption (1e9 → 1e12).

    Returns the ordered ``(tool, params)`` steps: follow the symptom from
    ad_ranking to capi_pipeline, patch the threshold in transformer.py, run
    the integration suite, file the incident report. ``ep`` is not read here.

    The line number quoted in the incident report is taken from the same
    constant as the edit, so the report always names the line that was
    actually patched (previously the edit used 43 and the report said 40).
    """
    upstream = "capi_pipeline"
    downstream = "ad_ranking"
    source_file = "transformer.py"
    fix_line = 43

    report = {
        "root_cause": "Timestamp normalisation threshold in capi_pipeline/transformer.py was 1e9 instead of 1e12 — unix-second timestamps treated as milliseconds, resulting in events attributed to 1970",
        "fix_applied": f"Changed _normalize_timestamp threshold from 1_000_000_000 to 1_000_000_000_000 on transformer.py line {fix_line}",
        "services_affected": [upstream, downstream],
        "severity_classification": "P1",
    }

    return [
        ("read_logs", {"service": downstream, "log_level": "WARN", "last_n_lines": 20}),
        ("check_dependency", {"service_a": downstream, "service_b": upstream}),
        ("query_metrics_history", {"service": upstream, "metric": "error_rate", "hours_back": 6}),
        ("read_logs", {"service": upstream, "log_level": "DEBUG", "last_n_lines": 20}),
        ("view_file", {"service": upstream, "filename": source_file}),
        ("edit_line", {
            "service": upstream,
            "filename": source_file,
            "line_number": fix_line,
            "new_code": " if ts > 1_000_000_000_000:",
        }),
        ("run_tests", {"service": upstream, "suite": "integration"}),
        ("write_incident_report", report),
    ]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def perfect_play_task3(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
    """Scripted solution for task 3: connection pool exhaustion.

    Returns the ordered ``(tool, params)`` steps: inspect logs and the request
    queue, view and blame handler.py, run the unit suite, rewrite lines 35-37
    (``raise`` / ``finally:`` / ``await self.db_pool.release(conn)``), run the
    load suite, file the incident report. ``ep`` is not read here.
    """
    service = "whatsapp_sync"
    source_file = "handler.py"

    def patch(line_number: int, new_code: str) -> Tuple[str, Dict]:
        # Build one single-line edit against handler.py.
        return ("edit_line", {
            "service": service,
            "filename": source_file,
            "line_number": line_number,
            "new_code": new_code,
        })

    steps: List[Tuple[str, Dict]] = [
        ("read_logs", {"service": service, "log_level": "ERROR", "last_n_lines": 20}),
        ("query_metrics_history", {"service": service, "metric": "request_queue", "hours_back": 4}),
        ("view_file", {"service": service, "filename": source_file}),
        ("git_blame", {"service": service, "filename": source_file, "line_number": 35}),
        ("run_tests", {"service": service, "suite": "unit"}),
    ]
    steps.extend([
        patch(35, " raise"),
        patch(36, " finally:"),
        patch(37, " await self.db_pool.release(conn)"),
    ])
    steps.append(("run_tests", {"service": service, "suite": "load"}))
    steps.append(("write_incident_report", {
        "root_cause": "DB connection pool exhaustion in whatsapp_sync — sync_user_messages() acquires a connection but has no finally block to release it on exception, causing pool depletion under concurrent load",
        "fix_applied": "Added finally: await self.db_pool.release(conn) to sync_user_messages() in handler.py",
        "services_affected": [service],
        "severity_classification": "P1",
    }))
    return steps
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def perfect_play_task4(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
    """Scripted solution for task 4: circular FK introduced by migration 003.

    Returns the ordered ``(tool, params)`` steps: inspect logs and latency
    history, view and blame db.py, run the unit suite, roll back version
    "003", run the integration suite, file the incident report. ``ep`` is not
    read here.
    """
    service = "whatsapp_sync"
    source_file = "db.py"

    investigate: List[Tuple[str, Dict]] = [
        ("read_logs", {"service": service, "log_level": "ERROR", "last_n_lines": 30}),
        ("query_metrics_history", {"service": "capi_pipeline", "metric": "p99_latency_ms", "hours_back": 6}),
        ("view_file", {"service": service, "filename": source_file}),
        ("git_blame", {"service": service, "filename": source_file, "line_number": 45}),
        ("run_tests", {"service": service, "suite": "unit"}),
    ]
    remediate: List[Tuple[str, Dict]] = [
        ("rollback", {"service": service, "version": "003"}),
        ("run_tests", {"service": service, "suite": "integration"}),
    ]
    report = {
        "root_cause": "Circular foreign key in migration 003: message_threads.parent_message_id references messages, and the ALTER TABLE added messages.thread_id referencing message_threads — PostgreSQL FK resolution failure cascaded to all DB pool consumers",
        "fix_applied": "Rolled back migration 003 to remove circular FK constraint",
        "services_affected": [service],
        "severity_classification": "P0",
    }
    return investigate + remediate + [("write_incident_report", report)]
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def perfect_play_task5(ep: EpisodeManager) -> List[Tuple[str, Dict]]:
    """Scripted solution for task 5: PII leak, DEBUG_MODE = True → False.

    Returns the ordered ``(tool, params)`` steps: read debug logs, run the
    unit suite, view and blame ingestor.py, patch line 7, run the security
    suite, file the incident report. ``ep`` is not read here.
    """
    service = "capi_pipeline"
    source_file = "ingestor.py"
    flag_line = 7

    patch = {
        "service": service,
        "filename": source_file,
        "line_number": flag_line,
        "new_code": "DEBUG_MODE = False # FIXED: must be False in production",
    }
    report = {
        "root_cause": "PII data exposure: DEBUG_MODE=True in production caused /ingest to return raw user PII (emails, phone numbers) in HTTP response body — invisible to unit tests, caught by security suite",
        "fix_applied": "Set DEBUG_MODE = False in capi_pipeline/ingestor.py line 7",
        "services_affected": [service],
        "severity_classification": "P0",
    }

    script: List[Tuple[str, Dict]] = []
    script.append(("read_logs", {"service": service, "log_level": "DEBUG", "last_n_lines": 20}))
    script.append(("run_tests", {"service": service, "suite": "unit"}))
    script.append(("view_file", {"service": service, "filename": source_file}))
    script.append(("git_blame", {"service": service, "filename": source_file, "line_number": flag_line}))
    script.append(("edit_line", patch))
    script.append(("run_tests", {"service": service, "suite": "security"}))
    script.append(("write_incident_report", report))
    return script
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# Registry: task id (1-based) → function that builds that task's scripted
# (tool, params) solution.
PERFECT_PLAY_SCRIPTS = dict(
    enumerate(
        (
            perfect_play_task1,
            perfect_play_task2,
            perfect_play_task3,
            perfect_play_task4,
            perfect_play_task5,
        ),
        start=1,
    )
)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ---------------------------------------------------------------------------
|
| 171 |
+
# Observation → prompt string formatter
|
| 172 |
+
# ---------------------------------------------------------------------------
|
| 173 |
+
|
| 174 |
+
def obs_to_prompt(obs: dict) -> str:
    """Render an observation dict as the plain-text prompt given to the model.

    Sections, in order: incident id, task description, step/budget line,
    per-service metrics, active alerts, terminal output and SRE memory.

    Args:
        obs: Observation as a plain dict. Keys read: ``incident_id``,
            ``task_description``, ``step``, ``budget_remaining``,
            ``system_metrics`` (service → metrics dict), ``active_alerts``
            (list of dicts), ``terminal_output`` and ``sre_memory`` (list).
            Non-dict metric and alert entries are skipped.

    Returns:
        The formatted prompt string.

    A ``None`` value for ``system_metrics``, ``active_alerts``, ``sre_memory``
    or any numeric metric is treated the same as a missing key. ``dict.get``
    only applies its default when the key is absent, so an explicit ``None``
    previously raised ``AttributeError``/``TypeError`` here.
    """
    metric_lines = []
    for svc, m in (obs.get("system_metrics") or {}).items():
        if not isinstance(m, dict):
            continue
        # ``or 0`` covers both a missing key and an explicit None value.
        cpu = m.get("cpu_percent") or 0
        mem = m.get("memory_mb") or 0
        err = m.get("error_rate") or 0
        metric_lines.append(
            f" {svc}: CPU={cpu:.0f}% "
            f"MEM={mem:.0f}MB "
            f"ERR={err:.1f}/s "
            f"STATUS={m.get('status', '?')}"
        )

    alert_lines = [
        f" [{a.get('severity', '?')}] {a.get('service', '?')}: {a.get('message', '')}"
        for a in (obs.get("active_alerts") or [])
        if isinstance(a, dict)
    ]

    metrics_block = "\n".join(metric_lines)
    alerts_block = "\n".join(alert_lines) or " None"
    memory_block = (
        "\n".join(f" {entry}" for entry in (obs.get("sre_memory") or []))
        or " (empty)"
    )

    return (
        f"INCIDENT: {obs.get('incident_id', '')}\n"
        f"TASK: {obs.get('task_description', '')}\n"
        f"STEP: {obs.get('step', 0)} | BUDGET: {obs.get('budget_remaining', 0)} steps remaining\n\n"
        f"SYSTEM METRICS:\n{metrics_block}\n\n"
        f"ACTIVE ALERTS:\n{alerts_block}\n\n"
        f"TERMINAL:\n{obs.get('terminal_output', '')}\n\n"
        f"SRE MEMORY:\n{memory_block}\n"
    )
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def action_to_response(tool: str, params: Dict) -> str:
    """Serialise one agent action as the assistant message for the dataset.

    The result is a JSON object with keys ``tool`` and ``params`` (in that
    order), pretty-printed with a two-space indent.
    """
    payload = {"tool": tool, "params": params}
    return json.dumps(payload, indent=2)
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ---------------------------------------------------------------------------
|
| 210 |
+
# Episode runner
|
| 211 |
+
# ---------------------------------------------------------------------------
|
| 212 |
+
|
| 213 |
+
def run_episode(task_id: int, ep: EpisodeManager) -> List[Dict]:
    """Play the scripted solution for ``task_id`` and record the conversation.

    Resets ``ep`` to the task, then replays the script from
    ``PERFECT_PLAY_SCRIPTS`` step by step. Returns chat-style turns: one
    ``system`` turn, the initial observation as a ``user`` turn, then an
    ``assistant`` action followed by a ``user`` observation for each step.

    Replay stops early when a step reports ``done`` or when stepping raises
    ``RuntimeError``; in the latter case the last ``assistant`` turn has no
    observation after it.
    """
    first_obs = ep.reset(task_id=task_id)
    scripted_actions = PERFECT_PLAY_SCRIPTS[task_id](ep)
    first_obs_dict = first_obs.model_dump()

    system_prompt = (
        "You are a Senior Site Reliability Engineer (SRE) at Meta. "
        "You are debugging a live production incident. "
        "Use the available tools methodically: read logs first, then inspect code, "
        "make surgical single-line edits, verify with tests, and close with an incident report. "
        "Never rewrite entire files. Always run tests after editing."
    )

    # System instructions first, then the initial observation as the first user turn.
    conversation: List[Dict] = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": obs_to_prompt(first_obs_dict)},
    ]

    for tool, params in scripted_actions:
        # The scripted action is what the assistant "decides".
        conversation.append({
            "role": "assistant",
            "content": action_to_response(tool, params),
        })

        try:
            # Apply the action; the resulting observation becomes the next user turn.
            outcome = ep.step(tool=tool, params=params)
            next_obs_dict = outcome.observation.model_dump()
            conversation.append({
                "role": "user",
                "content": obs_to_prompt(next_obs_dict),
            })
            if outcome.done:
                break
        except RuntimeError:
            break

    return conversation
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# ---------------------------------------------------------------------------
|
| 267 |
+
# Dataset generator
|
| 268 |
+
# ---------------------------------------------------------------------------
|
| 269 |
+
|
| 270 |
+
def generate_dataset(
    episodes_per_task: int = 40,
    output_path: str = "training/dataset/training_data.jsonl",
    seed: int = 42,
) -> None:
    """Run scripted episodes for every registered task and write them as JSONL.

    Tasks are visited round-robin in ascending task-id order, so each task
    gets ``episodes_per_task`` episodes. One JSON record is written per
    successful episode with ``episode_id``, ``task_id``, ``normalized_score``,
    ``steps_taken`` and ``messages``. A summary is printed at the end.

    Args:
        episodes_per_task: Number of episodes to attempt for each task.
        output_path: Destination file; parent directories are created and an
            existing file is overwritten.
        seed: Seed for the module-level ``random`` generator.

    The set of tasks comes from ``PERFECT_PLAY_SCRIPTS`` rather than a
    hard-coded count of five, so the loop, the counters and the progress
    total cannot drift out of sync with the registry.
    """
    random.seed(seed)
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    task_ids = sorted(PERFECT_PLAY_SCRIPTS)
    planned = episodes_per_task * len(task_ids)

    ep = EpisodeManager(difficulty_controller=DifficultyController())
    total = 0
    task_counts = {t: 0 for t in task_ids}

    with open(output_path, "w", encoding="utf-8") as f:
        for episode_idx in range(planned):
            task_id = task_ids[episode_idx % len(task_ids)]

            try:
                turns = run_episode(task_id, ep)
                result = ep.get_episode_result()

                record = {
                    "episode_id": f"ep_{episode_idx:04d}",
                    "task_id": task_id,
                    "normalized_score": result.normalized_score,
                    "steps_taken": result.steps_taken,
                    "messages": turns,
                }
                f.write(json.dumps(record) + "\n")
                total += 1
                task_counts[task_id] += 1

                if episode_idx % 10 == 0:
                    print(
                        f"[{episode_idx:4d}/{planned}] "
                        f"task={task_id} score={result.normalized_score:.3f} "
                        f"steps={result.steps_taken}"
                    )
            except Exception as e:
                # Best-effort batch job: report the failed episode and carry on
                # so one bad episode does not discard the rest of the dataset.
                print(f"WARNING: episode {episode_idx} task {task_id} failed: {e}")

    print(f"\nDataset written to {output_path}")
    print(f"Total episodes: {total}")
    for t, c in task_counts.items():
        print(f" Task {t}: {c} episodes")
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
# ---------------------------------------------------------------------------
|
| 317 |
+
# Baseline evaluator (for "before training" comparison)
|
| 318 |
+
# ---------------------------------------------------------------------------
|
| 319 |
+
|
| 320 |
+
def run_baseline_naive(task_id: int) -> float:
    """Score a deliberately poor two-step agent on ``task_id``.

    The agent overwrites line 1 of ad_ranking/ranker.py with a placeholder
    comment and then files a vague incident report, regardless of which task
    is active. Returns the episode's normalized score (the original author
    expected roughly 0.18).
    """
    episode = EpisodeManager()
    episode.reset(task_id=task_id)

    # Step 1: blind edit with no investigation.
    blind_edit = {
        "service": "ad_ranking",
        "filename": "ranker.py",
        "line_number": 1,
        "new_code": "# rewriting entire file... (hallucination)",
    }
    episode.step("edit_line", blind_edit)

    # Step 2: close the incident without having fixed anything.
    vague_report = {
        "root_cause": "unknown error in the code",
        "fix_applied": "rewrote the file",
        "services_affected": ["ad_ranking"],
        "severity_classification": "P1",
    }
    episode.step("write_incident_report", vague_report)

    return episode.reward.normalized_score()
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
def evaluate_model(
    model_name: str,
    call_fn,  # callable(prompt: str) -> str (returns JSON action)
    n_tasks: int = 5,
) -> Dict[str, Any]:
    """Run a model against tasks ``1..n_tasks`` and collect normalized scores.

    Args:
        model_name: Label used in the printed summary.
        call_fn: Receives the observation prompt string and returns a JSON
            string with ``tool`` and optional ``params``.
        n_tasks: Number of tasks to evaluate, starting at task 1.

    Returns:
        A dict with one ``task_<id>`` score per task plus ``average``
        (rounded to 4 decimals; ``0.0`` when no task was run).

    Each task runs until the environment reports ``done``, the step counter
    reaches 30, or the model call / action parsing / step raises, in which
    case the error is printed and the task is scored as it stands.
    """
    ep = EpisodeManager()
    scores: Dict[str, Any] = {}

    for task_id in range(1, n_tasks + 1):
        obs = ep.reset(task_id=task_id)
        done = False
        while not done and ep._step < 30:
            # model_dump() is the Pydantic v2 API already used by run_episode;
            # .dict() is deprecated under the pinned pydantic>=2.
            prompt = obs_to_prompt(obs.model_dump())
            try:
                response = call_fn(prompt)
                action = json.loads(response)
                result = ep.step(action["tool"], action.get("params", {}))
                obs = result.observation
                done = result.done
            except Exception as e:
                # Model output is untrusted: any failure ends this task only.
                print(f"Model error on task {task_id}: {e}")
                break
        scores[f"task_{task_id}"] = ep.reward.normalized_score()

    # Guard against n_tasks <= 0, which would otherwise divide by zero.
    avg = sum(scores.values()) / len(scores) if scores else 0.0
    scores["average"] = round(avg, 4)
    print(f"\n{model_name} evaluation results:")
    for k, v in scores.items():
        print(f" {k}: {v:.3f}")
    return scores
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
# ---------------------------------------------------------------------------
|
| 383 |
+
# CLI
|
| 384 |
+
# ---------------------------------------------------------------------------
|
| 385 |
+
|
| 386 |
+
if __name__ == "__main__":
    # Command-line entry point: parse the options and write the JSONL dataset.
    cli = argparse.ArgumentParser(description="Meta-SRE dataset generator")
    cli.add_argument(
        "--episodes",
        type=int,
        default=40,
        help="Episodes per task (default: 40 → 200 total)",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="training/dataset/training_data.jsonl",
    )
    cli.add_argument("--seed", type=int, default=42)
    options = cli.parse_args()

    generate_dataset(
        episodes_per_task=options.episodes,
        output_path=options.output,
        seed=options.seed,
    )
|