Veer15 commited on
Commit
b641d3d
·
verified ·
1 Parent(s): 80ee7f5

chore: deploy distributed systems debug environment

Browse files
.gitignore ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ docs/
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *.pyo
7
+ .venv/
8
+ venv/
9
+ .env
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+
14
+ # Bun / Node
15
+ node_modules/
16
+ bun.lockb
17
+
18
+ # SQLite
19
+ *.db
20
+ *.sqlite
21
+ *.sqlite3
22
+
23
+ # Logs
24
+ *.log
25
+ /tmp/
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # IDE
32
+ .cursor/
33
+ .vscode/
34
+ *.swp
35
+
36
+ inference.out
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from the pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_api_reset.py::test_reset_defaults_to_cascading_timeout_when_task_missing",
3
+ "tests/test_api_reset.py::test_reset_rejects_unknown_explicit_task",
4
+ "tests/test_env_task_context.py::test_job_generator_rate_resolved_uses_config",
5
+ "tests/test_env_task_context.py::test_registry_auth_matches_default_detects_corruption",
6
+ "tests/test_env_task_context.py::test_restore_defaults_adds_job_generator_config",
7
+ "tests/test_graders.py::test_grade_backpressure_cascade_continuous",
8
+ "tests/test_graders.py::test_grade_byzantine_queue_fault_cases",
9
+ "tests/test_graders.py::test_grade_cascading_timeout_boundaries",
10
+ "tests/test_graders.py::test_grade_distributed_lock_starvation_cases",
11
+ "tests/test_graders.py::test_grade_job_generator_runaway_thresholds",
12
+ "tests/test_graders.py::test_grade_registry_corruption_thresholds",
13
+ "tests/test_graders.py::test_grade_route_partition_threshold",
14
+ "tests/test_inference_format.py::test_attempt_history_block_renders_all_attempts",
15
+ "tests/test_inference_format.py::test_build_prompt_contains_symptoms_and_history",
16
+ "tests/test_inference_format.py::test_end_log_line_includes_score_and_reward_list",
17
+ "tests/test_inference_format.py::test_episode_score_clamps_terminal_reward_to_unit_interval",
18
+ "tests/test_inference_format.py::test_extract_command_prefers_first_json_object_with_command",
19
+ "tests/test_inference_format.py::test_extract_command_reads_fenced_json_payload",
20
+ "tests/test_inference_format.py::test_extract_command_reads_json_after_reasoning_preamble",
21
+ "tests/test_inference_format.py::test_extract_command_reads_json_embedded_in_text",
22
+ "tests/test_inference_format.py::test_extract_command_reads_json_payload",
23
+ "tests/test_inference_format.py::test_extract_command_rejects_non_json_code_fence",
24
+ "tests/test_inference_format.py::test_extract_command_requires_command_even_with_reasoning",
25
+ "tests/test_inference_format.py::test_extract_command_returns_none_when_empty",
26
+ "tests/test_inference_format.py::test_extract_reasoning_when_present",
27
+ "tests/test_inference_format.py::test_parse_tasks_default_and_override",
28
+ "tests/test_inference_format.py::test_single_line_removes_newlines",
29
+ "tests/test_inference_format.py::test_task_symptom_block_includes_new_tasks",
30
+ "tests/test_inference_format.py::test_task_symptom_block_is_non_empty",
31
+ "tests/test_models.py::test_action_rejects_empty_command",
32
+ "tests/test_models.py::test_observation_roundtrip",
33
+ "tests/test_models.py::test_system_metrics_rejects_success_rate_above_one"
34
+ ]
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

# System tools the services and the agent's shell commands rely on.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    redis-server \
    jq \
    procps \
    sqlite3 \
    unzip \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

RUN useradd -m -u 1000 user
WORKDIR /home/user/app
# Publish the app's mesh directory at /mesh while still root.
# /mesh must NOT be pre-created as a real directory: `ln -sfn TARGET /mesh`
# against an existing directory places the link *inside* it (/mesh/mesh),
# and the unprivileged user cannot remove /mesh afterwards. The link may
# dangle until mesh/ is copied below; that is fine for a symlink.
RUN ln -sfn /home/user/app/mesh /mesh && chown -R user:user /home/user
USER user

ENV HOME=/home/user
ENV PATH="/home/user/.bun/bin:/home/user/.local/bin:${PATH}"

RUN curl -fsSL https://bun.sh/install | bash

# Install JS dependencies first so these layers are reused when only Python changes.
COPY --chown=user:user mesh/ ./mesh/
RUN cd mesh/gateway && bun install
RUN cd mesh/auth && bun install
RUN cd mesh/worker && bun install

COPY --chown=user:user requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user:user *.py ./
COPY --chown=user:user server/ ./server/
COPY --chown=user:user openenv.yaml ./
COPY --chown=user:user start.sh ./
RUN chmod +x ./start.sh

EXPOSE 8000
CMD ["./start.sh"]
README.md CHANGED
@@ -1,9 +1,244 @@
1
- ---
2
- title: openenv-distributed-systems-debugging
3
- sdk: docker
4
- short_description: OpenEnv RL env for debugging distributed systems failures.
5
- ---
6
 
7
- # openenv-distributed-systems-debugging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- OpenEnv-compatible RL environment for debugging distributed systems failures.
 
1
+ # Distributed Systems Debug Environment
 
 
 
 
2
 
3
+ ## Overview
4
+ This project provides an OpenEnv-compatible RL environment for debugging distributed systems failures.
5
+
6
+ The environment simulates a production-style pipeline:
7
+
8
+ - Gateway service (sync HTTP orchestration)
9
+ - Auth service (sync dependency)
10
+ - Redis queue (message bus)
11
+ - Worker service (async consumer + lock handling)
12
+ - SQLite sink (persistence simulation)
13
+
14
+ An agent interacts only through shell commands and must diagnose/fix injected faults.
15
+
16
+ ## Why this environment
17
+ Most RL environments focus on games or synthetic workflows. This one targets bugs that I have personally faced at my job, focusing on the debugging skills used in real systems engineering:
18
+
19
+ - reading logs under uncertainty
20
+ - triaging latency and queue symptoms
21
+ - fixing misconfigurations safely
22
+ - validating recovery from metrics
23
+
24
+ ## Architecture
25
+ ```
26
+ Agent command -> /step (FastAPI)
27
+ |
28
+ +-> executes shell command (sandboxed, 10s timeout)
29
+ +-> polls metrics
30
+ +-> grades progress
31
+
32
+ Services (same container):
33
+ gateway:3000 -> auth:3001 -> redis:6379 -> worker -> sqlite
34
+ ```
35
+
36
+ ## Observation Space
37
+ | Field | Type | Description |
38
+ |---|---|---|
39
+ | `command_output` | string | stdout+stderr of last command |
40
+ | `metrics.gateway_success_rate` | float [0,1] | rolling gateway success rate |
41
+ | `metrics.gateway_p99_latency_ms` | float | rolling p99 latency |
42
+ | `metrics.queue_depth` | int | Redis queue depth |
43
+ | `metrics.worker_restart_count` | int | simulated crash-loop count |
44
+ | `metrics.consumer_stall_count` | int | lock-starvation stall count |
45
+ | `process_status` | object | runtime status by service |
46
+
47
+ ## Action Space
48
+ Single command action:
49
+
50
+ ```json
51
+ { "command": "<bash command>" }
52
+ ```
53
+
54
+ Examples:
55
+ - `tail -20 /tmp/worker.log`
56
+ - `redis-cli DEL LOCK:job_processor`
57
+ - `cat /mesh/gateway/blocked_routes.json`
58
+ - `kill -HUP $(cat /tmp/worker.pid)`
59
+
60
+ ## Tasks
61
+ | Task | Difficulty | Goal |
62
+ |---|---|---|
63
+ | `cascading-timeout` | easy | restore successful sync flow (auth delay vs gateway timeout) |
64
+ | `byzantine-queue-fault` | medium | remove poison message and stabilize worker |
65
+ | `distributed-lock-starvation` | hard | clear stale lock and resume consumption |
66
+ | `backpressure-cascade` | hard | recover throughput and reduce queue growth |
67
+ | `route-partition` | hard | unblock gateway->redis route policy |
68
+ | `registry-corruption` | medium | repair corrupted auth registry entry and restore request flow |
69
+ | `job-generator-runaway` | hard | reduce enqueue pressure so the queue drains sustainably |
70
+
71
+ ## Reward Function
72
+ - Terminal reward: `1.0` when grader score >= `0.95`
73
+ - Dense shaping from grader progress + investigation command bonus + metric improvements
74
+ - Penalties for blocked/damaging actions and repeated non-productive behavior
75
+ - Reward clamped to `[0.0, 1.0]`
76
+
77
+ ## Baseline Inference policy (3 of 7 by default)
78
+ All seven tasks are implemented in the environment.
79
+
80
+ `inference.py` runs these default tasks for runtime reliability:
81
+
82
+ 1. `cascading-timeout` (easy)
83
+ 2. `byzantine-queue-fault` (medium)
84
+ 3. `distributed-lock-starvation` (hard)
85
+
86
+ Override with:
87
+
88
+ ```bash
89
+ TASKS_CSV=cascading-timeout,route-partition python inference.py
90
+ ```
91
+
92
+ ## Setup
93
+ ### Local
94
+ ```bash
95
+ python3.12 -m venv .venv
96
+ . .venv/bin/activate
97
+ pip install -r requirements.txt
98
+
99
+ bun install --cwd mesh/gateway
100
+ bun install --cwd mesh/auth
101
+ bun install --cwd mesh/worker
102
+
103
+ APP_ROOT=$(pwd) MESH_ROOT=$(pwd)/mesh ./start.sh
104
+ ```
105
+
106
+ ### Docker
107
+ ```bash
108
+ docker build -t dist-debug-env .
109
+ docker run -p 8000:8000 dist-debug-env
110
+ ```
111
+
112
+ ### API smoke check
113
+ ```bash
114
+ curl http://localhost:8000/health
115
+ curl -X POST "http://localhost:8000/reset?task_name=cascading-timeout"
116
+ curl -X POST http://localhost:8000/step \
117
+ -H "Content-Type: application/json" \
118
+ -d '{"command":"ls /tmp"}'
119
+ ```
120
+
121
+ ## Inference script contract
122
+ `inference.py` emits strict logs:
123
+
124
+ ```text
125
+ [START] task=<task_name> env=<benchmark> model=<model_name>
126
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
127
+ [END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...,rn>
128
+ ```
129
+
130
+ ## Logging
131
+ Service logs (JSON-lines):
132
+ - `/tmp/gateway.log`
133
+ - `/tmp/auth.log`
134
+ - `/tmp/worker.log`
135
+
136
+ Common fields:
137
+ - `ts`, `level`, `service`, `event`, `pattern`
138
+
139
+ Example investigation commands:
140
+ ```bash
141
+ tail -30 /tmp/worker.log
142
+ jq 'select(.level=="ERROR")' /tmp/worker.log
143
+ redis-cli LLEN job_queue
144
+ ```
145
+
146
+ ## Baseline scores
147
+ Baseline scores depend on endpoint/model latency and quality. Reproduce with:
148
+
149
+ ```bash
150
+ HF_TOKEN=<token> API_BASE_URL=<endpoint> MODEL_NAME=<model> python inference.py
151
+ ```
152
+
153
+
154
+ ## Run this locally
155
+ Use this checklist when running the full baseline end-to-end on your machine.
156
+
157
+ 1. Install dependencies and validate project setup:
158
+ ```bash
159
+ ./setup-dev.sh
160
+ ```
161
+
162
+ 2. Activate the project virtual environment (required so `uvicorn` and Python deps are on PATH):
163
+ ```bash
164
+ source .venv/bin/activate
165
+ ```
166
+
167
+ 3. Start the environment API (leave this terminal running):
168
+ ```bash
169
+ APP_ROOT=$(pwd) MESH_ROOT=$(pwd)/mesh ./start.sh
170
+ ```
171
+
172
+ 4. In another terminal, activate venv again and export required inference variables:
173
+ ```bash
174
+ source .venv/bin/activate
175
+ export API_BASE_URL="https://openrouter.ai/api/v1"
176
+ export MODEL_NAME="<your-model>"
177
+ export HF_TOKEN="<your-api-key>"
178
+
179
+ # Optional override; default already runs 3 baseline tasks
180
+ export TASKS_CSV="cascading-timeout,byzantine-queue-fault,distributed-lock-starvation"
181
+ ```
182
+
183
+ If you have a `.env` file, you can load the variables from it with these commands:
184
+
185
+ ```bash
186
+ set -a
187
+ source .env
188
+ set +a
189
+ ```
190
+
191
+ 5. Run inference with a 20 minute cap and capture output:
192
+ ```bash
193
+ # macOS (via Homebrew coreutils) provides `gtimeout`; on Linux, use `timeout` instead
194
+ gtimeout 1200 python inference.py | tee inference.out
195
+ ```
196
+
197
+ 6. Validate structured stdout format quickly:
198
+ ```bash
199
+ python - <<'PY'
200
+ import re, sys
201
+ from pathlib import Path
202
+
203
+ lines = Path("inference.out").read_text(encoding="utf-8").splitlines()
204
+ if not lines:
205
+ print("FAIL: no output")
206
+ raise SystemExit(1)
207
+
208
+ start_re = re.compile(r'^\[START\] task=\S+ env=\S+ model=.+$')
209
+ step_re = re.compile(r'^\[STEP\]\s{2}step=\d+ action=.* reward=\d+\.\d{2} done=(true|false) error=.*$')
210
+ end_re = re.compile(r'^\[END\]\s{3}success=(true|false) steps=\d+ score=\d+\.\d{2} rewards=.*$')
211
+
212
+ for i, line in enumerate(lines, 1):
213
+ if line.startswith("[START]") and not start_re.match(line):
214
+ print(f"FAIL: bad START line {i}: {line}")
215
+ raise SystemExit(1)
216
+ if line.startswith("[STEP]") and not step_re.match(line):
217
+ print(f"FAIL: bad STEP line {i}: {line}")
218
+ raise SystemExit(1)
219
+ if line.startswith("[END]") and not end_re.match(line):
220
+ print(f"FAIL: bad END line {i}: {line}")
221
+ raise SystemExit(1)
222
+
223
+ print("PASS: stdout format valid")
224
+ PY
225
+ ```
226
+
227
+ 7. Re-run required submission gates:
228
+ ```bash
229
+ openenv validate .
230
+ docker build -t dist-debug-env:local .
231
+ ```
232
+
233
+
234
+
235
+
236
+
237
+ ## Benchmarks between Models
238
+
239
+ ### 3 Tasks Benchmark
240
+ <img width="1177" height="752" alt="Screenshot 2026-04-04 at 11 54 25 PM" src="https://github.com/user-attachments/assets/3dbfa87a-6696-4589-a908-baa3f498bda8" />
241
+
242
+ ### 7 Task Benchmark
243
+ <img width="1294" height="240" alt="Screenshot 2026-04-05 at 12 30 45 AM" src="https://github.com/user-attachments/assets/1d0d3847-212e-46ba-967f-f79be3f9067c" />
244
 
 
inference.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Any, Iterator
5
+
6
+ import httpx
7
+ from server.constants import (
8
+ DEFAULT_BASELINE_TASK_ENUMS,
9
+ NO_COMMAND_PROVIDED_SENTINEL,
10
+ TASK_MAX_STEPS,
11
+ TaskName,
12
+ )
13
+ from server.models import Action, Observation, StepResult
14
+
15
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
16
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
17
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
18
+ ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
19
+ BENCHMARK = "distributed-systems-debug-env"
20
+ MAX_STEPS_CAP = int(os.getenv("MAX_STEPS", "0"))
21
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
22
+ MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "2048"))
23
+ _JSON_DECODER = json.JSONDecoder()
24
+
25
+
26
def _chat_token_limit_kwargs() -> dict[str, int]:
    """Return the keyword argument that caps completion length for the chat call.

    OpenAI `gpt-5.*` / some models require `max_completion_tokens`, not
    `max_tokens`. The `CHAT_TOKEN_LIMIT_PARAM` environment variable forces
    either name; otherwise the name is chosen from the configured base URL.
    """
    forced_param = os.getenv("CHAT_TOKEN_LIMIT_PARAM", "").strip().lower()
    if forced_param in ("max_tokens", "max_completion_tokens"):
        return {forced_param: MAX_COMPLETION_TOKENS}

    endpoint = API_BASE_URL or ""
    if "api.openai.com" in endpoint:
        param_name = "max_completion_tokens"
    else:
        param_name = "max_tokens"
    return {param_name: MAX_COMPLETION_TOKENS}
37
+
38
+ SYSTEM_PROMPT = """You have bash access to a distributed job processing pipeline that is experiencing a failure.
39
+ Use bash commands to investigate system behavior and narrow down likely fault conditions.
40
+ Standard Unix tools are available: ps, ls, cat, grep, tail, curl, jq, redis-cli, kill, sed.
41
+ Work iteratively across multiple steps; each response must provide the next bash command only.
42
+ Respond with compact JSON where `command` is required: {"command":"<bash command>","reasoning":"optional concise reason"}.
43
+ No markdown. No explanation outside JSON."""
44
+
45
+ TASK_SYMPTOMS: dict[TaskName, tuple[str, ...]] = {
46
+ TaskName.CASCADING_TIMEOUT: (
47
+ "Requests intermittently fail even when services appear up.",
48
+ "Latency spikes sharply during traffic bursts.",
49
+ ),
50
+ TaskName.BYZANTINE_QUEUE_FAULT: (
51
+ "Worker throughput degrades after specific jobs enter the queue.",
52
+ "Queue backlog grows despite workers being alive.",
53
+ ),
54
+ TaskName.DISTRIBUTED_LOCK_STARVATION: (
55
+ "One or more workers appear blocked for extended periods.",
56
+ "Work completion remains low without full service outage.",
57
+ ),
58
+ TaskName.BACKPRESSURE_CASCADE: (
59
+ "Queue depth trends upward over time under steady load.",
60
+ ),
61
+ TaskName.ROUTE_PARTITION: (
62
+ "Gateway requests intermittently fail despite local process health.",
63
+ "Signals point to a connectivity path issue rather than a full service outage.",
64
+ ),
65
+ TaskName.REGISTRY_CORRUPTION: (
66
+ "Gateway requests fail even though the gateway process is still healthy.",
67
+ "Logs and config inspection suggest a bad upstream registry entry.",
68
+ ),
69
+ TaskName.JOB_GENERATOR_RUNAWAY: (
70
+ "Queue backlog grows while the worker stays alive.",
71
+ "Producer pressure appears higher than the system can sustainably drain.",
72
+ ),
73
+ }
74
+
75
+
76
class DistributedDebugEnvClient:
    """Synchronous HTTP client for the environment's `/reset` and `/step` endpoints."""

    def __init__(self, base_url: str) -> None:
        """Open an HTTP client bound to *base_url* with a 45 second timeout."""
        self._client = httpx.Client(base_url=base_url, timeout=45.0)

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def reset(self, task_name: str) -> Observation:
        """POST `/reset` for *task_name* and return the validated observation.

        Raises whatever `raise_for_status` raises on a non-success response.
        """
        reply = self._client.post("/reset", params={"task_name": task_name})
        reply.raise_for_status()
        body = reply.json()
        return Observation.model_validate(body)

    def step(self, action: Action) -> StepResult:
        """POST *action* to `/step` and return the validated step result.

        Raises whatever `raise_for_status` raises on a non-success response.
        """
        reply = self._client.post("/step", json=action.model_dump())
        reply.raise_for_status()
        body = reply.json()
        return StepResult.model_validate(body)
92
+
93
+
94
def _parse_tasks() -> list[TaskName]:
    """Return the tasks to run, read from the `TASKS_CSV` environment variable.

    An unset or blank variable selects the default baseline tasks. Otherwise
    each non-empty comma-separated entry is parsed with `TaskName.parse`.
    """
    raw_csv = os.getenv("TASKS_CSV", "").strip()
    if not raw_csv:
        return list(DEFAULT_BASELINE_TASK_ENUMS)

    # Blank entries (e.g. from a trailing comma) are dropped, not parsed.
    entries = (piece.strip() for piece in raw_csv.split(","))
    return [TaskName.parse(entry) for entry in entries if entry]
107
+
108
+
109
+ def _bool(value: bool) -> str:
110
+ return "true" if value else "false"
111
+
112
+
113
+ def _single_line(text: str) -> str:
114
+ return " ".join(text.replace("\t", " ").splitlines()).strip()
115
+
116
+
117
+ def _command_from_dict(payload: dict[str, Any]) -> tuple[str | None, str | None]:
118
+ command_value = payload.get("command")
119
+ command = command_value.strip() if isinstance(command_value, str) else ""
120
+ if not command:
121
+ return None, None
122
+
123
+ reasoning_value = payload.get("reasoning")
124
+ reasoning = reasoning_value.strip() if isinstance(reasoning_value, str) else ""
125
+ return command, (reasoning or None)
126
+
127
+
128
def _parse_action_payload(text: str) -> tuple[str | None, str | None]:
    """Parse *text* as a single JSON object and extract `(command, reasoning)`.

    Returns `(None, None)` when *text* is not valid JSON or does not decode to
    a JSON object.
    """
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return None, None

    if isinstance(decoded, dict):
        return _command_from_dict(decoded)
    return None, None
138
+
139
+
140
def _iter_decoded_json_objects(text: str) -> Iterator[Any]:
    """Yield every JSON value that decodes starting at a `{` in *text*.

    The scan moves left to right. After a successful decode it resumes just
    past the decoded value, so objects nested inside a yielded object are not
    yielded again. A `{` that does not start valid JSON is skipped.
    """
    cursor = text.find("{")
    while cursor != -1:
        try:
            value, stop = _JSON_DECODER.raw_decode(text, cursor)
        except json.JSONDecodeError:
            # Not a decodable object here; try the next opening brace.
            cursor = text.find("{", cursor + 1)
            continue
        yield value
        cursor = text.find("{", stop)
153
+
154
+
155
+ def _assistant_message_text(message: Any) -> str:
156
+ content = getattr(message, "content", None)
157
+ if content is None:
158
+ return ""
159
+ if isinstance(content, str):
160
+ return content
161
+ if isinstance(content, list):
162
+ parts: list[str] = []
163
+ for part in content:
164
+ if isinstance(part, dict):
165
+ text_val = part.get("text")
166
+ if text_val is not None:
167
+ parts.append(str(text_val))
168
+ else:
169
+ text_attr = getattr(part, "text", None)
170
+ if text_attr is not None:
171
+ parts.append(str(text_attr))
172
+ return "\n".join(parts)
173
+ return str(content)
174
+
175
+
176
def extract_action_payload(llm_response: str) -> tuple[str | None, str | None]:
    """Extract `(command, reasoning)` from a raw model reply.

    Strategies are tried in order and the first one yielding a command wins:

    1. the whole reply (after removing a surrounding code fence) as JSON;
    2. each JSON object found by scanning the reply for decodable `{...}`;
    3. each brace-delimited span without nested braces, which can reach an
       object nested inside an outer object that has no `command`;
    4. the first line of the reply as JSON.

    Returns `(None, None)` when the reply is blank or no strategy finds a
    command.
    """
    text = llm_response.strip()
    if not text:
        return None, None

    # Drop the opening and closing fence lines of a fenced reply.
    if text.startswith("```"):
        fence_lines = text.split("\n")
        if len(fence_lines) > 2:
            text = "\n".join(fence_lines[1:-1]).strip()

    found = _parse_action_payload(text)
    if found[0]:
        return found

    for candidate in _iter_decoded_json_objects(text):
        if not isinstance(candidate, dict):
            continue
        found = _command_from_dict(candidate)
        if found[0]:
            return found

    for flat_span in re.finditer(r"\{[^{}]*\}", text, flags=re.DOTALL):
        found = _parse_action_payload(flat_span.group(0))
        if found[0]:
            return found

    leading_line = text.split("\n")[0].strip()
    return _parse_action_payload(leading_line)
203
+
204
+
205
def extract_command(llm_response: str) -> str | None:
    """Return only the command from a model reply, or `None` if none is found."""
    command, _reasoning = extract_action_payload(llm_response)
    return command
207
+
208
+
209
def extract_reasoning(llm_response: str) -> str | None:
    """Return only the reasoning from a model reply, or `None` if none is found."""
    _command, reasoning = extract_action_payload(llm_response)
    return reasoning
211
+
212
+
213
def _sanitize_reasoning_for_step(reasoning: str) -> str:
    """Make *reasoning* safe to embed in a `[STEP]` log line.

    The text is flattened to one line, occurrences of ` reward=`, ` done=` and
    ` error=` have `=` replaced by `:` so they cannot be mistaken for the
    line's own fields, and the result is truncated to 160 characters.
    """
    flattened = _single_line(reasoning)
    for field in ("reward", "done", "error"):
        flattened = flattened.replace(f" {field}=", f" {field}:")
    return flattened[:160]
219
+
220
+
221
def _format_step_action(command: str, reasoning: str | None) -> str:
    """Build the `action=` value of a `[STEP]` line.

    The command is flattened to one line. When sanitized reasoning is
    non-empty it is appended as ` | reasoning=<text>`.
    """
    action = _single_line(command)
    note = _sanitize_reasoning_for_step(reasoning) if reasoning else ""
    if note:
        return f"{action} | reasoning={note}"
    return action
230
+
231
+
232
+ def _episode_score(rewards: list[float]) -> float:
233
+ # Score is terminal task progress signal and must stay normalized for evaluator checks.
234
+ if not rewards:
235
+ return 0.0
236
+ return max(0.0, min(1.0, float(rewards[-1])))
237
+
238
+
239
+ def _format_end_line(
240
+ *, success: bool, steps: int, score: float, rewards: list[float]
241
+ ) -> str:
242
+ rewards_csv = ",".join(f"{reward:.2f}" for reward in rewards)
243
+ return (
244
+ f"[END] success={_bool(success)} steps={steps} "
245
+ f"score={score:.2f} rewards={rewards_csv}"
246
+ )
247
+
248
def _task_symptom_block(task_name: TaskName) -> str:
    """Render the symptoms registered for *task_name* as a `- ` bullet list.

    Raises `KeyError` if the task has no entry in `TASK_SYMPTOMS`.
    """
    bullets = [f"- {symptom}" for symptom in TASK_SYMPTOMS[task_name]]
    return "\n".join(bullets)
250
+
251
+
252
def _attempt_history_block(attempt_history: list[dict[str, Any]]) -> str:
    """Render earlier attempts as one bullet line per step for the prompt.

    Each line always carries the step number, the command (max 120 chars) and
    the error (max 80 chars, or `none`). Reasoning (max 120 chars) and an
    output preview (max 140 chars) are appended only when non-empty. An empty
    history renders as `- none`.
    """
    if not attempt_history:
        return "- none"

    rendered: list[str] = []
    for attempt in attempt_history:
        command = _single_line(str(attempt["command"]))[:120]
        reasoning = _single_line(str(attempt.get("reasoning") or ""))[:120]
        output_preview = _single_line(str(attempt.get("output") or ""))[:140]
        error = attempt.get("error")
        error_text = _single_line(str(error))[:80] if error else "none"

        segments = [f"- step {attempt['step']}: command={command}", f"error={error_text}"]
        if reasoning:
            segments.append(f"reasoning={reasoning}")
        if output_preview:
            segments.append(f"output={output_preview}")
        rendered.append("; ".join(segments))

    return "\n".join(rendered)
271
+
272
+
273
def build_prompt(
    obs: Observation,
    step_num: int,
    task_name: TaskName,
    attempt_history: list[dict[str, Any]],
) -> str:
    """Build the user prompt for one step.

    The prompt contains, in order: the step header, the task's symptoms, the
    previous attempts, the current metrics, the service status, the latest
    command output (first 2000 characters), and the response-format
    instructions. Sections are separated by a blank line.
    """
    metrics = obs.metrics
    metric_lines = [
        "METRICS:",
        f"- Gateway success rate: {metrics.gateway_success_rate:.1%}",
        f"- Gateway P99 latency: {metrics.gateway_p99_latency_ms:.0f}ms",
        f"- Queue depth: {metrics.queue_depth}",
        f"- Worker restarts: {metrics.worker_restart_count}",
        f"- Consumer stall count: {metrics.consumer_stall_count}",
    ]
    sections = [
        f"Step {step_num}. Current system state:",
        f"TASK SYMPTOMS:\n{_task_symptom_block(task_name)}",
        f"PREVIOUS ATTEMPTS:\n{_attempt_history_block(attempt_history)}",
        "\n".join(metric_lines),
        f"SERVICE STATUS:\n{obs.process_status}",
        f"LATEST COMMAND OUTPUT:\n{obs.command_output[:2000]}",
        "Solve this over multiple steps as needed. For this step, return only the single next bash command.\n"
        'Respond with compact JSON where command is required: {"command":"<bash command>","reasoning":"optional concise reason"}.',
    ]
    return "\n\n".join(sections)
298
+
299
+
300
def _run_episode(
    client: Any, env: DistributedDebugEnvClient, task_name: TaskName
) -> None:
    """Run one task episode and emit its `[START]`, `[STEP]` and `[END]` lines.

    Each iteration builds a prompt from the latest observation, asks the model
    for the next command, sends it to the environment, and logs one `[STEP]`
    line. The loop ends when the environment reports `done` or the step
    budget is used up. The budget is the task's entry in `TASK_MAX_STEPS`,
    further capped by `MAX_STEPS_CAP` when that is positive.

    Any exception raised after the `[START]` line inside the loop is logged as
    an `[ERROR]` line and ends the episode; the `[END]` line is printed in all
    of those cases.

    Args:
        client: Chat client exposing `chat.completions.create(...)`.
        env: Environment client used for `reset` and `step`.
        task_name: Task to run.
    """
    messages: list[dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    rewards: list[float] = []

    done = False
    step = 0
    last_error: str | None = None
    attempt_history: list[dict[str, Any]] = []

    print(
        f"[START] task={task_name.value} env={BENCHMARK} model={MODEL_NAME}", flush=True
    )

    task_budget = TASK_MAX_STEPS[task_name]
    max_steps = min(task_budget, MAX_STEPS_CAP) if MAX_STEPS_CAP > 0 else task_budget
    try:
        obs = env.reset(task_name=task_name.value)
        while not done and step < max_steps:
            next_step = step + 1
            user_prompt = build_prompt(obs, next_step, task_name, attempt_history)
            messages.append({"role": "user", "content": user_prompt})

            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=TEMPERATURE,
                **_chat_token_limit_kwargs(),
            )

            raw_response = _assistant_message_text(completion.choices[0].message)
            command, reasoning = extract_action_payload(raw_response)
            if not command:
                # Keep the unusable reply in the transcript, remind the model
                # of the required format, and still spend this step by sending
                # the sentinel command to the environment.
                messages.append({"role": "assistant", "content": raw_response})
                messages.append(
                    {
                        "role": "user",
                        "content": (
                            "No command was provided. Respond with compact JSON where command is required: "
                            '{"command":"<bash command>","reasoning":"optional concise reason"}.'
                        ),
                    }
                )
                command = NO_COMMAND_PROVIDED_SENTINEL
                reasoning = None
            else:
                # Store the normalized payload rather than the raw reply.
                assistant_payload: dict[str, str] = {"command": command}
                if reasoning:
                    assistant_payload["reasoning"] = reasoning
                messages.append(
                    {"role": "assistant", "content": json.dumps(assistant_payload)}
                )

            result = env.step(Action(command=command))
            obs = result.observation
            rewards.append(result.reward)
            done = result.done

            error_value = result.info.get("error")
            # Treat missing, empty and the literal string "None" as "no error".
            last_error = None if error_value in (None, "", "None") else str(error_value)
            error_field = "null" if last_error is None else _single_line(last_error)
            attempt_history.append(
                {
                    "step": next_step,
                    "command": command,
                    "reasoning": reasoning,
                    "output": obs.command_output,
                    "error": last_error,
                }
            )

            # Two spaces after the tag: the README's stdout validator requires
            # `^\[STEP\]\s{2}step=...`.
            print(
                f"[STEP]  step={next_step} action={_format_step_action(command, reasoning)} "
                f"reward={result.reward:.2f} done={_bool(done)} error={error_field}",
                flush=True,
            )
            step = next_step

    except Exception as exc:
        last_error = str(exc)
        print(
            f"[ERROR] task={task_name.value} {type(exc).__name__}: {exc}",
            flush=True,
        )
    finally:
        score = _episode_score(rewards)
        # Success requires both a finished episode and a near-perfect score.
        success = bool(done and score >= 0.95)
        print(
            _format_end_line(success=success, steps=step, score=score, rewards=rewards),
            flush=True,
        )
392
+
393
+
394
def main() -> None:
    """Run every selected task once against the environment.

    Raises:
        RuntimeError: if neither `HF_TOKEN` nor `API_KEY` is set.
    """
    if not API_KEY:
        raise RuntimeError("HF_TOKEN (or API_KEY) must be set")

    selected_tasks = _parse_tasks()

    # Imported here so the module can be imported without the openai package.
    from openai import OpenAI

    llm_settings = {
        "api_key": API_KEY,
        "base_url": API_BASE_URL,
        "timeout": 30.0,
        "max_retries": 2,
    }
    llm_client = OpenAI(**llm_settings)
    environment = DistributedDebugEnvClient(base_url=ENV_URL)

    try:
        for selected in selected_tasks:
            _run_episode(llm_client, environment, selected)
    finally:
        # Close the HTTP client even if an episode raises.
        environment.close()
415
+
416
+
417
+ if __name__ == "__main__":
418
+ main()
mesh/auth/bun.lock ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lockfileVersion": 1,
3
+ "configVersion": 1,
4
+ "workspaces": {
5
+ "": {
6
+ "name": "auth",
7
+ "devDependencies": {
8
+ "bun-types": "latest",
9
+ },
10
+ },
11
+ },
12
+ "packages": {
13
+ "@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="],
14
+
15
+ "bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
16
+
17
+ "undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
18
+ }
19
+ }
mesh/auth/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "delay_ms": 200
3
+ }
mesh/auth/index.ts ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { existsSync } from "node:fs";
2
+
3
+ type AuthConfig = {
4
+ delay_ms: number;
5
+ };
6
+
7
+ const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
8
+ const CONFIG_PATH = `${MESH_ROOT}/auth/config.json`;
9
+ const CURRENT_TASK_PATH = "/tmp/current_task";
10
+ const PORT = 3001;
11
+
12
+ let config: AuthConfig;
13
+
14
+ const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
15
+
16
// Read and parse the JSON file at `path`; return `fallback` if reading or
// parsing fails for any reason.
const readJson = async <T>(path: string, fallback: T): Promise<T> => {
  let parsed: T = fallback;
  try {
    parsed = (await Bun.file(path).json()) as T;
  } catch {
    // Keep the fallback.
  }
  return parsed;
};
23
+
24
// Return the trimmed contents of the current-task marker file, or "unknown"
// when the file is missing, unreadable, or blank.
const currentPattern = async (): Promise<string> => {
  if (existsSync(CURRENT_TASK_PATH)) {
    try {
      const contents = await Bun.file(CURRENT_TASK_PATH).text();
      const taskName = contents.trim();
      if (taskName) return taskName;
    } catch {
      // Fall through to the default below.
    }
  }
  return "unknown";
};
32
+
33
// Write one JSON log line to stdout. Fixed fields come first (ts, level,
// service, event, pattern); keys in `details` are spread last and therefore
// override a fixed field of the same name.
const log = async (event: string, level: "INFO" | "WARN" | "ERROR", details: Record<string, unknown> = {}) => {
  // Take the timestamp before awaiting the pattern lookup.
  const ts = new Date().toISOString();
  const pattern = await currentPattern();
  const entry = { ts, level, service: "auth", event, pattern, ...details };
  console.log(JSON.stringify(entry));
};
45
+
46
// (Re)load the auth config from disk into the module-level `config`,
// falling back to a 200 ms delay when the file cannot be read or parsed.
const loadConfig = async () => {
  const defaults: AuthConfig = { delay_ms: 200 };
  config = await readJson<AuthConfig>(CONFIG_PATH, defaults);
};
49
+
50
+ await loadConfig();
51
+
52
+ process.on("SIGHUP", async () => {
53
+ await loadConfig();
54
+ await log("config_reloaded", "INFO", { config });
55
+ });
56
+
57
// HTTP server for the auth service:
//   GET  /health -> status plus the currently configured delay
//   POST /verify -> waits `delay_ms`, then reports success and elapsed time
// Anything else gets a 404.
Bun.serve({
  port: PORT,
  async fetch(req) {
    const { method } = req;
    const { pathname } = new URL(req.url);

    const isHealthCheck = method === "GET" && pathname === "/health";
    if (isHealthCheck) {
      return Response.json({ status: "ok", delay_ms: config.delay_ms });
    }

    const isVerify = method === "POST" && pathname === "/verify";
    if (!isVerify) {
      return new Response("not found", { status: 404 });
    }

    // `config` is re-read at each use because SIGHUP may reload it mid-request.
    const startedAt = Date.now();
    await log("verify_start", "INFO", { delay_ms: config.delay_ms });

    // Negative delays are treated as zero.
    await sleep(Math.max(0, config.delay_ms));

    const elapsedMs = Date.now() - startedAt;
    await log("verify_complete", "INFO", {
      delay_ms: config.delay_ms,
      elapsed_ms: elapsedMs,
    });

    return Response.json({ verified: true, elapsed_ms: elapsedMs });
  },
});
mesh/auth/package.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "auth",
3
+ "version": "1.0.0",
4
+ "scripts": {
5
+ "start": "bun run index.ts"
6
+ },
7
+ "devDependencies": {
8
+ "bun-types": "latest"
9
+ }
10
+ }
mesh/auth/tsconfig.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "ESNext",
5
+ "moduleResolution": "Bundler",
6
+ "strict": true,
7
+ "skipLibCheck": true,
8
+ "types": ["bun-types"]
9
+ }
10
+ }
mesh/gateway/blocked_routes.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "blocked": []
3
+ }
mesh/gateway/bun.lock ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lockfileVersion": 1,
3
+ "configVersion": 1,
4
+ "workspaces": {
5
+ "": {
6
+ "name": "gateway",
7
+ "dependencies": {
8
+ "ioredis": "^5.4.1",
9
+ },
10
+ "devDependencies": {
11
+ "bun-types": "latest",
12
+ },
13
+ },
14
+ },
15
+ "packages": {
16
+ "@ioredis/commands": ["@ioredis/commands@1.5.1", "", {}, "sha512-JH8ZL/ywcJyR9MmJ5BNqZllXNZQqQbnVZOqpPQqE1vHiFgAw4NHbvE0FOduNU8IX9babitBT46571OnPTT0Zcw=="],
17
+
18
+ "@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="],
19
+
20
+ "bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
21
+
22
+ "cluster-key-slot": ["cluster-key-slot@1.1.2", "", {}, "sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA=="],
23
+
24
+ "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
25
+
26
+ "denque": ["denque@2.1.0", "", {}, "sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw=="],
27
+
28
+ "ioredis": ["ioredis@5.10.1", "", { "dependencies": { "@ioredis/commands": "1.5.1", "cluster-key-slot": "^1.1.0", "debug": "^4.3.4", "denque": "^2.1.0", "lodash.defaults": "^4.2.0", "lodash.isarguments": "^3.1.0", "redis-errors": "^1.2.0", "redis-parser": "^3.0.0", "standard-as-callback": "^2.1.0" } }, "sha512-HuEDBTI70aYdx1v6U97SbNx9F1+svQKBDo30o0b9fw055LMepzpOOd0Ccg9Q6tbqmBSJaMuY0fB7yw9/vjBYCA=="],
29
+
30
+ "lodash.defaults": ["lodash.defaults@4.2.0", "", {}, "sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ=="],
31
+
32
+ "lodash.isarguments": ["lodash.isarguments@3.1.0", "", {}, "sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg=="],
33
+
34
+ "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
35
+
36
+ "redis-errors": ["redis-errors@1.2.0", "", {}, "sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w=="],
37
+
38
+ "redis-parser": ["redis-parser@3.0.0", "", { "dependencies": { "redis-errors": "^1.0.0" } }, "sha512-DJnGAeenTdpMEH6uAJRK/uiyEIH9WVsUmoLwzudwGJUwZPp80PDBWPHXSAGNPwNvIXAbe7MSUB1zQFugFml66A=="],
39
+
40
+ "standard-as-callback": ["standard-as-callback@2.1.0", "", {}, "sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A=="],
41
+
42
+ "undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
43
+ }
44
+ }
mesh/gateway/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "auth_timeout_ms": 500
3
+ }
mesh/gateway/index.ts ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { existsSync } from "node:fs";
2
+
3
+ import Redis from "ioredis";
4
+
5
/** Gateway tunables; `auth_timeout_ms` bounds the call to the auth service. */
interface GatewayConfig {
  auth_timeout_ms: number;
}

/** Address record for one service entry in the registry file. */
interface ServiceEndpoint<Port = number> {
  host: string;
  port: Port;
  protocol: string;
}

/** Shape of the service registry file read by the gateway. */
interface Registry {
  services: {
    auth: ServiceEndpoint;
    redis: ServiceEndpoint;
    // The worker entry may carry a null port.
    worker: ServiceEndpoint<number | null>;
  };
}

/** Route policy file: a list of blocked route names such as "gateway->redis". */
interface BlockedRoutes {
  blocked: string[];
}
20
+
21
// Root directory of the mesh files; overridable through the MESH_ROOT env var.
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";

// Files the gateway reads at runtime.
const CONFIG_PATH = MESH_ROOT + "/gateway/config.json";
const BLOCKED_ROUTES_PATH = MESH_ROOT + "/gateway/blocked_routes.json";
const REGISTRY_PATH = MESH_ROOT + "/registry.json";

// File whose contents are attached to every log line as `pattern`.
const CURRENT_TASK_PATH = "/tmp/current_task";

// Port the gateway HTTP server listens on.
const PORT = 3000;

// Runtime state, (re)assigned by loadRuntimeState().
let config: GatewayConfig;
let registry: Registry;
let redisClient: Redis;

// Rolling windows over the most recent /process requests (at most WINDOW_SIZE each):
// 1/0 per request outcome, and per-request elapsed milliseconds.
const WINDOW_SIZE = 20;
const successWindow: number[] = [];
const latencyWindow: number[] = [];
36
/**
 * Parses the JSON file at `path`, returning `fallback` if reading or parsing
 * fails for any reason. The result is cast to `T`; no schema check is done.
 */
async function readJson<T>(path: string, fallback: T): Promise<T> {
  let value: T = fallback;
  try {
    value = (await Bun.file(path).json()) as T;
  } catch {
    value = fallback;
  }
  return value;
}
43
+
44
/**
 * Reads the label stored in CURRENT_TASK_PATH so log lines can carry it.
 * Returns "unknown" if the file is missing, unreadable, or blank.
 */
const currentPattern = async (): Promise<string> => {
  const fallback = "unknown";
  if (!existsSync(CURRENT_TASK_PATH)) return fallback;

  try {
    const raw = await Bun.file(CURRENT_TASK_PATH).text();
    const trimmed = raw.trim();
    return trimmed === "" ? fallback : trimmed;
  } catch {
    return fallback;
  }
};
52
+
53
/**
 * Prints one JSON log line to stdout for the gateway service.
 *
 * Caller-supplied `details` are merged last and therefore win over the
 * standard fields (`ts`, `level`, `service`, `event`, `pattern`).
 */
async function log(event: string, level: "INFO" | "WARN" | "ERROR", details: Record<string, unknown> = {}) {
  // Timestamp first, so it is not delayed by the pattern file lookup.
  const ts = new Date().toISOString();
  const pattern = await currentPattern();
  console.log(JSON.stringify({ ts, level, service: "gateway", event, pattern, ...details }));
}
64
+
65
/**
 * (Re)loads the gateway config and the service registry from disk, then
 * replaces the Redis client with one pointed at the registry's redis entry.
 *
 * Either file falling back (missing or malformed) yields the built-in
 * defaults: a 500 ms auth timeout and localhost endpoints.
 */
async function loadRuntimeState() {
  const fallbackConfig: GatewayConfig = { auth_timeout_ms: 500 };
  const fallbackRegistry: Registry = {
    services: {
      auth: { host: "localhost", port: 3001, protocol: "http" },
      redis: { host: "localhost", port: 6379, protocol: "tcp" },
      worker: { host: "localhost", port: null, protocol: "internal" },
    },
  };

  config = await readJson<GatewayConfig>(CONFIG_PATH, fallbackConfig);
  registry = await readJson<Registry>(REGISTRY_PATH, fallbackRegistry);

  // Drop the previous connection, if any, before dialing the (possibly changed) address.
  redisClient?.disconnect();

  const { host, port } = registry.services.redis;
  redisClient = new Redis({
    host,
    port,
    maxRetriesPerRequest: 1,
    lazyConnect: false,
  });
}
86
+
87
/**
 * Issues `fetch(url, init)` and aborts it if it has not settled within
 * `timeoutMs` milliseconds.
 *
 * Any `signal` present in `init` is replaced by the internal abort signal.
 * The timer is cleared as soon as `fetch` settles, whether it resolved or
 * rejected.
 */
async function fetchWithTimeout(url: string, init: RequestInit, timeoutMs: number): Promise<Response> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), timeoutMs);
  try {
    const response = await fetch(url, { ...init, signal: abort.signal });
    return response;
  } finally {
    clearTimeout(deadline);
  }
}
100
+
101
/**
 * Returns the element at index `floor(0.99 * n)` of the ascending-sorted
 * values, clamped to the last element; returns 0 for an empty input.
 * The input array is not modified.
 */
function percentile99(values: number[]): number {
  const count = values.length;
  if (count === 0) return 0;

  const ascending = values.slice().sort((left, right) => left - right);
  const rank = Math.floor(0.99 * count);
  return ascending[rank < count ? rank : count - 1];
}
107
+
108
/**
 * Appends one request result to the rolling windows (1 for success, 0 for
 * failure, plus its latency) and evicts the oldest sample from any window
 * that has grown beyond WINDOW_SIZE.
 */
function recordOutcome(success: boolean, elapsedMs: number) {
  successWindow.push(Number(success));
  latencyWindow.push(elapsedMs);

  for (const samples of [successWindow, latencyWindow]) {
    if (samples.length > WINDOW_SIZE) {
      samples.shift();
    }
  }
}
119
+
120
/**
 * Fraction of successful requests in the rolling window.
 * Reports 1 while no requests have been recorded yet.
 */
function getSuccessRate(): number {
  const samples = successWindow.length;
  if (samples === 0) return 1;

  let successes = 0;
  for (const outcome of successWindow) {
    successes += outcome;
  }
  return successes / samples;
}
125
+
126
// Load config, registry and the Redis client once before serving.
await loadRuntimeState();

// SIGHUP re-reads config and registry, rebuilds the Redis client, and logs the new config.
const reloadOnHangup = async () => {
  await loadRuntimeState();
  await log("config_reloaded", "INFO", { config });
};
process.on("SIGHUP", reloadOnHangup);
132
+
133
// HTTP surface of the gateway:
//   GET  /health  -> rolling success rate and p99 latency of recent /process calls
//   POST /process -> route-policy check, auth verification (bounded by
//                    config.auth_timeout_ms), then enqueue of one job on "job_queue"
// Anything else is answered with a plain-text 404.
//
// Every /process request records exactly one outcome in the rolling windows.
Bun.serve({
  port: PORT,
  async fetch(req) {
    const url = new URL(req.url);

    if (req.method === "GET" && url.pathname === "/health") {
      return Response.json({
        status: "ok",
        success_rate: getSuccessRate(),
        p99_latency_ms: percentile99(latencyWindow),
      });
    }

    if (req.method === "POST" && url.pathname === "/process") {
      const start = Date.now();

      try {
        // The policy file is re-read on every request, so edits apply without a reload.
        const blockedRoutes = await readJson<BlockedRoutes>(BLOCKED_ROUTES_PATH, { blocked: [] });
        if (blockedRoutes.blocked.includes("gateway->redis")) {
          await log("route_blocked", "ERROR", {
            route: "gateway->redis",
            policy_file: BLOCKED_ROUTES_PATH,
          });
          throw new Error("redis_unreachable");
        }

        const authUrl = `http://${registry.services.auth.host}:${registry.services.auth.port}/verify`;
        const authResponse = await fetchWithTimeout(
          authUrl,
          {
            method: "POST",
            headers: { "content-type": "application/json" },
            body: JSON.stringify({ token: "allow" }),
          },
          config.auth_timeout_ms,
        );
        if (!authResponse.ok) {
          throw new Error(`auth_status_${authResponse.status}`);
        }

        const job = JSON.stringify({ id: crypto.randomUUID(), payload: "ok" });
        await redisClient.rpush("job_queue", job);

        const elapsedMs = Date.now() - start;
        recordOutcome(true, elapsedMs);

        // The job is already enqueued and counted as a success. The queue depth is
        // only a log hint, so a failing LLEN must not fall into the catch below,
        // where it would record a second (failed) outcome and return 500.
        let queueDepthHint: number | null = null;
        try {
          queueDepthHint = await redisClient.llen("job_queue");
        } catch {
          queueDepthHint = null;
        }

        await log("request_complete", "INFO", {
          elapsed_ms: elapsedMs,
          upstream: "auth",
          queue_depth_hint: queueDepthHint,
        });

        return Response.json({ ok: true, elapsed_ms: elapsedMs });
      } catch (error) {
        const elapsedMs = Date.now() - start;
        recordOutcome(false, elapsedMs);

        const reason = error instanceof Error ? error.message : String(error);
        // An aborted fetch is identified by its error *name*; the message-based
        // checks are kept so previously matched failures are still reported.
        const abortedByName = error instanceof Error && error.name === "AbortError";
        if (abortedByName || reason === "AbortError" || reason.includes("aborted") || reason.includes("timeout")) {
          await log("upstream_timeout", "ERROR", {
            elapsed_ms: elapsedMs,
            upstream: "auth",
            threshold_ms: config.auth_timeout_ms,
          });
        }

        await log("request_failed", "ERROR", {
          path: "/process",
          status: 500,
          reason,
        });

        return new Response(JSON.stringify({ error: reason }), {
          status: 500,
          headers: { "content-type": "application/json" },
        });
      }
    }

    return new Response("not found", { status: 404 });
  },
});
mesh/gateway/package.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "gateway",
3
+ "version": "1.0.0",
4
+ "scripts": {
5
+ "start": "bun run index.ts"
6
+ },
7
+ "dependencies": {
8
+ "ioredis": "^5.4.1"
9
+ },
10
+ "devDependencies": {
11
+ "bun-types": "latest"
12
+ }
13
+ }
mesh/gateway/tsconfig.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "ESNext",
5
+ "moduleResolution": "Bundler",
6
+ "strict": true,
7
+ "skipLibCheck": true,
8
+ "types": ["bun-types"]
9
+ }
10
+ }
mesh/registry.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "services": {
3
+ "auth": {
4
+ "host": "localhost",
5
+ "port": 3001,
6
+ "protocol": "http"
7
+ },
8
+ "redis": {
9
+ "host": "localhost",
10
+ "port": 6379,
11
+ "protocol": "tcp"
12
+ },
13
+ "worker": {
14
+ "host": "localhost",
15
+ "port": null,
16
+ "protocol": "internal"
17
+ }
18
+ }
19
+ }
mesh/worker/bun.lock ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lockfileVersion": 1,
3
+ "configVersion": 1,
4
+ "workspaces": {
5
+ "": {
6
+ "name": "worker",
7
+ "dependencies": {
8
+ "ioredis": "^5.4.1",
9
+ },
10
+ "devDependencies": {
11
+ "bun-types": "latest",
12
+ },
13
+ },
14
+ },
15
+ "packages": {
16
+ "@ioredis/commands": ["@ioredis/commands@1.5.1", "", {}, "sha512-JH8ZL/ywcJyR9MmJ5BNqZllXNZQqQbnVZOqpPQqE1vHiFgAw4NHbvE0FOduNU8IX9babitBT46571OnPTT0Zcw=="],
17
+
18
+ "@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="],
19
+
20
+ "bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
21
+
22
+ "cluster-key-slot": ["cluster-key-slot@1.1.2", "", {}, "sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA=="],
23
+
24
+ "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
25
+
26
+ "denque": ["denque@2.1.0", "", {}, "sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw=="],
27
+
28
+ "ioredis": ["ioredis@5.10.1", "", { "dependencies": { "@ioredis/commands": "1.5.1", "cluster-key-slot": "^1.1.0", "debug": "^4.3.4", "denque": "^2.1.0", "lodash.defaults": "^4.2.0", "lodash.isarguments": "^3.1.0", "redis-errors": "^1.2.0", "redis-parser": "^3.0.0", "standard-as-callback": "^2.1.0" } }, "sha512-HuEDBTI70aYdx1v6U97SbNx9F1+svQKBDo30o0b9fw055LMepzpOOd0Ccg9Q6tbqmBSJaMuY0fB7yw9/vjBYCA=="],
29
+
30
+ "lodash.defaults": ["lodash.defaults@4.2.0", "", {}, "sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ=="],
31
+
32
+ "lodash.isarguments": ["lodash.isarguments@3.1.0", "", {}, "sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg=="],
33
+
34
+ "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
35
+
36
+ "redis-errors": ["redis-errors@1.2.0", "", {}, "sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w=="],
37
+
38
+ "redis-parser": ["redis-parser@3.0.0", "", { "dependencies": { "redis-errors": "^1.0.0" } }, "sha512-DJnGAeenTdpMEH6uAJRK/uiyEIH9WVsUmoLwzudwGJUwZPp80PDBWPHXSAGNPwNvIXAbe7MSUB1zQFugFml66A=="],
39
+
40
+ "standard-as-callback": ["standard-as-callback@2.1.0", "", {}, "sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A=="],
41
+
42
+ "undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
43
+ }
44
+ }
mesh/worker/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "db_pool_size": 10,
3
+ "db_write_delay_ms": 0
4
+ }
mesh/worker/index.ts ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { existsSync } from "node:fs";
2
+
3
+ import Redis from "ioredis";
4
+ import { Database } from "bun:sqlite";
5
+
6
/** Worker tunables loaded from the worker config file. */
interface WorkerConfig {
  // Multiplier used when estimating the worker's processing rate.
  db_pool_size: number;
  // Artificial delay, in milliseconds, applied before each database write.
  db_write_delay_ms: number;
}
10
+
11
// Root directory of the mesh files; overridable through the MESH_ROOT env var.
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
const CONFIG_PATH = MESH_ROOT + "/worker/config.json";

// File whose contents are attached to every log line as `pattern`.
const CURRENT_TASK_PATH = "/tmp/current_task";

// Redis key used as the mutual-exclusion lock around one dequeue/process cycle.
const LOCK_KEY = "LOCK:job_processor";

// Enqueue rate (jobs per second) the worker compares its estimated throughput
// against; taken from the environment, 3.0 when unset.
const ENQUEUE_RATE_PER_S = Number(process.env.ENQUEUE_RATE_PER_S || "3.0");

// Mutable runtime state.
let config: WorkerConfig; // assigned by loadConfig()
let running = true; // cleared by SIGTERM / SIGINT to stop the loop
let backoffMs = 1000; // current delay after a parse failure

const redis = new Redis({ host: "localhost", port: 6379, maxRetriesPerRequest: 1 });

// Local SQLite store for processed jobs; the table is created on first start.
const db = new Database("/tmp/worker_jobs.sqlite");
db.exec(
  "CREATE TABLE IF NOT EXISTS jobs (id TEXT PRIMARY KEY, payload TEXT, created_at TEXT DEFAULT CURRENT_TIMESTAMP)",
);
27
+
28
/** Resolves after `ms` milliseconds. */
function sleep(ms: number) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
29
+
30
/**
 * Reads and JSON-parses the file at `path`; on any error the supplied
 * `fallback` is returned. The parsed value is cast to `T` unchecked.
 */
async function readJson<T>(path: string, fallback: T): Promise<T> {
  let parsed: unknown;
  try {
    parsed = await Bun.file(path).json();
  } catch {
    return fallback;
  }
  return parsed as T;
}
37
+
38
/**
 * Returns the trimmed contents of CURRENT_TASK_PATH for tagging log lines,
 * or "unknown" when the file is missing, unreadable, or blank.
 */
async function currentPattern(): Promise<string> {
  if (existsSync(CURRENT_TASK_PATH)) {
    try {
      const label = (await Bun.file(CURRENT_TASK_PATH).text()).trim();
      if (label.length > 0) return label;
    } catch {
      // Unreadable file: use the default label below.
    }
  }
  return "unknown";
}
46
+
47
+
48
/**
 * Increments the integer stored as text in the file at `path` and returns
 * the new value.
 *
 * A missing, unreadable, or non-numeric file is treated as 0, so the first
 * increment yields 1.
 */
async function incrementCounter(path: string): Promise<number> {
  let stored = "";
  try {
    stored = await Bun.file(path).text();
  } catch {
    stored = "";
  }

  const next = (Number(stored) || 0) + 1;
  await Bun.write(path, String(next));
  return next;
}
60
+
61
/**
 * Prints one JSON log line to stdout for the worker service.
 *
 * `details` is merged last, so its keys override the standard fields
 * (`ts`, `level`, `service`, `event`, `pattern`) on collision.
 */
async function log(event: string, level: "INFO" | "WARN" | "ERROR", details: Record<string, unknown> = {}) {
  // Capture the timestamp before awaiting the pattern lookup.
  const ts = new Date().toISOString();
  const pattern = await currentPattern();
  const entry = { ts, level, service: "worker", event, pattern, ...details };
  console.log(JSON.stringify(entry));
}
73
+
74
/**
 * Refreshes the module-level `config` from CONFIG_PATH.
 * Falls back to a pool size of 10 and no write delay when the file cannot
 * be read or parsed.
 */
async function loadConfig() {
  const defaults: WorkerConfig = { db_pool_size: 10, db_write_delay_ms: 0 };
  config = await readJson<WorkerConfig>(CONFIG_PATH, defaults);
}
80
+
81
/**
 * Estimates jobs processed per second from the current config.
 *
 * Each job is assumed to cost the configured write delay plus 120 ms
 * (never less than 50 ms); the per-slot rate is scaled by `db_pool_size`.
 */
function estimatedProcessingRate(): number {
  const { db_pool_size: poolSize, db_write_delay_ms: writeDelayMs } = config;
  const unitCostMs = Math.max(50, writeDelayMs + 120);
  const jobsPerSecondPerSlot = 1000 / unitCostMs;
  return poolSize * jobsPerSecondPerSlot;
}
85
+
86
/**
 * Main consumer loop. Runs until `running` is cleared.
 *
 * Each iteration:
 *   1. tries to take LOCK_KEY (SET NX with a 30 s expiry); on failure it bumps
 *      the stall counter, logs, and retries after 120 ms;
 *   2. pops one message from the head of "job_queue" (idle for 120 ms if empty);
 *   3. parses it. An unparseable message is pushed back onto the head of the
 *      queue, the restart counter is bumped, and the loop backs off
 *      (doubling, capped at 10 s);
 *   4. applies the configured write delay, stores the job in SQLite, logs
 *      throughput information, releases the lock and pauses 80 ms.
 *
 * Any unexpected error is logged as "loop_error". If this iteration had
 * acquired the lock, it is released before the 250 ms pause, so a failing DB
 * write or Redis call does not leave the worker locked out until the TTL expires.
 */
const processLoop = async () => {
  while (running) {
    // True only between a successful acquire and the matching release in this iteration.
    let holdsLock = false;

    try {
      const acquired = await redis.set(LOCK_KEY, String(process.pid), "EX", 30, "NX");
      if (!acquired) {
        const stallCount = await incrementCounter("/tmp/consumer_stall_count");
        await log("lock_acquire_failed", "WARN", {
          lock_key: LOCK_KEY,
          stall_count: stallCount,
        });
        await sleep(120);
        continue;
      }
      holdsLock = true;

      const raw = await redis.lpop("job_queue");
      if (!raw) {
        await redis.del(LOCK_KEY);
        holdsLock = false;
        await sleep(120);
        continue;
      }

      let parsed: { id?: string; payload?: unknown };
      try {
        parsed = JSON.parse(raw) as { id?: string; payload?: unknown };
      } catch (error) {
        const restartCount = await incrementCounter("/tmp/worker_restart_count");
        await log("job_dequeued", "INFO", { raw });
        await log("parse_failed", "ERROR", {
          error: error instanceof Error ? error.message : String(error),
          raw,
        });
        await log("consumer_backoff", "WARN", {
          restart_count: restartCount,
          backoff_ms: backoffMs,
        });

        // The message goes back to the head of the queue, so it is the next one popped.
        await redis.lpush("job_queue", raw);
        await redis.del(LOCK_KEY);
        holdsLock = false;
        await sleep(backoffMs);
        backoffMs = Math.min(10000, backoffMs * 2);
        continue;
      }

      const start = Date.now();
      if (config.db_write_delay_ms > 0) {
        await sleep(config.db_write_delay_ms);
      }

      db.query("INSERT OR REPLACE INTO jobs (id, payload) VALUES (?, ?)").run(
        parsed.id || crypto.randomUUID(),
        JSON.stringify(parsed.payload ?? null),
      );

      // A successful write resets the parse-failure backoff.
      backoffMs = 1000;
      const elapsedMs = Date.now() - start;
      const queueDepth = Number(await redis.llen("job_queue"));
      await log("db_write_complete", "INFO", {
        elapsed_ms: elapsedMs,
        pool_size: config.db_pool_size,
      });

      const processingRate = estimatedProcessingRate();
      if (processingRate < ENQUEUE_RATE_PER_S || queueDepth > 10) {
        await log("throughput_lag", "WARN", {
          processing_rate_per_s: Number(processingRate.toFixed(2)),
          enqueue_rate_per_s: ENQUEUE_RATE_PER_S,
          queue_depth: queueDepth,
        });
      }

      await log("job_processed", "INFO", {
        job_id: parsed.id || null,
        queue_depth: queueDepth,
      });

      await redis.del(LOCK_KEY);
      holdsLock = false;
      await sleep(80);
    } catch (error) {
      await log("loop_error", "ERROR", {
        error: error instanceof Error ? error.message : String(error),
      });

      if (holdsLock) {
        try {
          await redis.del(LOCK_KEY);
        } catch {
          // Redis is unreachable; the 30 s expiry on the lock will clear it.
        }
      }

      await sleep(250);
    }
  }
};
171
+
172
// Seed the counter files only when they are absent, then publish this process's pid.
for (const counterFile of ["/tmp/worker_restart_count", "/tmp/consumer_stall_count"]) {
  if (!existsSync(counterFile)) await Bun.write(counterFile, "0");
}
await Bun.write("/tmp/worker.pid", String(process.pid));

await loadConfig();

// SIGHUP re-reads the worker config in place and logs the values now in effect.
process.on("SIGHUP", async () => {
  await loadConfig();
  await log("config_reloaded", "INFO", { config });
});

// SIGTERM and SIGINT both ask the loop to stop after its current iteration.
const requestStop = () => {
  running = false;
};
process.on("SIGTERM", requestStop);
process.on("SIGINT", requestStop);

await processLoop();

// Shutdown: prefer a graceful QUIT, fall back to dropping the connection.
try {
  await redis.quit();
} catch {
  redis.disconnect();
}
db.close();
mesh/worker/job_generator.ts ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import Redis from "ioredis";
2
+
3
/** Generator tunables; `interval_ms` is the pause between two enqueued jobs. */
interface JobGeneratorConfig {
  interval_ms: number;
}

const redis = new Redis({ host: "localhost", port: 6379, maxRetriesPerRequest: 1 });

// Root directory of the mesh files; overridable through the MESH_ROOT env var.
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
const CONFIG_PATH = MESH_ROOT + "/worker/job_generator_config.json";

// Mutable runtime state.
let running = true; // cleared by SIGTERM / SIGINT to stop the loop
let intervalMs = 333; // replaced by loadConfig()
13
+
14
/** Resolves after roughly `ms` milliseconds. */
function sleep(ms: number) {
  return new Promise((done) => setTimeout(done, ms));
}
15
+
16
/**
 * Loads the JSON document at `path`, or returns `fallback` if it cannot be
 * read or parsed. The result is cast to `T` without validation.
 */
async function readJson<T>(path: string, fallback: T): Promise<T> {
  let result: T = fallback;
  try {
    result = (await Bun.file(path).json()) as T;
  } catch {
    result = fallback;
  }
  return result;
}
23
+
24
/**
 * Refreshes `intervalMs` from CONFIG_PATH.
 *
 * A missing file, or an `interval_ms` that is not a non-zero number, yields
 * 333 ms; the result is never allowed below 10 ms.
 */
async function loadConfig() {
  const defaults: JobGeneratorConfig = { interval_ms: 333 };
  const loaded = await readJson<JobGeneratorConfig>(CONFIG_PATH, defaults);
  const requested = Number(loaded.interval_ms) || 333;
  intervalMs = Math.max(10, requested);
}
28
+
29
/**
 * Producer loop: until `running` is cleared, appends one freshly generated
 * job to the tail of "job_queue", logs the result as a JSON line, and then
 * waits `intervalMs` before the next job.
 *
 * A failed enqueue is logged and does not stop the loop.
 */
async function loop() {
  // Prints one JSON log line for this service; `extra` keys come after the standard ones.
  const emit = (level: "INFO" | "ERROR", event: string, extra: Record<string, unknown> = {}) => {
    console.log(
      JSON.stringify({
        ts: new Date().toISOString(),
        level,
        service: "job_generator",
        event,
        ...extra,
      }),
    );
  };

  while (running) {
    const payload = { kind: "normal", ts: new Date().toISOString() };
    const job = JSON.stringify({ id: crypto.randomUUID(), payload });

    try {
      await redis.rpush("job_queue", job);
      emit("INFO", "job_enqueued");
    } catch (error) {
      emit("ERROR", "enqueue_failed", {
        error: error instanceof Error ? error.message : String(error),
      });
    }

    await sleep(intervalMs);
  }
}
64
+
65
await loadConfig();

// SIGHUP re-reads the interval from disk and logs the value now in effect.
process.on("SIGHUP", async () => {
  await loadConfig();
  const entry = {
    ts: new Date().toISOString(),
    level: "INFO",
    service: "job_generator",
    event: "config_reloaded",
    interval_ms: intervalMs,
  };
  console.log(JSON.stringify(entry));
});

// SIGTERM and SIGINT both ask the loop to stop after its current iteration.
const requestStop = () => {
  running = false;
};
process.on("SIGTERM", requestStop);
process.on("SIGINT", requestStop);

await loop();

// Shutdown: prefer a graceful QUIT, fall back to dropping the connection.
try {
  await redis.quit();
} catch {
  redis.disconnect();
}
mesh/worker/job_generator_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "interval_ms": 10
3
+ }
mesh/worker/package.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "worker",
3
+ "version": "1.0.0",
4
+ "scripts": {
5
+ "start": "bun run index.ts",
6
+ "job-generator": "bun run job_generator.ts"
7
+ },
8
+ "dependencies": {
9
+ "ioredis": "^5.4.1"
10
+ },
11
+ "devDependencies": {
12
+ "bun-types": "latest"
13
+ }
14
+ }
mesh/worker/tsconfig.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "ESNext",
5
+ "moduleResolution": "Bundler",
6
+ "strict": true,
7
+ "skipLibCheck": true,
8
+ "types": ["bun-types"]
9
+ }
10
+ }
openenv.yaml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: distributed-systems-debug-env
2
+ version: "1.0.0"
3
+ description: >
4
+ An RL environment for debugging a distributed job processing pipeline with
5
+ seven deterministic fault scenarios: cascading timeout, byzantine queue fault,
6
+ distributed lock starvation, backpressure cascade, route partition, registry corruption, and job generator runaway.
7
+ author: Team Rocket
8
+ tags:
9
+ - openenv
10
+ - distributed-systems
11
+ - debugging
12
+ - reinforcement-learning
13
+
14
+ reward_range: [0.0, 1.0]
15
+
16
+ tasks:
17
+ - name: cascading-timeout
18
+ description: Synchronous upstream delay exceeds gateway timeout.
19
+ difficulty: easy
20
+ max_steps: 15
21
+
22
+ - name: byzantine-queue-fault
23
+ description: Poison pill message causes worker parse crash-loop.
24
+ difficulty: medium
25
+ max_steps: 18
26
+
27
+ - name: distributed-lock-starvation
28
+ description: Stale distributed lock prevents queue consumption.
29
+ difficulty: hard
30
+ max_steps: 20
31
+
32
+ - name: backpressure-cascade
33
+ description: Consumer throughput lower than producer enqueue rate.
34
+ difficulty: hard
35
+ max_steps: 20
36
+
37
+ - name: route-partition
38
+ description: Route policy blocks gateway to redis communication.
39
+ difficulty: hard
40
+ max_steps: 20
41
+
42
+ - name: registry-corruption
43
+ description: Gateway reloads a corrupted auth registry entry and fails requests.
44
+ difficulty: medium
45
+ max_steps: 18
46
+
47
+ - name: job-generator-runaway
48
+ description: Runaway enqueue rate overwhelms worker throughput and grows backlog.
49
+ difficulty: hard
50
+ max_steps: 20
51
+
52
+ observation_space:
53
+ type: object
54
+ properties:
55
+ command_output:
56
+ type: string
57
+ metrics:
58
+ type: object
59
+ properties:
60
+ gateway_success_rate:
61
+ type: number
62
+ minimum: 0.0
63
+ maximum: 1.0
64
+ gateway_p99_latency_ms:
65
+ type: number
66
+ minimum: 0.0
67
+ queue_depth:
68
+ type: integer
69
+ minimum: 0
70
+ worker_restart_count:
71
+ type: integer
72
+ minimum: 0
73
+ consumer_stall_count:
74
+ type: integer
75
+ minimum: 0
76
+ process_status:
77
+ type: object
78
+ additionalProperties:
79
+ type: string
80
+
81
+ action_space:
82
+ type: object
83
+ properties:
84
+ command:
85
+ type: string
86
+ description: Single bash command executed in the debug sandbox.
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "distributed-systems-debug-env"
3
+ version = "1.0.0"
4
+ description = "OpenEnv distributed systems debugging environment"
5
+ readme = "README.md"
6
+ requires-python = "==3.12.*"
7
+ dependencies = [
8
+ "fastapi==0.115.0",
9
+ "uvicorn[standard]==0.30.0",
10
+ "pydantic>=2.11.0",
11
+ "openenv-core>=0.2.0",
12
+ "openai>=1.30.0",
13
+ "httpx>=0.27.0"
14
+ ]
15
+
16
+ [project.scripts]
17
+ server = "server.app:main"
18
+
19
+
20
+ [tool.pytest.ini_options]
21
+ testpaths = ["tests"]
22
+ pythonpath = ["."]
23
+
24
+ [build-system]
25
+ requires = ["setuptools>=68", "wheel"]
26
+ build-backend = "setuptools.build_meta"
27
+
28
+
29
+ [tool.setuptools.packages.find]
30
+ where = ["."]
31
+ include = ["server*"]
32
+ exclude = ["mesh*", "tests*"]
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.0
3
+ pydantic>=2.11.0
4
+ openenv-core>=0.2.0
5
+ openai>=1.30.0
6
+ httpx>=0.27.0
7
+ pytest>=8.0.0
server/__init__.py ADDED
File without changes
server/api.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from contextlib import asynccontextmanager
4
+
5
+ from fastapi import FastAPI, HTTPException
6
+
7
+ from .constants import TaskName
8
+ from .env import DistributedDebugEnv
9
+ from .models import Action, Observation, StepResult
10
+
11
+
12
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Own the debug environment for the lifetime of the application.

    A ``DistributedDebugEnv`` is created and started before the app begins
    serving, published on ``app.state.env`` for the route handlers, and
    closed when the app shuts down.
    """
    debug_env = DistributedDebugEnv()
    debug_env.start()
    app.state.env = debug_env
    try:
        yield
    finally:
        debug_env.close()


app = FastAPI(
    title="Distributed Systems Debug Environment",
    version="1.0.0",
    lifespan=lifespan,
)
24
+
25
+
26
@app.post("/reset", response_model=Observation)
async def reset(task_name: str | None = None) -> Observation:
    """Start a new episode for ``task_name`` and return its first observation.

    Validator and sample inference call /reset without task input, so a
    missing (or empty) ``task_name`` selects the cascading-timeout task to
    keep episode bootstrapping deterministic.

    Raises:
        HTTPException: 400 when ``task_name`` is not a known task; 500 when
            the environment fails to reset.
    """
    requested = task_name if task_name else TaskName.CASCADING_TIMEOUT.value
    try:
        task = TaskName.parse(requested)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        observation = app.state.env.reset(task_name=task)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return observation
41
+
42
+
43
@app.post("/step", response_model=StepResult)
async def step(action: Action) -> StepResult:
    """Apply ``action`` to the running environment and return the step result.

    Raises:
        HTTPException: 500 carrying the error text when the environment fails.
    """
    try:
        result = app.state.env.step(action)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return result
50
+
51
+
52
@app.get("/state")
async def state() -> dict:
    """Return the environment's current state as reported by ``env.state()``.

    Raises:
        HTTPException: 500 carrying the error text when the environment fails.
    """
    try:
        snapshot = app.state.env.state()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return snapshot
59
+
60
+
61
@app.get("/health")
async def health() -> dict:
    """Liveness probe; answers without touching the environment."""
    payload = {"status": "ok", "version": "1.0.0"}
    return payload
server/app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .api import app
2
+
3
+
4
def main() -> None:
    """Serve the FastAPI app with uvicorn on 0.0.0.0:8000."""
    # Imported lazily so importing this module does not require uvicorn.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    main()
server/constants.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+
3
+
4
class TaskName(str, Enum):
    """Identifiers of the fault scenarios the environment can run.

    Members are also ``str`` instances, so they compare equal to their
    hyphenated task names.
    """

    CASCADING_TIMEOUT = "cascading-timeout"
    BYZANTINE_QUEUE_FAULT = "byzantine-queue-fault"
    DISTRIBUTED_LOCK_STARVATION = "distributed-lock-starvation"
    BACKPRESSURE_CASCADE = "backpressure-cascade"
    ROUTE_PARTITION = "route-partition"
    REGISTRY_CORRUPTION = "registry-corruption"
    JOB_GENERATOR_RUNAWAY = "job-generator-runaway"

    @classmethod
    def parse(cls, value: str) -> "TaskName":
        """Return the member whose value is ``value``.

        Raises:
            ValueError: with the message ``Unknown task: <value>`` when no
                member matches.
        """
        try:
            member = cls(value)
        except ValueError as lookup_error:
            message = f"Unknown task: {value}"
            raise ValueError(message) from lookup_error
        return member
19
+
20
+
21
# Every task identifier as a plain string, in enum declaration order.
ALL_TASKS = [member.value for member in TaskName]

# Baseline default: easy + medium + hard (deadlock/starvation included).
DEFAULT_BASELINE_TASK_ENUMS = [
    TaskName.CASCADING_TIMEOUT,
    TaskName.BYZANTINE_QUEUE_FAULT,
    TaskName.DISTRIBUTED_LOCK_STARVATION,
]

# The same baseline selection, as plain task-name strings.
DEFAULT_BASELINE_TASKS = [member.value for member in DEFAULT_BASELINE_TASK_ENUMS]

# NOTE(review): presumably substituted when an action carries no command;
# confirm against the code that consumes it.
NO_COMMAND_PROVIDED_SENTINEL = "__NO_COMMAND_PROVIDED__"

# Step budget per task.
TASK_MAX_STEPS = {
    TaskName.CASCADING_TIMEOUT: 15,
    TaskName.BYZANTINE_QUEUE_FAULT: 18,
    TaskName.DISTRIBUTED_LOCK_STARVATION: 20,
    TaskName.BACKPRESSURE_CASCADE: 20,
    TaskName.ROUTE_PARTITION: 20,
    TaskName.REGISTRY_CORRUPTION: 18,
    TaskName.JOB_GENERATOR_RUNAWAY: 20,
}

# Default contents of each mesh configuration, keyed by component.
DEFAULT_CONFIGS = {
    "auth": {"delay_ms": 200},
    "gateway": {"auth_timeout_ms": 500},
    "worker": {"db_pool_size": 10, "db_write_delay_ms": 0},
    "job_generator": {"interval_ms": 333},
    "blocked_routes": {"blocked": []},
}
server/env.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import subprocess
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from .constants import (
9
+ DEFAULT_CONFIGS,
10
+ NO_COMMAND_PROVIDED_SENTINEL,
11
+ TASK_MAX_STEPS,
12
+ TaskName,
13
+ )
14
+ from .fault_injector import inject_fault
15
+ from .graders import grade_task
16
+ from .metrics_poller import MetricsPoller
17
+ from .models import Action, Observation, StepResult
18
+ from .process_manager import ProcessManager
19
+
20
+
21
class DistributedDebugEnv:
    """OpenEnv-compatible distributed systems debugging environment.

    ``reset`` restores the mesh config files, restarts the services and
    injects one fault; ``step`` runs a single shell command supplied by the
    agent, re-polls metrics, grades the system and returns a shaped reward.
    """

    # Grader score at or above which an episode counts as solved.
    _SUCCESS_THRESHOLD = 0.95

    # Substrings that cause a command to be refused before execution.
    # NOTE(review): plain substring matching is easy to bypass and also
    # rejects harmless commands such as "rm -rf /tmp/x"; kept as-is.
    _DANGEROUS_PATTERNS = (
        "rm -rf /",
        "kill -9 1",
        "pkill -f uvicorn",
        "> /tmp/gateway.log",
        "> /tmp/auth.log",
        "> /tmp/worker.log",
    )

    # Program / redis sub-command names that mark a command as read-only
    # investigation for the exploration bonus.
    _DIAGNOSTIC_KEYWORDS = frozenset(
        {
            "cat",
            "curl",
            "redis-cli",
            "ps",
            "ls",
            "grep",
            "tail",
            "jq",
            "lrange",
            "llen",
            "keys",
            "ttl",
            "get",
        }
    )

    # Substrings that mark a command as an attempt to change system state.
    _STATE_CHANGE_PATTERNS = (
        "kill -hup",
        "redis-cli del",
        "redis-cli lrem",
        "redis-cli set",
        "redis-cli flushdb",
        "echo '{",
        "> /mesh/",
        "tee /mesh/",
    )

    # Commands that do nothing useful and are penalised.
    _NOOP_COMMANDS = frozenset({"echo", "pwd", "whoami", "date", "true", "false"})

    # Shell punctuation turned into spaces before a command is split into words.
    _SHELL_PUNCTUATION = str.maketrans({ch: " " for ch in "|;&()<>`$'\""})

    def __init__(
        self, project_root: Path | None = None, mesh_root: Path | None = None
    ) -> None:
        """Resolve paths and build the helper objects.

        Args:
            project_root: Repository root; defaults to the parent of this
                file's directory.
            mesh_root: Directory holding the mesh config files; defaults to
                ``$MESH_ROOT`` or ``<project_root>/mesh``.
        """
        self.project_root = (
            project_root or Path(__file__).resolve().parent.parent
        ).resolve()
        self.mesh_root = (
            mesh_root or Path(os.getenv("MESH_ROOT", self.project_root / "mesh"))
        ).resolve()

        self._process_manager = ProcessManager(
            project_root=self.project_root, mesh_root=self.mesh_root
        )
        self._metrics_poller = MetricsPoller(poll_interval_s=2.0)

        self.current_task: TaskName | None = None
        self.max_steps: int = 0
        self.step_count: int = 0
        self.last_exit_code: int = 0
        self.prev_observation: Observation | None = None
        self._baselines: dict[str, int] = {
            "baseline_worker_restart_count": 0,
            "baseline_consumer_stall_count": 0,
        }
        self._seen_diagnostic_signatures: set[str] = set()
        self._command_counts: dict[str, int] = {}
        self._last_grader_score: float = 0.0

    def start(self) -> None:
        """Start the background metrics poller unless it is already running."""
        if not self._metrics_poller.is_alive():
            self._metrics_poller.start()

    def close(self) -> None:
        """Ask the background metrics poller to stop."""
        self._metrics_poller.stop()

    def _write_json(self, path: Path, payload: dict[str, Any]) -> None:
        """Write *payload* as indented JSON to *path*, creating parent dirs."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")

    def _load_json_object(self, path: Path) -> dict[str, Any] | None:
        """Return the JSON object stored at *path*, or ``None``.

        ``None`` covers a missing/unreadable file, invalid JSON and valid
        JSON whose top level is not an object. The agent edits these files
        freely, so none of those cases may raise.
        """
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            return None
        return payload if isinstance(payload, dict) else None

    def _restore_defaults(self) -> None:
        """Rewrite the registry and every service config with healthy values."""
        self._write_json(
            self.mesh_root / "registry.json",
            {
                "services": {
                    "auth": {"host": "localhost", "port": 3001, "protocol": "http"},
                    "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
                    "worker": {
                        "host": "localhost",
                        "port": None,
                        "protocol": "internal",
                    },
                }
            },
        )
        self._write_json(
            self.mesh_root / "auth" / "config.json", DEFAULT_CONFIGS["auth"]
        )
        self._write_json(
            self.mesh_root / "gateway" / "config.json", DEFAULT_CONFIGS["gateway"]
        )
        self._write_json(
            self.mesh_root / "gateway" / "blocked_routes.json",
            DEFAULT_CONFIGS["blocked_routes"],
        )
        self._write_json(
            self.mesh_root / "worker" / "config.json", DEFAULT_CONFIGS["worker"]
        )
        self._write_json(
            self.mesh_root / "worker" / "job_generator_config.json",
            DEFAULT_CONFIGS["job_generator"],
        )

    def _truncate_logs(self) -> None:
        """Empty the four service log files under ``/tmp``."""
        for service in ["gateway", "auth", "worker", "job_gen"]:
            Path(f"/tmp/{service}.log").write_text("", encoding="utf-8")

    def _reset_runtime_counters(self) -> None:
        """Reset the restart and stall counter files to ``0``."""
        Path("/tmp/worker_restart_count").write_text("0", encoding="utf-8")
        Path("/tmp/consumer_stall_count").write_text("0", encoding="utf-8")

    def _redis_flush(self) -> None:
        """Run ``redis-cli FLUSHDB``.

        Raises:
            subprocess.CalledProcessError: if redis-cli exits non-zero.
            subprocess.TimeoutExpired: if redis-cli does not finish within
                five seconds (bounded so a reset cannot hang forever).
        """
        subprocess.run(
            ["redis-cli", "FLUSHDB"],
            check=True,
            capture_output=True,
            text=True,
            timeout=5,
        )

    def _read_float(self, value: Any, default: float = 0.0) -> float:
        """Coerce *value* to ``float``; return *default* if it cannot be."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _is_route_blocked(self) -> bool:
        """Report whether ``gateway->redis`` is listed in blocked_routes.json.

        Any problem reading or interpreting the file yields ``False``.
        """
        payload = self._load_json_object(
            self.mesh_root / "gateway" / "blocked_routes.json"
        )
        if payload is None:
            return False
        try:
            return "gateway->redis" in payload.get("blocked", [])
        except TypeError:
            # "blocked" holds something that does not support ``in``.
            return False

    def _is_lock_present(self) -> bool:
        """Report whether the ``LOCK:job_processor`` key exists in Redis.

        The lock counts as gone only when redis-cli exits successfully and
        answers ``0``. A timeout, a missing binary, a non-zero exit or any
        other output yields ``True``, so an unreachable Redis can never be
        mistaken for a released lock.
        """
        try:
            result = subprocess.run(
                ["redis-cli", "EXISTS", "LOCK:job_processor"],
                capture_output=True,
                text=True,
                timeout=2,
                check=False,
            )
        except (OSError, subprocess.SubprocessError):
            return True
        return result.returncode != 0 or result.stdout.strip() != "0"

    def _is_cascading_timeout_resolved(self) -> bool:
        """Report whether the auth delay fits inside the gateway auth timeout.

        Returns ``False`` when either config file is unusable or when the
        configured timeout is not a positive number.
        """
        auth_payload = self._load_json_object(self.mesh_root / "auth" / "config.json")
        gateway_payload = self._load_json_object(
            self.mesh_root / "gateway" / "config.json"
        )
        if auth_payload is None or gateway_payload is None:
            return False

        auth_delay_ms = self._read_float(auth_payload.get("delay_ms"), default=0.0)
        auth_timeout_ms = self._read_float(
            gateway_payload.get("auth_timeout_ms"), default=0.0
        )
        if auth_timeout_ms <= 0:
            return False
        return auth_delay_ms <= auth_timeout_ms

    def _is_registry_auth_default(self) -> bool:
        """Report whether the registry's auth entry equals the default one.

        The default is host ``localhost``, port ``3001``, protocol ``http``.
        A missing, malformed or non-numeric entry yields ``False``.
        """
        payload = self._load_json_object(self.mesh_root / "registry.json")
        if payload is None:
            return False
        services = payload.get("services")
        auth_service = services.get("auth") if isinstance(services, dict) else None
        if not isinstance(auth_service, dict):
            return False

        try:
            port = int(auth_service.get("port", 0))
        except (TypeError, ValueError):
            # e.g. "port": null or "port": "abc" written by the agent.
            return False

        return (
            auth_service.get("host") == "localhost"
            and port == 3001
            and auth_service.get("protocol") == "http"
        )

    def _job_generator_interval_ms(self) -> int:
        """Return ``interval_ms`` from the job generator config, or ``0``.

        ``0`` is returned when the file is unusable or the value is not
        convertible with ``int()``.
        """
        payload = self._load_json_object(
            self.mesh_root / "worker" / "job_generator_config.json"
        )
        if payload is None:
            return 0
        try:
            return int(payload.get("interval_ms", 0))
        except (TypeError, ValueError):
            return 0

    def _is_job_generator_rate_resolved(self) -> bool:
        """Report whether the configured interval is at least the default."""
        return self._job_generator_interval_ms() >= int(
            DEFAULT_CONFIGS["job_generator"]["interval_ms"]
        )

    def _build_grader_context(self) -> dict[str, Any]:
        """Collect the baselines and live config facts passed to the grader."""
        return {
            **self._baselines,
            "route_blocked": self._is_route_blocked(),
            "lock_exists": self._is_lock_present(),
            "cascading_timeout_resolved": self._is_cascading_timeout_resolved(),
            "registry_auth_matches_default": self._is_registry_auth_default(),
            "job_generator_interval_ms": self._job_generator_interval_ms(),
            "job_generator_rate_resolved": self._is_job_generator_rate_resolved(),
        }

    def _blocked_command(self, command: str) -> bool:
        """Report whether *command* contains a refused substring."""
        normalized = command.strip().lower()
        return any(pattern in normalized for pattern in self._DANGEROUS_PATTERNS)

    def _run_command(self, command: str) -> tuple[str, str | None]:
        """Execute *command* in a shell and record its exit code.

        Returns:
            ``(output, error)`` where *output* is stdout followed by stderr
            (or a fixed message) and *error* is ``None`` on a normal run,
            ``"no_command_provided"``, ``"blocked_command"``, ``"timeout"``
            or the text of an unexpected exception.
        """
        if command.strip() == NO_COMMAND_PROVIDED_SENTINEL:
            self.last_exit_code = 2
            return (
                "No command provided by model. Expected JSON with a command field.",
                "no_command_provided",
            )

        if self._blocked_command(command):
            self.last_exit_code = 1
            return (
                "BLOCKED: This command would damage the environment infrastructure.",
                "blocked_command",
            )

        try:
            # shell=True is intentional: the agent's action *is* a shell line.
            result = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                timeout=10,
                cwd="/",
                env={
                    **os.environ,
                    "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
                },
                check=False,
            )
            self.last_exit_code = result.returncode
            output = (result.stdout + result.stderr).strip() or "(no output)"
            return output, None
        except subprocess.TimeoutExpired:
            self.last_exit_code = 124
            return "Command timed out after 10 seconds.", "timeout"
        except Exception as exc:
            self.last_exit_code = 1
            return f"Command execution error: {exc}", str(exc)

    def _command_signature(self, command: str) -> str:
        """Return *command* lower-cased with whitespace runs collapsed."""
        return " ".join(command.strip().lower().split())

    def _is_diagnostic_command(self, command: str) -> bool:
        """Report whether *command* invokes a known inspection tool.

        The command is split into words (shell punctuation counts as a
        separator) and each word's basename is compared with the keyword
        set, so ``/usr/bin/cat`` matches ``cat`` but ``truncate`` does not.
        """
        words = command.lower().translate(self._SHELL_PUNCTUATION).split()
        return any(
            os.path.basename(word) in self._DIAGNOSTIC_KEYWORDS for word in words
        )

    def _is_state_change_command(self, command: str) -> bool:
        """Report whether *command* contains a known state-changing pattern."""
        normalized = command.lower()
        return any(pattern in normalized for pattern in self._STATE_CHANGE_PATTERNS)

    def _compute_reward(
        self,
        command: str,
        current: Observation,
        previous: Observation,
        grader_score: float,
        previous_grader_score: float,
        command_error: str | None,
    ) -> float:
        """Turn the grader score and step outcome into a reward in [0, 1].

        A missing command earns 0.0 and a solved system earns 1.0 outright.
        Otherwise the reward starts at 75% of the grader score and is nudged
        by bonuses (new diagnostic command, state-change attempt, improved
        score or metrics) and penalties (no score progress, repeated or
        no-op commands, non-zero exit, blocked command) before clamping.
        """
        if command_error == "no_command_provided":
            return 0.0

        if grader_score >= self._SUCCESS_THRESHOLD:
            return 1.0

        reward = grader_score * 0.75
        signature = self._command_signature(command)
        signature_count = self._command_counts.get(signature, 0) + 1
        self._command_counts[signature] = signature_count

        if (
            self._is_diagnostic_command(command)
            and signature not in self._seen_diagnostic_signatures
        ):
            reward += 0.02
            self._seen_diagnostic_signatures.add(signature)

        if self._is_state_change_command(command):
            reward += 0.03

        if grader_score > previous_grader_score + 1e-4:
            reward += 0.15
        else:
            reward -= 0.05

        if (
            current.metrics.gateway_success_rate
            > previous.metrics.gateway_success_rate + 1e-3
        ):
            reward += 0.05

        if current.metrics.queue_depth < previous.metrics.queue_depth:
            reward += 0.05

        if current.metrics.worker_restart_count < previous.metrics.worker_restart_count:
            reward += 0.03

        if current.metrics.consumer_stall_count < previous.metrics.consumer_stall_count:
            reward += 0.03

        if signature_count > 1:
            # Escalating repeat penalty, capped at 0.12.
            reward -= min(0.12, 0.04 * (signature_count - 1))

        if command.strip().lower() in self._NOOP_COMMANDS:
            reward -= 0.08

        if self.last_exit_code != 0 and command_error not in {
            "blocked_command",
            "no_command_provided",
        }:
            reward -= 0.08

        if command_error == "blocked_command":
            reward -= 0.25

        return max(0.0, min(1.0, reward))

    def _status_block(self, metrics: Any) -> str:
        """Format the text shown as command output right after a reset.

        The three "running" lines are fixed text, not live process state.
        """
        return (
            "=== pipeline status after reset ===\n"
            "gateway: running\n"
            "auth: running\n"
            "worker: running\n"
            f"queue_depth: {metrics.queue_depth}\n"
            f"gateway_success_rate: {metrics.gateway_success_rate:.2f}"
        )

    def reset(self, task_name: TaskName | str) -> Observation:
        """Start a new episode for *task_name* and return the first observation.

        Clears logs, configs, Redis and counters, restarts every service,
        waits for health checks, injects the task's fault and captures the
        baseline metrics and grader score.

        Raises:
            ValueError: if *task_name* is a string naming no known task.
            RuntimeError: if services are not healthy within 30 seconds.
        """
        task = TaskName.parse(task_name) if isinstance(task_name, str) else task_name

        self.current_task = task
        self.max_steps = TASK_MAX_STEPS[task]
        self.step_count = 0
        self._seen_diagnostic_signatures = set()
        self._command_counts = {}
        self._last_grader_score = 0.0

        self._truncate_logs()
        self._restore_defaults()
        self._redis_flush()
        self._reset_runtime_counters()

        Path("/tmp/current_task").write_text(task.value, encoding="utf-8")

        self._process_manager.restart_all()
        if not self._process_manager.wait_healthy(timeout_s=30):
            raise RuntimeError("Services failed health checks after reset")

        inject_fault(task, self._process_manager)
        # Give the fault a moment to show up in the metrics.
        time.sleep(1.0)

        self._metrics_poller.poll_once()
        metrics = self._metrics_poller.get_current_metrics()

        self._baselines = {
            "baseline_worker_restart_count": metrics.worker_restart_count,
            "baseline_consumer_stall_count": metrics.consumer_stall_count,
        }
        self._last_grader_score = grade_task(
            task, metrics, self._build_grader_context()
        )

        observation = Observation(
            command_output=self._status_block(metrics),
            metrics=metrics,
            process_status=self._process_manager.get_status(),
        )
        self.prev_observation = observation
        return observation

    def step(self, action: Action) -> StepResult:
        """Run one agent command and return observation, reward and done flag.

        The episode ends when the grader score reaches the success
        threshold or the step budget is used up; a step with no command can
        only end the episode through the step budget.

        Raises:
            RuntimeError: if ``reset`` has not been called yet.
        """
        if not self.current_task:
            raise RuntimeError(
                "Environment not initialized. Call reset(task_name) first."
            )

        self.step_count += 1
        command_output, command_error = self._run_command(action.command)

        self._metrics_poller.poll_once()
        metrics = self._metrics_poller.get_current_metrics()

        observation = Observation(
            command_output=command_output,
            metrics=metrics,
            process_status=self._process_manager.get_status(),
        )

        previous = self.prev_observation or observation
        previous_grader_score = self._last_grader_score
        grader_score = grade_task(
            self.current_task, metrics, self._build_grader_context()
        )
        reward = self._compute_reward(
            action.command,
            observation,
            previous,
            grader_score,
            previous_grader_score,
            command_error,
        )

        out_of_steps = self.step_count >= self.max_steps
        if command_error == "no_command_provided":
            done = out_of_steps
        else:
            done = grader_score >= self._SUCCESS_THRESHOLD or out_of_steps

        self._last_grader_score = grader_score
        self.prev_observation = observation

        info: dict[str, Any] = {
            "grader_score": round(grader_score, 4),
            "error": command_error,
            "exit_code": self.last_exit_code,
            "task": self.current_task.value if self.current_task else None,
        }

        return StepResult(observation=observation, reward=reward, done=done, info=info)

    def state(self) -> dict[str, Any]:
        """Poll metrics once and return a JSON-friendly episode snapshot."""
        self._metrics_poller.poll_once()
        metrics = self._metrics_poller.get_current_metrics()
        return {
            "task": self.current_task.value if self.current_task else None,
            "step_count": self.step_count,
            "max_steps": self.max_steps,
            "metrics": metrics.model_dump(),
            "process_status": self._process_manager.get_status(),
            "baselines": dict(self._baselines),
        }
server/fault_injector.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import subprocess
3
+ from pathlib import Path
4
+
5
+ from .constants import TaskName
6
+ from .process_manager import ProcessManager
7
+
8
+
9
def _write_json(path: Path, payload: dict) -> None:
    """Write *payload* to *path* as indented JSON followed by a newline.

    Missing parent directories are created first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
12
+
13
+
14
def inject_cascading_timeout(pm: ProcessManager) -> None:
    """Configure auth to answer slower than the gateway is willing to wait.

    Writes ``delay_ms=1500`` to the auth config and ``auth_timeout_ms=500``
    to the gateway config, then sends SIGHUP to both services.

    Raises:
        RuntimeError: from ``pm.sighup`` if either service is not running.
    """
    _write_json(pm.mesh_root / "auth" / "config.json", {"delay_ms": 1500})
    _write_json(pm.mesh_root / "gateway" / "config.json", {"auth_timeout_ms": 500})
    # presumably the services re-read their config on SIGHUP — confirm
    # against the service sources.
    pm.sighup("auth")
    pm.sighup("gateway")
19
+
20
+
21
def inject_byzantine_queue_fault(pm: ProcessManager) -> None:
    """Push one deliberately malformed JSON message onto ``job_queue``.

    *pm* is unused; it is accepted so every injector shares one signature.

    Raises:
        subprocess.CalledProcessError: if redis-cli exits non-zero.
        subprocess.TimeoutExpired: if redis-cli does not finish in 5 seconds.
    """
    subprocess.run(
        ["redis-cli", "LPUSH", "job_queue", '{"id":"poison-001","payload":{{BROKEN'],
        check=True,
        # Keep redis-cli's reply out of the server's own stdout and make
        # sure an unreachable Redis cannot stall a reset indefinitely.
        capture_output=True,
        text=True,
        timeout=5,
    )
26
+
27
+
28
def inject_distributed_lock_starvation(pm: ProcessManager) -> None:
    """Create the ``LOCK:job_processor`` key with no expiry.

    The stored value names a holder (``dead-worker-pid-9999``) that is never
    going to release it. *pm* is unused; it is accepted so every injector
    shares one signature.

    Raises:
        subprocess.CalledProcessError: if redis-cli exits non-zero.
        subprocess.TimeoutExpired: if redis-cli does not finish in 5 seconds.
    """
    subprocess.run(
        ["redis-cli", "SET", "LOCK:job_processor", "dead-worker-pid-9999"],
        check=True,
        # Keep redis-cli's reply out of the server's own stdout and make
        # sure an unreachable Redis cannot stall a reset indefinitely.
        capture_output=True,
        text=True,
        timeout=5,
    )
32
+
33
+
34
def inject_backpressure_cascade(pm: ProcessManager) -> None:
    """Shrink the worker's DB pool to 1 and add an 800 ms write delay.

    The worker config is rewritten and the worker is sent SIGHUP.

    Raises:
        RuntimeError: from ``pm.sighup`` if the worker is not running.
    """
    _write_json(
        pm.mesh_root / "worker" / "config.json",
        {"db_pool_size": 1, "db_write_delay_ms": 800},
    )
    pm.sighup("worker")
40
+
41
+
42
def inject_route_partition(pm: ProcessManager) -> None:
    """List ``gateway->redis`` as a blocked route and SIGHUP the gateway.

    Raises:
        RuntimeError: from ``pm.sighup`` if the gateway is not running.
    """
    _write_json(
        pm.mesh_root / "gateway" / "blocked_routes.json",
        {"blocked": ["gateway->redis"]},
    )
    pm.sighup("gateway")
48
+
49
+
50
def inject_registry_corruption(pm: ProcessManager) -> None:
    """Point the registry's auth entry at an invalid host name.

    Only the auth ``host`` differs from the default registry; the redis and
    worker entries are written unchanged. The gateway is then sent SIGHUP.

    Raises:
        RuntimeError: from ``pm.sighup`` if the gateway is not running.
    """
    _write_json(
        pm.mesh_root / "registry.json",
        {
            "services": {
                "auth": {"host": "invalid-auth-host", "port": 3001, "protocol": "http"},
                "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
                "worker": {"host": "localhost", "port": None, "protocol": "internal"},
            }
        },
    )
    pm.sighup("gateway")
62
+
63
+
64
def inject_job_generator_runaway(pm: ProcessManager) -> None:
    """Drop the job generator interval to 10 ms and SIGHUP the generator.

    Raises:
        RuntimeError: from ``pm.sighup`` if the generator is not running.
    """
    _write_json(
        pm.mesh_root / "worker" / "job_generator_config.json", {"interval_ms": 10}
    )
    pm.sighup("job_generator")
69
+
70
+
71
def inject_fault(task_name: TaskName | str, pm: ProcessManager) -> None:
    """Inject the fault that belongs to *task_name*.

    Args:
        task_name: A ``TaskName`` member or its wire string.
        pm: Process manager handed through to the injector.

    Raises:
        ValueError: if *task_name* does not name a known task or has no
            injector registered.
    """
    task = TaskName.parse(task_name) if isinstance(task_name, str) else task_name

    # One injector per task; built per call so the lookup always sees the
    # current module-level functions.
    injectors = {
        TaskName.CASCADING_TIMEOUT: inject_cascading_timeout,
        TaskName.BYZANTINE_QUEUE_FAULT: inject_byzantine_queue_fault,
        TaskName.DISTRIBUTED_LOCK_STARVATION: inject_distributed_lock_starvation,
        TaskName.BACKPRESSURE_CASCADE: inject_backpressure_cascade,
        TaskName.ROUTE_PARTITION: inject_route_partition,
        TaskName.REGISTRY_CORRUPTION: inject_registry_corruption,
        TaskName.JOB_GENERATOR_RUNAWAY: inject_job_generator_runaway,
    }
    injector = injectors.get(task)
    if injector is None:
        raise ValueError(f"Unknown task: {task_name}")
    injector(pm)
server/graders.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from .constants import TaskName
4
+ from .models import SystemMetrics
5
+
6
+
7
def _clamp(score: float) -> float:
    """Limit *score* to the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, score))
9
+
10
+
11
def grade_cascading_timeout(metrics: SystemMetrics, context: dict[str, Any]) -> float:
    """Score the cascading-timeout task.

    Returns:
        1.0 when the timeout mismatch is resolved and the gateway success
        rate is at least 0.99; ``0.4 + 0.4 * rate`` when resolved but the
        rate has not recovered; at most 0.25 (``0.25 * rate``) while the
        mismatch is still present.
    """
    timeout_resolved = bool(context.get("cascading_timeout_resolved", False))
    if not timeout_resolved:
        # Prevent instant pass while the injected timeout fault is still active.
        return _clamp(metrics.gateway_success_rate * 0.25)
    if metrics.gateway_success_rate >= 0.99:
        return 1.0
    return _clamp(0.4 + metrics.gateway_success_rate * 0.4)
19
+
20
+
21
def grade_byzantine_queue_fault(
    metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Score the byzantine-queue-fault task.

    Worker restarts are counted relative to
    ``context["baseline_worker_restart_count"]`` (default 0).

    Returns:
        1.0 for an empty queue with at most one restart since the baseline;
        0.6 for an empty queue with more restarts; otherwise
        ``1 - depth / 50`` minus 0.05 per restart (penalty capped at 0.4),
        clamped to [0, 1].
    """
    baseline_restart = int(context.get("baseline_worker_restart_count", 0))
    restart_delta = max(0, metrics.worker_restart_count - baseline_restart)

    if metrics.queue_depth == 0:
        return 1.0 if restart_delta <= 1 else 0.6

    queue_component = max(0.0, 1.0 - metrics.queue_depth / 50.0)
    stability_penalty = min(0.4, restart_delta * 0.05)
    return _clamp(queue_component - stability_penalty)
35
+
36
+
37
def grade_distributed_lock_starvation(
    metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Score the distributed-lock-starvation task.

    A missing ``lock_exists`` entry is treated as "lock still present".

    Returns:
        1.0 when the lock is gone and the queue depth is at most 3; 0.6
        when the lock is gone but the queue is deeper; with the lock still
        present, 0.2 if at most one stall occurred since the baseline and
        0.0 otherwise.
    """
    lock_exists = bool(context.get("lock_exists", True))
    baseline_stall = int(context.get("baseline_consumer_stall_count", 0))
    stall_delta = max(0, metrics.consumer_stall_count - baseline_stall)

    if lock_exists:
        # If lock still exists, reward slight progress only when stalls
        # don't explode.
        return 0.2 if stall_delta <= 1 else 0.0

    return 1.0 if metrics.queue_depth <= 3 else 0.6
51
+
52
+
53
def grade_backpressure_cascade(metrics: SystemMetrics, _: dict[str, Any]) -> float:
    """Score the backpressure-cascade task from queue depth alone.

    The score falls linearly from 1.0 at depth 0 to 0.0 at depth 200 and
    above. The context argument is ignored.

    NOTE(review): because nothing but queue depth is consulted, a shallow
    queue scores high even while the injected worker config is unchanged —
    confirm this is intended.
    """
    return _clamp(1.0 - (metrics.queue_depth / 200.0))
55
+
56
+
57
def grade_route_partition(metrics: SystemMetrics, context: dict[str, Any]) -> float:
    """Score the route-partition task.

    A missing ``route_blocked`` entry is treated as "still blocked".

    Returns:
        0.0 while the route is blocked; 1.0 once it is unblocked and the
        gateway success rate is at least 0.95; otherwise the success rate
        itself, clamped to [0, 1].
    """
    if bool(context.get("route_blocked", True)):
        return 0.0
    if metrics.gateway_success_rate >= 0.95:
        return 1.0
    return _clamp(metrics.gateway_success_rate)
64
+
65
+
66
def grade_registry_corruption(metrics: SystemMetrics, context: dict[str, Any]) -> float:
    """Score the registry-corruption task.

    Returns:
        1.0 when the registry's auth entry matches the default and the
        gateway success rate is at least 0.99; ``0.5 + 0.5 * rate`` when
        the entry matches but the rate is lower; at most 0.3
        (``0.3 * rate``) while the entry is still wrong.
    """
    registry_auth_matches_default = bool(
        context.get("registry_auth_matches_default", False)
    )
    if not registry_auth_matches_default:
        return _clamp(metrics.gateway_success_rate * 0.3)
    if metrics.gateway_success_rate >= 0.99:
        return 1.0
    return _clamp(0.5 + metrics.gateway_success_rate * 0.5)
75
+
76
+
77
def grade_job_generator_runaway(
    metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Score the job-generator-runaway task.

    Returns:
        With the generator rate restored: 1.0 at queue depth <= 5, 0.7 at
        depth <= 30, and beyond that 0.7 minus 0.01 per job over 30
        (clamped to [0, 1]). With the rate still too high: 0.2 at depth
        <= 30, else 0.0.
    """
    rate_resolved = bool(context.get("job_generator_rate_resolved", False))
    depth = metrics.queue_depth

    if not rate_resolved:
        return 0.2 if depth <= 30 else 0.0
    if depth <= 5:
        return 1.0
    if depth <= 30:
        return 0.7
    return _clamp(0.7 - (depth - 30) / 100.0)
88
+
89
+
90
def grade_task(
    task_name: TaskName | str, metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Grade the current system state for *task_name*.

    Args:
        task_name: A ``TaskName`` member or its wire string.
        metrics: Latest system metrics.
        context: Extra facts for the grader (baselines and config checks).

    Returns:
        The task grader's score, or 0.0 if no grader is registered for the
        task.

    Raises:
        ValueError: if *task_name* is a string naming no known task.
    """
    task = TaskName.parse(task_name) if isinstance(task_name, str) else task_name

    # One grader per task; built per call so the lookup always sees the
    # current module-level functions.
    graders = {
        TaskName.CASCADING_TIMEOUT: grade_cascading_timeout,
        TaskName.BYZANTINE_QUEUE_FAULT: grade_byzantine_queue_fault,
        TaskName.DISTRIBUTED_LOCK_STARVATION: grade_distributed_lock_starvation,
        TaskName.BACKPRESSURE_CASCADE: grade_backpressure_cascade,
        TaskName.ROUTE_PARTITION: grade_route_partition,
        TaskName.REGISTRY_CORRUPTION: grade_registry_corruption,
        TaskName.JOB_GENERATOR_RUNAWAY: grade_job_generator_runaway,
    }
    grader = graders.get(task)
    if grader is None:
        return 0.0
    return grader(metrics, context)
server/metrics_poller.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import threading
3
+ from pathlib import Path
4
+
5
+ import httpx
6
+
7
+ from .models import SystemMetrics
8
+
9
+
10
class MetricsPoller(threading.Thread):
    """Background metrics poller with last-known-good caching.

    ``poll_once`` may be called from the thread's own loop and from other
    threads; the cached snapshot is only read or replaced under ``_lock``.
    """

    def __init__(self, poll_interval_s: float = 2.0) -> None:
        """Create a daemon poller thread; nothing is polled until it starts.

        Args:
            poll_interval_s: Pause between polls in the background loop.
        """
        super().__init__(daemon=True)
        self.poll_interval_s = poll_interval_s
        self._stop_event = threading.Event()
        self._lock = threading.Lock()
        self._latest: dict[str, float | int] = {
            "gateway_success_rate": 0.0,
            "gateway_p99_latency_ms": 0.0,
            "queue_depth": 0,
            "worker_restart_count": 0,
            "consumer_stall_count": 0,
        }

    def stop(self) -> None:
        """Signal the background loop to exit after its current iteration."""
        self._stop_event.set()

    def _read_counter(self, path: str) -> int:
        """Read a non-negative integer counter from the file at *path*.

        Returns 0 when the file is missing, unreadable, empty or not an
        integer. Negative values are reported as 0 because the metrics
        model rejects negative counters.
        """
        try:
            raw = Path(path).read_text().strip()
        except OSError:
            return 0
        try:
            return max(0, int(raw or "0"))
        except ValueError:
            return 0

    def _poll_gateway(self) -> dict[str, float]:
        """Fetch success rate and p99 latency from the gateway health route.

        Both the short (``success_rate``) and prefixed
        (``gateway_success_rate``) key spellings are accepted. The success
        rate is limited to [0, 1] and the latency to >= 0.

        Raises:
            Exception: HTTP, JSON or conversion errors propagate to the
                caller, which keeps the previous values.
        """
        with httpx.Client(timeout=1.0) as client:
            response = client.get("http://localhost:3000/health")
            response.raise_for_status()
            payload = response.json()

        success_rate = float(
            payload.get("success_rate", payload.get("gateway_success_rate", 0.0))
        )
        p99 = float(
            payload.get("p99_latency_ms", payload.get("gateway_p99_latency_ms", 0.0))
        )
        return {
            "gateway_success_rate": max(0.0, min(1.0, success_rate)),
            "gateway_p99_latency_ms": max(0.0, p99),
        }

    def _poll_queue_depth(self) -> int:
        """Return the length of ``job_queue`` as reported by redis-cli.

        Falls back to the cached depth when redis-cli is missing, times
        out, exits non-zero or prints something that is not an integer.
        """
        with self._lock:
            cached = int(self._latest["queue_depth"])
        try:
            result = subprocess.run(
                ["redis-cli", "LLEN", "job_queue"],
                capture_output=True,
                text=True,
                timeout=2,
                check=False,
            )
        except (OSError, subprocess.SubprocessError):
            # An exception here would otherwise end the poller thread.
            return cached
        if result.returncode != 0:
            return cached
        try:
            return max(0, int(result.stdout.strip() or "0"))
        except ValueError:
            return cached

    def poll_once(self) -> None:
        """Refresh every metric and publish the new snapshot."""
        with self._lock:
            snapshot = dict(self._latest)
        try:
            snapshot.update(self._poll_gateway())
        except Exception:
            # Deliberate best effort: keep the last-known-good gateway
            # numbers when the health endpoint is unavailable.
            pass

        snapshot["queue_depth"] = self._poll_queue_depth()
        snapshot["worker_restart_count"] = self._read_counter(
            "/tmp/worker_restart_count"
        )
        snapshot["consumer_stall_count"] = self._read_counter(
            "/tmp/consumer_stall_count"
        )

        with self._lock:
            self._latest = snapshot

    def run(self) -> None:
        """Poll repeatedly until ``stop`` is called."""
        while not self._stop_event.is_set():
            self.poll_once()
            # Event.wait doubles as an interruptible sleep.
            self._stop_event.wait(self.poll_interval_s)

    def get_current_metrics(self) -> SystemMetrics:
        """Return the most recent snapshot as a validated ``SystemMetrics``."""
        with self._lock:
            snapshot = dict(self._latest)
        return SystemMetrics.model_validate(snapshot)
server/models.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from pydantic import BaseModel, Field, field_validator
4
+
5
+
6
class SystemMetrics(BaseModel):
    # Point-in-time health numbers for the service mesh. Every field is
    # required and validated to be non-negative.

    # Fraction of successful gateway requests, within [0, 1].
    gateway_success_rate: float = Field(..., ge=0.0, le=1.0)
    # Gateway p99 latency in milliseconds.
    gateway_p99_latency_ms: float = Field(..., ge=0.0)
    # Number of entries waiting in the job queue.
    queue_depth: int = Field(..., ge=0)
    # Counters read from files under /tmp.
    worker_restart_count: int = Field(..., ge=0)
    consumer_stall_count: int = Field(..., ge=0)
12
+
13
+
14
class Observation(BaseModel):
    # What the agent sees after a reset or a step.

    command_output: str = Field(
        ..., description="stdout+stderr from the last executed command"
    )
    # Metrics captured right after the command ran.
    metrics: SystemMetrics
    # Service name -> human-readable status string; empty by default.
    process_status: dict[str, str] = Field(default_factory=dict)
20
+
21
+
22
class Action(BaseModel):
    # A single shell command submitted by the agent.

    command: str = Field(..., description="Single bash command to execute")

    @field_validator("command")
    @classmethod
    def command_must_not_be_empty(cls, value: str) -> str:
        # Reject empty or whitespace-only commands; the value itself is
        # returned unchanged (not stripped).
        if not value.strip():
            raise ValueError("command must not be empty")
        return value
31
+
32
+
33
class Reward(BaseModel):
    # Scalar reward, validated to lie within [0, 1].
    value: float = Field(..., ge=0.0, le=1.0)
35
+
36
+
37
class StepResult(BaseModel):
    # Outcome of one environment step.

    observation: Observation
    # Shaped reward for the step, within [0, 1].
    reward: float = Field(..., ge=0.0, le=1.0)
    # True once the episode has ended.
    done: bool
    # Free-form diagnostics; empty by default.
    info: dict[str, Any] = Field(default_factory=dict)
server/process_manager.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import signal
3
+ import subprocess
4
+ import time
5
+ from pathlib import Path
6
+
7
+ import httpx
8
+
9
+
10
+ class ProcessManager:
11
+ """Tracks Bun service processes and lifecycle state."""
12
+
13
+ def __init__(
14
+ self, project_root: Path | None = None, mesh_root: Path | None = None
15
+ ) -> None:
16
+ self.project_root = (project_root or Path(__file__).resolve().parent.parent).resolve()
17
+ self.mesh_root = (
18
+ mesh_root or Path(os.getenv("MESH_ROOT", self.project_root / "mesh"))
19
+ ).resolve()
20
+
21
+ self._service_scripts = {
22
+ "gateway": self.project_root / "mesh" / "gateway" / "index.ts",
23
+ "auth": self.project_root / "mesh" / "auth" / "index.ts",
24
+ "worker": self.project_root / "mesh" / "worker" / "index.ts",
25
+ }
26
+ self._job_generator_script = (
27
+ self.project_root / "mesh" / "worker" / "job_generator.ts"
28
+ )
29
+ self._health_urls = {
30
+ "gateway": "http://localhost:3000/health",
31
+ "auth": "http://localhost:3001/health",
32
+ }
33
+
34
+ self._processes: dict[str, subprocess.Popen[str]] = {}
35
+ self._log_handles: dict[str, object] = {}
36
+
37
+ @staticmethod
38
+ def _pid_path(service: str) -> Path:
39
+ return Path(f"/tmp/{service}.pid")
40
+
41
+ @staticmethod
42
+ def _is_pid_alive(pid: int) -> bool:
43
+ try:
44
+ os.kill(pid, 0)
45
+ return True
46
+ except OSError:
47
+ return False
48
+
49
+ def _read_pid(self, service: str) -> int | None:
50
+ path = self._pid_path(service)
51
+ if not path.exists():
52
+ return None
53
+ try:
54
+ pid = int(path.read_text().strip())
55
+ except (TypeError, ValueError):
56
+ return None
57
+ return pid if self._is_pid_alive(pid) else None
58
+
59
+ def _write_pid(self, service: str, pid: int) -> None:
60
+ self._pid_path(service).write_text(str(pid))
61
+
62
+ def _spawn_service(self, service: str, script: Path, log_path: Path) -> None:
63
+ log_handle = open(log_path, "a", encoding="utf-8")
64
+ env = {
65
+ **os.environ,
66
+ "MESH_ROOT": str(self.mesh_root),
67
+ }
68
+ process = subprocess.Popen(
69
+ ["bun", "run", str(script)],
70
+ cwd=str(self.project_root),
71
+ stdout=log_handle,
72
+ stderr=subprocess.STDOUT,
73
+ text=True,
74
+ env=env,
75
+ )
76
+ self._processes[service] = process
77
+ self._log_handles[service] = log_handle
78
+ self._write_pid(service, process.pid)
79
+
80
+ def start_all(self) -> None:
81
+ for service, script in self._service_scripts.items():
82
+ existing_pid = self._read_pid(service)
83
+ if existing_pid:
84
+ continue
85
+ self._spawn_service(service, script, Path(f"/tmp/{service}.log"))
86
+
87
+ if not self._read_pid("job_generator"):
88
+ self._spawn_service(
89
+ "job_generator", self._job_generator_script, Path("/tmp/job_gen.log")
90
+ )
91
+
92
+ def _terminate_pid(self, pid: int, timeout_s: float = 0.5) -> None:
93
+ try:
94
+ os.kill(pid, signal.SIGTERM)
95
+ except ProcessLookupError:
96
+ return
97
+
98
+ deadline = time.time() + timeout_s
99
+ while time.time() < deadline:
100
+ if not self._is_pid_alive(pid):
101
+ return
102
+ time.sleep(0.05)
103
+
104
+ try:
105
+ os.kill(pid, signal.SIGKILL)
106
+ except ProcessLookupError:
107
+ return
108
+
109
+ def stop_all(self) -> None:
110
+ for service in ["gateway", "auth", "worker", "job_generator"]:
111
+ pid = self._read_pid(service)
112
+ if pid:
113
+ self._terminate_pid(pid)
114
+ pid_path = self._pid_path(service)
115
+ if pid_path.exists():
116
+ pid_path.unlink(missing_ok=True)
117
+
118
+ for handle in self._log_handles.values():
119
+ try:
120
+ handle.close()
121
+ except Exception:
122
+ pass
123
+
124
+ self._processes.clear()
125
+ self._log_handles.clear()
126
+
127
+ def restart_all(self) -> None:
128
+ self.stop_all()
129
+ self.start_all()
130
+
131
+ def sighup(self, service: str) -> None:
132
+ pid = self._read_pid(service)
133
+ if not pid:
134
+ raise RuntimeError(f"Service not running: {service}")
135
+ os.kill(pid, signal.SIGHUP)
136
+
137
+ def wait_healthy(self, timeout_s: int = 30) -> bool:
138
+ deadline = time.time() + timeout_s
139
+ with httpx.Client(timeout=1.0) as client:
140
+ while time.time() < deadline:
141
+ try:
142
+ gateway_ok = (
143
+ client.get(self._health_urls["gateway"]).status_code == 200
144
+ )
145
+ auth_ok = client.get(self._health_urls["auth"]).status_code == 200
146
+ if gateway_ok and auth_ok:
147
+ return True
148
+ except Exception:
149
+ pass
150
+ time.sleep(1)
151
+ return False
152
+
153
+ def get_status(self) -> dict[str, str]:
154
+ status: dict[str, str] = {}
155
+ for service in ["gateway", "auth", "worker", "job_generator"]:
156
+ pid = self._read_pid(service)
157
+ status[service] = f"running pid={pid}" if pid else "stopped"
158
+ return status
159
+
160
+ def get_pid(self, service: str) -> int | None:
161
+ return self._read_pid(service)
162
+
163
+ def close(self) -> None:
164
+ self.stop_all()
setup-dev.sh ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Bootstrap a local development environment: check required tools, create a
# Python 3.12 virtualenv, install Python and Bun dependencies, and run the
# OpenEnv validation.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="${ROOT_DIR}/.venv"
REQUIRED_PYTHON="3.12"
PYTHON_BIN="python3.12"

# Fall back to plain `python3` when python3.12 is absent but python3 is
# itself the required version.
if ! command -v "$PYTHON_BIN" >/dev/null && command -v python3 >/dev/null; then
  if [ "$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" = "$REQUIRED_PYTHON" ]; then
    PYTHON_BIN="python3"
  fi
fi

# Abort with an install hint when a mandatory command is missing.
# Diagnostics go to stderr so they are not lost when stdout is redirected.
require_cmd() {
  local name="$1"
  local install_hint="$2"
  if ! command -v "$name" >/dev/null; then
    echo "[ERROR] Missing required command: $name" >&2
    echo "        Install hint: $install_hint" >&2
    exit 1
  fi
}

# Warn (without failing) when an optional command is missing.
optional_cmd_note() {
  local name="$1"
  local note="$2"
  if ! command -v "$name" >/dev/null; then
    echo "[WARN] Optional command not found: $name" >&2
    echo "       $note" >&2
  fi
}

echo "[INFO] Preflight checks"
require_cmd "$PYTHON_BIN" "Install Python 3.12 (binary: python3.12 or python3==3.12)"
require_cmd bun "Install Bun: https://bun.sh"
require_cmd redis-server "Install Redis server"
require_cmd redis-cli "Install Redis CLI"
require_cmd curl "Install curl"
require_cmd jq "Install jq"

optional_cmd_note docker "Needed for local container validation and HF parity checks."
optional_cmd_note uv "Used for regenerating uv.lock when dependencies change."

echo "[INFO] Creating virtual environment"
if [ -x "$VENV_DIR/bin/python" ]; then
  # Quote the interpreter path so a checkout path containing spaces works.
  EXISTING_PY_VERSION="$("$VENV_DIR/bin/python" -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
  if [ "$EXISTING_PY_VERSION" != "$REQUIRED_PYTHON" ]; then
    echo "[INFO] Recreating .venv with Python 3.12 (found $EXISTING_PY_VERSION)"
    rm -rf "$VENV_DIR"
  fi
elif [ -d "$VENV_DIR" ]; then
  # A .venv directory without a usable interpreter is broken; rebuild it
  # rather than failing later when sourcing the activate script.
  echo "[INFO] Recreating .venv (no usable interpreter found)"
  rm -rf "$VENV_DIR"
fi

if [ ! -d "$VENV_DIR" ]; then
  "$PYTHON_BIN" -m venv "$VENV_DIR"
fi

# shellcheck disable=SC1091
source "$VENV_DIR/bin/activate"

python -m pip install --upgrade pip
pip install -r "$ROOT_DIR/requirements.txt"

echo "[INFO] Installing Bun dependencies"
( cd "$ROOT_DIR/mesh/gateway" && bun install )
( cd "$ROOT_DIR/mesh/auth" && bun install )
( cd "$ROOT_DIR/mesh/worker" && bun install )

chmod +x "$ROOT_DIR/start.sh"
chmod +x "$ROOT_DIR/inference.py" || true

# Only generate a lock file when uv is available and none exists yet.
if command -v uv >/dev/null && [ ! -f "$ROOT_DIR/uv.lock" ]; then
  echo "[INFO] Generating uv.lock"
  ( cd "$ROOT_DIR" && uv lock )
fi

echo "[INFO] Running OpenEnv validation"
openenv validate "$ROOT_DIR"

echo "[INFO] Setup complete"
echo "[NEXT] Export required inference vars:"
echo "       API_BASE_URL=<endpoint>"
echo "       MODEL_NAME=<model>"
echo "       HF_TOKEN=<api_key>"
echo "[NEXT] Start services: APP_ROOT=$ROOT_DIR MESH_ROOT=$ROOT_DIR/mesh ./start.sh"
echo "[NEXT] Run baseline: HF_TOKEN=... API_BASE_URL=... MODEL_NAME=... python inference.py"
start.sh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Container entry point: start Redis and the Bun mesh services in the
# background, wait for the gateway and auth health endpoints, then hand the
# process over to the uvicorn API server.
set -euo pipefail

export APP_ROOT="${APP_ROOT:-/home/user/app}"
export MESH_ROOT="${MESH_ROOT:-/mesh}"

mkdir -p /tmp
mkdir -p "${MESH_ROOT}"

# Clear stale state BEFORE starting Redis so its freshly created log file is
# not deleted by the *.log cleanup.
rm -f /tmp/*.pid /tmp/*.log /tmp/worker_restart_count /tmp/consumer_stall_count /tmp/current_task
printf '0' > /tmp/worker_restart_count
printf '0' > /tmp/consumer_stall_count

redis-server --daemonize yes --logfile /tmp/redis.log --port 6379

# Wait (bounded) for Redis to answer PING instead of looping forever.
redis_ready=0
for _ in $(seq 1 150); do
  if redis-cli ping >/dev/null 2>&1; then
    redis_ready=1
    break
  fi
  sleep 0.2
done
if [ "$redis_ready" -ne 1 ]; then
  echo "[ERROR] Redis did not answer PING in time; see /tmp/redis.log" >&2
  exit 1
fi

# Write the default service registry.
cat > "${MESH_ROOT}/registry.json" <<'EOF'
{
  "services": {
    "auth": {"host": "localhost", "port": 3001, "protocol": "http"},
    "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
    "worker": {"host": "localhost", "port": null, "protocol": "internal"}
  }
}
EOF

: > /tmp/gateway.log
: > /tmp/auth.log
: > /tmp/worker.log
: > /tmp/job_gen.log

# Launch each service in the background and record its PID.
bun run "${APP_ROOT}/mesh/gateway/index.ts" >> /tmp/gateway.log &
echo $! > /tmp/gateway.pid

bun run "${APP_ROOT}/mesh/auth/index.ts" >> /tmp/auth.log &
echo $! > /tmp/auth.pid

bun run "${APP_ROOT}/mesh/worker/index.ts" >> /tmp/worker.log &
echo $! > /tmp/worker.pid

bun run "${APP_ROOT}/mesh/worker/job_generator.ts" >> /tmp/job_gen.log &
echo $! > /tmp/job_generator.pid

# Give the gateway and auth services up to 45 probes to become healthy.
services_healthy=0
for _ in $(seq 1 45); do
  if curl -sf http://localhost:3000/health >/dev/null && curl -sf http://localhost:3001/health >/dev/null; then
    services_healthy=1
    break
  fi
  sleep 1
done
if [ "$services_healthy" -ne 1 ]; then
  # Keep the original behaviour (start the API anyway) but make the timeout
  # visible.
  echo "[WARN] gateway/auth health checks did not pass; starting API anyway" >&2
fi

exec uvicorn server.api:app --host 0.0.0.0 --port 8000
tests/test_api_reset.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.testclient import TestClient
2
+
3
+ from server import api
4
+ from server.constants import TaskName
5
+ from server.models import Observation, SystemMetrics
6
+
7
+
8
class _FakeEnv:
    """Stand-in environment that records reset requests instead of acting on them."""

    def __init__(self) -> None:
        # Task names passed to reset(), in call order.
        self.reset_calls: list[TaskName] = []

    def start(self) -> None:
        """Do nothing."""

    def close(self) -> None:
        """Do nothing."""

    def reset(self, task_name: TaskName) -> Observation:
        """Record *task_name* and return a fixed all-zero observation."""
        self.reset_calls.append(task_name)
        zeroed_metrics = SystemMetrics(
            gateway_success_rate=0.0,
            gateway_p99_latency_ms=0.0,
            queue_depth=0,
            worker_restart_count=0,
            consumer_stall_count=0,
        )
        return Observation(
            command_output="ready",
            metrics=zeroed_metrics,
            process_status={"gateway": "running"},
        )
31
+
32
+
33
def test_reset_defaults_to_cascading_timeout_when_task_missing(monkeypatch) -> None:
    """POST /reset without a task resets the env with CASCADING_TIMEOUT."""
    created: dict[str, _FakeEnv] = {}

    def fake_env_factory() -> _FakeEnv:
        # Keep a reference so the test can inspect the env the app built.
        created["env"] = _FakeEnv()
        return created["env"]

    monkeypatch.setattr(api, "DistributedDebugEnv", fake_env_factory)

    with TestClient(api.app) as client:
        response = client.post("/reset", json={})

    assert response.status_code == 200
    assert created["env"].reset_calls == [TaskName.CASCADING_TIMEOUT]
48
+
49
+
50
def test_reset_rejects_unknown_explicit_task(monkeypatch) -> None:
    """An unknown ``task_name`` yields HTTP 400 and never reaches the env."""
    created: dict[str, _FakeEnv] = {}

    def fake_env_factory() -> _FakeEnv:
        # Keep a reference so the test can inspect the env the app built.
        created["env"] = _FakeEnv()
        return created["env"]

    monkeypatch.setattr(api, "DistributedDebugEnv", fake_env_factory)

    with TestClient(api.app) as client:
        response = client.post(
            "/reset",
            params={"task_name": "not-a-task"},
            json={},
        )

    assert response.status_code == 400
    assert response.json()["detail"] == "Unknown task: not-a-task"
    assert created["env"].reset_calls == []
tests/test_env_task_context.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from server.constants import DEFAULT_CONFIGS
5
+ from server.env import DistributedDebugEnv
6
+
7
+
8
+ def _write_json(path: Path, payload: dict) -> None:
9
+ path.parent.mkdir(parents=True, exist_ok=True)
10
+ path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
11
+
12
+
13
def test_restore_defaults_adds_job_generator_config(tmp_path: Path) -> None:
    """Restoring defaults writes the default job-generator config file."""
    mesh_root = tmp_path / "mesh"
    env = DistributedDebugEnv(project_root=tmp_path, mesh_root=mesh_root)

    env._restore_defaults()

    config_path = mesh_root / "worker" / "job_generator_config.json"
    written = json.loads(config_path.read_text())
    assert written == DEFAULT_CONFIGS["job_generator"]
23
+
24
+
25
def test_registry_auth_matches_default_detects_corruption(tmp_path: Path) -> None:
    """The registry check passes on defaults and fails once auth's host changes."""
    mesh_root = tmp_path / "mesh"
    env = DistributedDebugEnv(project_root=tmp_path, mesh_root=mesh_root)
    env._restore_defaults()

    assert env._is_registry_auth_default() is True

    # Same registry shape, but with the auth host altered.
    corrupted_registry = {
        "services": {
            "auth": {"host": "invalid-host", "port": 3001, "protocol": "http"},
            "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
            "worker": {"host": "localhost", "port": None, "protocol": "internal"},
        }
    }
    _write_json(mesh_root / "registry.json", corrupted_registry)

    assert env._is_registry_auth_default() is False
44
+
45
+
46
def test_job_generator_rate_resolved_uses_config(tmp_path: Path) -> None:
    """The interval and the resolved flag both follow the on-disk config."""
    mesh_root = tmp_path / "mesh"
    env = DistributedDebugEnv(project_root=tmp_path, mesh_root=mesh_root)
    env._restore_defaults()

    # With restored defaults, the interval is 333 ms and counts as resolved.
    assert env._job_generator_interval_ms() == 333
    assert env._is_job_generator_rate_resolved() is True

    config_path = mesh_root / "worker" / "job_generator_config.json"
    _write_json(config_path, {"interval_ms": 10})

    # After overwriting the config, the new interval is read back and the
    # rate is no longer considered resolved.
    assert env._job_generator_interval_ms() == 10
    assert env._is_job_generator_rate_resolved() is False
tests/test_graders.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from server.graders import (
2
+ grade_backpressure_cascade,
3
+ grade_byzantine_queue_fault,
4
+ grade_cascading_timeout,
5
+ grade_job_generator_runaway,
6
+ grade_registry_corruption,
7
+ grade_distributed_lock_starvation,
8
+ grade_route_partition,
9
+ )
10
+ from server.models import SystemMetrics
11
+
12
+
13
def _metrics(
    *,
    success_rate: float = 0.0,
    p99: float = 1000.0,
    depth: int = 10,
    restarts: int = 0,
    stalls: int = 0,
) -> SystemMetrics:
    """Build a ``SystemMetrics`` from short keyword names with test defaults."""
    field_values = {
        "gateway_success_rate": success_rate,
        "gateway_p99_latency_ms": p99,
        "queue_depth": depth,
        "worker_restart_count": restarts,
        "consumer_stall_count": stalls,
    }
    return SystemMetrics(**field_values)
28
+
29
+
30
def test_grade_cascading_timeout_boundaries() -> None:
    """Check cascading-timeout scores for resolved and unresolved contexts."""
    # (success_rate, resolved flag, expected score)
    cases = [
        (1.0, True, 1.0),
        (1.0, False, 0.25),
        (0.5, False, 0.125),
    ]
    for success_rate, resolved, expected in cases:
        score = grade_cascading_timeout(
            _metrics(success_rate=success_rate),
            {"cascading_timeout_resolved": resolved},
        )
        assert score == expected
49
+
50
+
51
def test_grade_byzantine_queue_fault_cases() -> None:
    """Check byzantine-queue scores against a baseline restart count of 3."""
    ctx = {"baseline_worker_restart_count": 3}
    # (queue depth, restart count, expected score)
    cases = [
        (0, 3, 1.0),
        (0, 8, 0.6),
        (40, 10, 0.0),
    ]
    for depth, restarts, expected in cases:
        metrics = _metrics(depth=depth, restarts=restarts)
        assert grade_byzantine_queue_fault(metrics, ctx) == expected
56
+
57
+
58
def test_grade_distributed_lock_starvation_cases() -> None:
    """Check lock-starvation scores with and without the lock present."""
    ctx_locked = {"baseline_consumer_stall_count": 0, "lock_exists": True}
    ctx_unlocked = {"baseline_consumer_stall_count": 0, "lock_exists": False}

    # (queue depth, stall count, context, expected score)
    cases = [
        (2, 0, ctx_unlocked, 1.0),
        (10, 0, ctx_unlocked, 0.6),
        (10, 3, ctx_locked, 0.0),
    ]
    for depth, stalls, ctx, expected in cases:
        metrics = _metrics(depth=depth, stalls=stalls)
        assert grade_distributed_lock_starvation(metrics, ctx) == expected
74
+
75
+
76
def test_grade_backpressure_cascade_continuous() -> None:
    """Check backpressure scores at queue depths 0, 100 and 200."""
    expected_by_depth = {0: 1.0, 100: 0.5, 200: 0.0}
    for depth, expected in expected_by_depth.items():
        assert grade_backpressure_cascade(_metrics(depth=depth), {}) == expected
80
+
81
+
82
def test_grade_route_partition_threshold() -> None:
    """Check route-partition scores for an unblocked and a blocked route."""
    unblocked_score = grade_route_partition(
        _metrics(success_rate=0.96), {"route_blocked": False}
    )
    assert unblocked_score == 1.0

    blocked_score = grade_route_partition(
        _metrics(success_rate=0.8), {"route_blocked": True}
    )
    assert blocked_score == 0.0
91
+
92
+
93
def test_grade_registry_corruption_thresholds() -> None:
    """Check registry-corruption scores across success rates and registry state."""
    # (success_rate, registry matches default, expected score)
    cases = [
        (0.99, True, 1.0),
        (0.8, True, 0.9),
        (1.0, False, 0.3),
    ]
    for success_rate, matches_default, expected in cases:
        score = grade_registry_corruption(
            _metrics(success_rate=success_rate),
            {"registry_auth_matches_default": matches_default},
        )
        assert score == expected
112
+
113
+
114
def test_grade_job_generator_runaway_thresholds() -> None:
    """Check job-generator-runaway scores across queue depths and rate state."""
    # (queue depth, rate resolved, expected score)
    cases = [
        (4, True, 1.0),
        (20, True, 0.7),
        (20, False, 0.2),
    ]
    for depth, rate_resolved, expected in cases:
        score = grade_job_generator_runaway(
            _metrics(depth=depth),
            {"job_generator_rate_resolved": rate_resolved},
        )
        assert score == expected
tests/test_inference_format.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from server.constants import TaskName
4
+ from inference import (
5
+ _attempt_history_block,
6
+ _episode_score,
7
+ _format_end_line,
8
+ _parse_tasks,
9
+ _single_line,
10
+ _task_symptom_block,
11
+ build_prompt,
12
+ extract_command,
13
+ extract_reasoning,
14
+ )
15
+ from server.models import Observation, SystemMetrics
16
+
17
+
18
def test_extract_command_rejects_non_json_code_fence() -> None:
    """A fenced shell snippet (not JSON) yields no command."""
    fenced_shell = "```bash\nredis-cli LLEN job_queue\n```"
    command = extract_command(fenced_shell)
    assert command is None
21
+
22
+
23
def test_extract_command_returns_none_when_empty() -> None:
    """Whitespace-only model output yields no command."""
    blank_output = " "
    assert extract_command(blank_output) is None
25
+
26
+
27
def test_extract_command_reads_json_payload() -> None:
    """A bare JSON object with a ``command`` key is parsed directly."""
    payload = '{"command":"redis-cli LLEN job_queue"}'
    command = extract_command(payload)
    assert command == "redis-cli LLEN job_queue"
30
+
31
+
32
def test_extract_command_reads_fenced_json_payload() -> None:
    """JSON wrapped in a ``json`` code fence is still parsed."""
    fenced_payload = '```json\n{"command":"ps -ef"}\n```'
    command = extract_command(fenced_payload)
    assert command == "ps -ef"
35
+
36
+
37
def test_extract_command_reads_json_embedded_in_text() -> None:
    """A JSON object surrounded by prose is located and parsed."""
    chatty_output = 'Use this command: {"command":"redis-cli LLEN job_queue"} thanks.'
    command = extract_command(chatty_output)
    assert command == "redis-cli LLEN job_queue"
40
+
41
+
42
def test_extract_command_reads_json_after_reasoning_preamble() -> None:
    """Command and reasoning are both read from JSON following a prose line."""
    preamble = "I'll start by checking process state.\n"
    payload = '{"command":"ps aux","reasoning":"list processes"}'
    raw = preamble + payload

    assert extract_command(raw) == "ps aux"
    assert extract_reasoning(raw) == "list processes"
49
+
50
+
51
def test_extract_command_prefers_first_json_object_with_command() -> None:
    """A leading JSON object without ``command`` is skipped for one that has it."""
    two_objects = '{"meta":"skip"} then {"command":"ls -la","reasoning":"explore"}'
    command = extract_command(two_objects)
    assert command == "ls -la"
54
+
55
+
56
def test_extract_reasoning_when_present() -> None:
    """Both ``command`` and ``reasoning`` are extracted from one JSON payload."""
    payload = '{"command":"redis-cli LLEN job_queue","reasoning":"check queue depth first"}'
    command, reasoning = extract_command(payload), extract_reasoning(payload)
    assert command == "redis-cli LLEN job_queue"
    assert reasoning == "check queue depth first"
60
+
61
+
62
def test_extract_command_requires_command_even_with_reasoning() -> None:
    """A payload with only ``reasoning`` yields neither command nor reasoning."""
    reasoning_only = '{"reasoning":"i should inspect logs"}'
    command, reasoning = extract_command(reasoning_only), extract_reasoning(reasoning_only)
    assert command is None
    assert reasoning is None
66
+
67
+
68
def test_single_line_removes_newlines() -> None:
    """A newline between two commands is collapsed into a single space."""
    flattened = _single_line("echo a\necho b")
    assert flattened == "echo a echo b"
70
+
71
+
72
def test_task_symptom_block_is_non_empty() -> None:
    """Route-partition symptoms mention connectivity but not the task id."""
    symptoms = _task_symptom_block(TaskName.ROUTE_PARTITION)
    assert "connectivity path issue" in symptoms
    # The symptom text must not contain the task identifier itself.
    assert "route-partition" not in symptoms
76
+
77
+
78
def test_task_symptom_block_includes_new_tasks() -> None:
    """Registry-corruption and job-generator-runaway tasks have symptom text."""
    registry_symptoms = _task_symptom_block(TaskName.REGISTRY_CORRUPTION)
    runaway_symptoms = _task_symptom_block(TaskName.JOB_GENERATOR_RUNAWAY)

    assert "registry" in registry_symptoms.lower()
    assert "queue" in runaway_symptoms.lower()
    # The symptom text must not contain the task identifier itself.
    assert "job-generator-runaway" not in runaway_symptoms
85
+
86
+
87
def test_attempt_history_block_renders_all_attempts() -> None:
    """Every attempt is rendered with command, reasoning and error, but no reward."""
    first_attempt = {
        "step": 1,
        "command": "redis-cli LLEN job_queue",
        "reasoning": "check backlog",
        "reward": 0.12,
        "error": None,
    }
    second_attempt = {
        "step": 2,
        "command": "curl -s localhost:3000/health",
        "reasoning": None,
        "reward": 0.08,
        "error": "timeout",
    }

    block = _attempt_history_block([first_attempt, second_attempt])

    expected_fragments = (
        "step 1: command=redis-cli LLEN job_queue",
        "step 2: command=curl -s localhost:3000/health",
        "reasoning=check backlog",
        "error=timeout",
    )
    for fragment in expected_fragments:
        assert fragment in block
    assert "reward=" not in block
110
+
111
+
112
def test_build_prompt_contains_symptoms_and_history() -> None:
    """The prompt has symptom, history and output sections and omits rewards."""
    degraded_metrics = SystemMetrics(
        gateway_success_rate=0.32,
        gateway_p99_latency_ms=1500.0,
        queue_depth=412,
        worker_restart_count=3,
        consumer_stall_count=2,
    )
    obs = Observation(
        command_output="service checks show partial failures",
        metrics=degraded_metrics,
        process_status={"gateway": "running", "worker": "running"},
    )
    history = [
        {
            "step": 1,
            "command": "redis-cli LLEN job_queue",
            "reasoning": "measure backlog",
            "reward": 0.10,
            "error": None,
        }
    ]

    prompt = build_prompt(
        obs=obs,
        step_num=3,
        task_name=TaskName.BACKPRESSURE_CASCADE,
        attempt_history=history,
    )

    required_sections = (
        "TASK SYMPTOMS:",
        "PREVIOUS ATTEMPTS:",
        "step 1: command=redis-cli LLEN job_queue",
        "LATEST COMMAND OUTPUT:",
    )
    for section in required_sections:
        assert section in prompt
    assert "reward=" not in prompt
143
+
144
+
145
def test_parse_tasks_default_and_override() -> None:
    """``_parse_tasks`` uses its defaults unless ``TASKS_CSV`` overrides them."""
    saved_value = os.environ.get("TASKS_CSV")
    try:
        # No override: the default task list is returned.
        os.environ.pop("TASKS_CSV", None)
        assert _parse_tasks() == [
            TaskName.CASCADING_TIMEOUT,
            TaskName.BYZANTINE_QUEUE_FAULT,
            TaskName.DISTRIBUTED_LOCK_STARVATION,
        ]

        # Each CSV override maps to its task list, in order.
        overrides = [
            (
                "route-partition,backpressure-cascade",
                [TaskName.ROUTE_PARTITION, TaskName.BACKPRESSURE_CASCADE],
            ),
            (
                "registry-corruption,job-generator-runaway",
                [TaskName.REGISTRY_CORRUPTION, TaskName.JOB_GENERATOR_RUNAWAY],
            ),
        ]
        for csv_value, expected_tasks in overrides:
            os.environ["TASKS_CSV"] = csv_value
            assert _parse_tasks() == expected_tasks
    finally:
        # Put the environment back exactly as it was found.
        if saved_value is not None:
            os.environ["TASKS_CSV"] = saved_value
        else:
            os.environ.pop("TASKS_CSV", None)
172
+
173
+
174
+
175
def test_episode_score_clamps_terminal_reward_to_unit_interval() -> None:
    """Check episode scores for empty, in-range and out-of-range reward lists."""
    # (reward list, expected episode score)
    cases = [
        ([], 0.0),
        ([0.2, 0.8], 0.8),
        ([1.2], 1.0),
        ([-0.1], 0.0),
    ]
    for rewards, expected in cases:
        assert _episode_score(rewards) == expected
180
+
181
+
182
def test_end_log_line_includes_score_and_reward_list() -> None:
    """The ``[END]`` line carries success, steps, score and rewards at two decimals."""
    expected = "[END] success=true steps=3 score=0.99 rewards=0.00,0.12,1.00"
    actual = _format_end_line(
        success=True,
        steps=3,
        score=0.987,
        rewards=[0.0, 0.125, 1.0],
    )
    assert actual == expected
tests/test_models.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import ValidationError
2
+
3
+ from server.models import Action, Observation, SystemMetrics
4
+
5
+
6
def test_system_metrics_rejects_success_rate_above_one() -> None:
    """Constructing metrics with a success rate of 1.2 raises ValidationError."""
    invalid_fields = {
        "gateway_success_rate": 1.2,
        "gateway_p99_latency_ms": 20,
        "queue_depth": 0,
        "worker_restart_count": 0,
        "consumer_stall_count": 0,
    }
    try:
        SystemMetrics(**invalid_fields)
    except ValidationError:
        pass
    else:
        raise AssertionError("Expected ValidationError for success rate > 1.0")
18
+
19
+
20
def test_observation_roundtrip() -> None:
    """An Observation survives a JSON dump and validate round trip unchanged."""
    metrics = SystemMetrics(
        gateway_success_rate=0.7,
        gateway_p99_latency_ms=123,
        queue_depth=3,
        worker_restart_count=1,
        consumer_stall_count=2,
    )
    original = Observation(
        command_output="ok",
        metrics=metrics,
        process_status={"gateway": "running pid=42"},
    )

    serialized = original.model_dump_json()
    restored = Observation.model_validate_json(serialized)

    assert restored == original
35
+
36
+
37
def test_action_rejects_empty_command() -> None:
    """An Action whose command is only whitespace raises ValidationError."""
    try:
        Action(command=" ")
    except ValidationError:
        pass
    else:
        raise AssertionError("Expected ValidationError for empty command")
uv.lock ADDED
The diff for this file is too large to render. See raw diff