chore: deploy distributed systems debug environment
Browse files- .gitignore +36 -0
- .pytest_cache/.gitignore +2 -0
- .pytest_cache/CACHEDIR.TAG +4 -0
- .pytest_cache/README.md +8 -0
- .pytest_cache/v/cache/nodeids +34 -0
- Dockerfile +40 -0
- README.md +242 -7
- inference.py +418 -0
- mesh/auth/bun.lock +19 -0
- mesh/auth/config.json +3 -0
- mesh/auth/index.ts +83 -0
- mesh/auth/package.json +10 -0
- mesh/auth/tsconfig.json +10 -0
- mesh/gateway/blocked_routes.json +3 -0
- mesh/gateway/bun.lock +44 -0
- mesh/gateway/config.json +3 -0
- mesh/gateway/index.ts +213 -0
- mesh/gateway/package.json +13 -0
- mesh/gateway/tsconfig.json +10 -0
- mesh/registry.json +19 -0
- mesh/worker/bun.lock +44 -0
- mesh/worker/config.json +4 -0
- mesh/worker/index.ts +198 -0
- mesh/worker/job_generator.ts +94 -0
- mesh/worker/job_generator_config.json +3 -0
- mesh/worker/package.json +14 -0
- mesh/worker/tsconfig.json +10 -0
- openenv.yaml +86 -0
- pyproject.toml +32 -0
- requirements.txt +7 -0
- server/__init__.py +0 -0
- server/api.py +63 -0
- server/app.py +11 -0
- server/constants.py +54 -0
- server/env.py +467 -0
- server/fault_injector.py +95 -0
- server/graders.py +109 -0
- server/metrics_poller.py +96 -0
- server/models.py +41 -0
- server/process_manager.py +164 -0
- setup-dev.sh +86 -0
- start.sh +50 -0
- tests/test_api_reset.py +65 -0
- tests/test_env_task_context.py +57 -0
- tests/test_graders.py +132 -0
- tests/test_inference_format.py +186 -0
- tests/test_models.py +42 -0
- uv.lock +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
docs/
|
| 2 |
+
|
| 3 |
+
# Python
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.pyo
|
| 7 |
+
.venv/
|
| 8 |
+
venv/
|
| 9 |
+
.env
|
| 10 |
+
*.egg-info/
|
| 11 |
+
dist/
|
| 12 |
+
build/
|
| 13 |
+
|
| 14 |
+
# Bun / Node
|
| 15 |
+
node_modules/
|
| 16 |
+
bun.lockb
|
| 17 |
+
|
| 18 |
+
# SQLite
|
| 19 |
+
*.db
|
| 20 |
+
*.sqlite
|
| 21 |
+
*.sqlite3
|
| 22 |
+
|
| 23 |
+
# Logs
|
| 24 |
+
*.log
|
| 25 |
+
/tmp/
|
| 26 |
+
|
| 27 |
+
# OS
|
| 28 |
+
.DS_Store
|
| 29 |
+
Thumbs.db
|
| 30 |
+
|
| 31 |
+
# IDE
|
| 32 |
+
.cursor/
|
| 33 |
+
.vscode/
|
| 34 |
+
*.swp
|
| 35 |
+
|
| 36 |
+
inference.out
|
.pytest_cache/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Created by pytest automatically.
|
| 2 |
+
*
|
.pytest_cache/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
.pytest_cache/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
.pytest_cache/v/cache/nodeids
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"tests/test_api_reset.py::test_reset_defaults_to_cascading_timeout_when_task_missing",
|
| 3 |
+
"tests/test_api_reset.py::test_reset_rejects_unknown_explicit_task",
|
| 4 |
+
"tests/test_env_task_context.py::test_job_generator_rate_resolved_uses_config",
|
| 5 |
+
"tests/test_env_task_context.py::test_registry_auth_matches_default_detects_corruption",
|
| 6 |
+
"tests/test_env_task_context.py::test_restore_defaults_adds_job_generator_config",
|
| 7 |
+
"tests/test_graders.py::test_grade_backpressure_cascade_continuous",
|
| 8 |
+
"tests/test_graders.py::test_grade_byzantine_queue_fault_cases",
|
| 9 |
+
"tests/test_graders.py::test_grade_cascading_timeout_boundaries",
|
| 10 |
+
"tests/test_graders.py::test_grade_distributed_lock_starvation_cases",
|
| 11 |
+
"tests/test_graders.py::test_grade_job_generator_runaway_thresholds",
|
| 12 |
+
"tests/test_graders.py::test_grade_registry_corruption_thresholds",
|
| 13 |
+
"tests/test_graders.py::test_grade_route_partition_threshold",
|
| 14 |
+
"tests/test_inference_format.py::test_attempt_history_block_renders_all_attempts",
|
| 15 |
+
"tests/test_inference_format.py::test_build_prompt_contains_symptoms_and_history",
|
| 16 |
+
"tests/test_inference_format.py::test_end_log_line_includes_score_and_reward_list",
|
| 17 |
+
"tests/test_inference_format.py::test_episode_score_clamps_terminal_reward_to_unit_interval",
|
| 18 |
+
"tests/test_inference_format.py::test_extract_command_prefers_first_json_object_with_command",
|
| 19 |
+
"tests/test_inference_format.py::test_extract_command_reads_fenced_json_payload",
|
| 20 |
+
"tests/test_inference_format.py::test_extract_command_reads_json_after_reasoning_preamble",
|
| 21 |
+
"tests/test_inference_format.py::test_extract_command_reads_json_embedded_in_text",
|
| 22 |
+
"tests/test_inference_format.py::test_extract_command_reads_json_payload",
|
| 23 |
+
"tests/test_inference_format.py::test_extract_command_rejects_non_json_code_fence",
|
| 24 |
+
"tests/test_inference_format.py::test_extract_command_requires_command_even_with_reasoning",
|
| 25 |
+
"tests/test_inference_format.py::test_extract_command_returns_none_when_empty",
|
| 26 |
+
"tests/test_inference_format.py::test_extract_reasoning_when_present",
|
| 27 |
+
"tests/test_inference_format.py::test_parse_tasks_default_and_override",
|
| 28 |
+
"tests/test_inference_format.py::test_single_line_removes_newlines",
|
| 29 |
+
"tests/test_inference_format.py::test_task_symptom_block_includes_new_tasks",
|
| 30 |
+
"tests/test_inference_format.py::test_task_symptom_block_is_non_empty",
|
| 31 |
+
"tests/test_models.py::test_action_rejects_empty_command",
|
| 32 |
+
"tests/test_models.py::test_observation_roundtrip",
|
| 33 |
+
"tests/test_models.py::test_system_metrics_rejects_success_rate_above_one"
|
| 34 |
+
]
|
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim

# System tools used by the services and by the agent's shell commands.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    redis-server \
    jq \
    procps \
    sqlite3 \
    unzip \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

RUN useradd -m -u 1000 user
WORKDIR /home/user/app

# Create the /mesh alias while still root (an unprivileged user cannot write
# to /). It must be a symlink from the start: if /mesh already exists as a
# real directory, `ln -sfn <target> /mesh` places the link *inside* it
# (/mesh/mesh) and paths such as /mesh/gateway/... do not resolve.
# The link dangles until mesh/ is copied below, which is fine.
RUN ln -sfn /home/user/app/mesh /mesh && chown -R user:user /home/user
USER user

ENV HOME=/home/user
ENV PATH="/home/user/.bun/bin:/home/user/.local/bin:${PATH}"

# Installs bun under $HOME/.bun for the non-root user.
RUN curl -fsSL https://bun.sh/install | bash

# Install service dependencies before copying the Python sources so these
# layers stay cached when only Python code changes.
COPY --chown=user:user mesh/ ./mesh/
RUN cd mesh/gateway && bun install
RUN cd mesh/auth && bun install
RUN cd mesh/worker && bun install

COPY --chown=user:user requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user:user *.py ./
COPY --chown=user:user server/ ./server/
COPY --chown=user:user openenv.yaml ./
COPY --chown=user:user start.sh ./
RUN chmod +x ./start.sh

EXPOSE 8000
CMD ["./start.sh"]
|
README.md
CHANGED
|
@@ -1,9 +1,244 @@
|
|
| 1 |
-
|
| 2 |
-
title: openenv-distributed-systems-debugging
|
| 3 |
-
sdk: docker
|
| 4 |
-
short_description: OpenEnv RL env for debugging distributed systems failures.
|
| 5 |
-
---
|
| 6 |
|
| 7 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
OpenEnv-compatible RL environment for debugging distributed systems failures.
|
|
|
|
| 1 |
+
# Distributed Systems Debug Environment
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
## Overview
|
| 4 |
+
This project provides an OpenEnv-compatible RL environment for debugging distributed systems failures.
|
| 5 |
+
|
| 6 |
+
The environment simulates a production-style pipeline:
|
| 7 |
+
|
| 8 |
+
- Gateway service (sync HTTP orchestration)
|
| 9 |
+
- Auth service (sync dependency)
|
| 10 |
+
- Redis queue (message bus)
|
| 11 |
+
- Worker service (async consumer + lock handling)
|
| 12 |
+
- SQLite sink (persistence simulation)
|
| 13 |
+
|
| 14 |
+
An agent interacts only through shell commands and must diagnose/fix injected faults.
|
| 15 |
+
|
| 16 |
+
## Why this environment
|
| 17 |
+
Most RL environments focus on games or synthetic workflows. This one targets bugs that I have personally faced at my job, focusing on the debugging skills used in real systems engineering:
|
| 18 |
+
|
| 19 |
+
- reading logs under uncertainty
|
| 20 |
+
- triaging latency and queue symptoms
|
| 21 |
+
- fixing misconfigurations safely
|
| 22 |
+
- validating recovery from metrics
|
| 23 |
+
|
| 24 |
+
## Architecture
|
| 25 |
+
```
|
| 26 |
+
Agent command -> /step (FastAPI)
|
| 27 |
+
|
|
| 28 |
+
+-> executes shell command (sandboxed, 10s timeout)
|
| 29 |
+
+-> polls metrics
|
| 30 |
+
+-> grades progress
|
| 31 |
+
|
| 32 |
+
Services (same container):
|
| 33 |
+
gateway:3000 -> auth:3001 -> redis:6379 -> worker -> sqlite
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
## Observation Space
|
| 37 |
+
| Field | Type | Description |
|
| 38 |
+
|---|---|---|
|
| 39 |
+
| `command_output` | string | stdout+stderr of last command |
|
| 40 |
+
| `metrics.gateway_success_rate` | float [0,1] | rolling gateway success rate |
|
| 41 |
+
| `metrics.gateway_p99_latency_ms` | float | rolling p99 latency |
|
| 42 |
+
| `metrics.queue_depth` | int | Redis queue depth |
|
| 43 |
+
| `metrics.worker_restart_count` | int | simulated crash-loop count |
|
| 44 |
+
| `metrics.consumer_stall_count` | int | lock-starvation stall count |
|
| 45 |
+
| `process_status` | object | runtime status by service |
|
| 46 |
+
|
| 47 |
+
## Action Space
|
| 48 |
+
Single command action:
|
| 49 |
+
|
| 50 |
+
```json
|
| 51 |
+
{ "command": "<bash command>" }
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
Examples:
|
| 55 |
+
- `tail -20 /tmp/worker.log`
|
| 56 |
+
- `redis-cli DEL LOCK:job_processor`
|
| 57 |
+
- `cat /mesh/gateway/blocked_routes.json`
|
| 58 |
+
- `kill -HUP $(cat /tmp/worker.pid)`
|
| 59 |
+
|
| 60 |
+
## Tasks
|
| 61 |
+
| Task | Difficulty | Goal |
|
| 62 |
+
|---|---|---|
|
| 63 |
+
| `cascading-timeout` | easy | restore successful sync flow (auth delay vs gateway timeout) |
|
| 64 |
+
| `byzantine-queue-fault` | medium | remove poison message and stabilize worker |
|
| 65 |
+
| `distributed-lock-starvation` | hard | clear stale lock and resume consumption |
|
| 66 |
+
| `backpressure-cascade` | hard | recover throughput and reduce queue growth |
|
| 67 |
+
| `route-partition` | hard | unblock gateway->redis route policy |
|
| 68 |
+
| `registry-corruption` | medium | repair corrupted auth registry entry and restore request flow |
|
| 69 |
+
| `job-generator-runaway` | hard | reduce enqueue pressure so the queue drains sustainably |
|
| 70 |
+
|
| 71 |
+
## Reward Function
|
| 72 |
+
- Terminal reward: `1.0` when grader score >= `0.95`
|
| 73 |
+
- Dense shaping from grader progress + investigation command bonus + metric improvements
|
| 74 |
+
- Penalties for blocked/damaging actions and repeated non-productive behavior
|
| 75 |
+
- Reward clamped to `[0.0, 1.0]`
|
| 76 |
+
|
| 77 |
+
## Baseline Inference policy (3 of 7 by default)
|
| 78 |
+
All seven tasks are implemented in the environment.
|
| 79 |
+
|
| 80 |
+
`inference.py` runs these default tasks for runtime reliability:
|
| 81 |
+
|
| 82 |
+
1. `cascading-timeout` (easy)
|
| 83 |
+
2. `byzantine-queue-fault` (medium)
|
| 84 |
+
3. `distributed-lock-starvation` (hard)
|
| 85 |
+
|
| 86 |
+
Override with:
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
TASKS_CSV=cascading-timeout,route-partition python inference.py
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
## Setup
|
| 93 |
+
### Local
|
| 94 |
+
```bash
|
| 95 |
+
python3.12 -m venv .venv
|
| 96 |
+
. .venv/bin/activate
|
| 97 |
+
pip install -r requirements.txt
|
| 98 |
+
|
| 99 |
+
bun install --cwd mesh/gateway
|
| 100 |
+
bun install --cwd mesh/auth
|
| 101 |
+
bun install --cwd mesh/worker
|
| 102 |
+
|
| 103 |
+
APP_ROOT=$(pwd) MESH_ROOT=$(pwd)/mesh ./start.sh
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Docker
|
| 107 |
+
```bash
|
| 108 |
+
docker build -t dist-debug-env .
|
| 109 |
+
docker run -p 8000:8000 dist-debug-env
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
### API smoke check
|
| 113 |
+
```bash
|
| 114 |
+
curl http://localhost:8000/health
|
| 115 |
+
curl -X POST "http://localhost:8000/reset?task_name=cascading-timeout"
|
| 116 |
+
curl -X POST http://localhost:8000/step \
|
| 117 |
+
-H "Content-Type: application/json" \
|
| 118 |
+
-d '{"command":"ls /tmp"}'
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
## Inference script contract
|
| 122 |
+
`inference.py` emits strict logs:
|
| 123 |
+
|
| 124 |
+
```text
|
| 125 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 126 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 127 |
+
[END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...,rn>
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## Logging
|
| 131 |
+
Service logs (JSON-lines):
|
| 132 |
+
- `/tmp/gateway.log`
|
| 133 |
+
- `/tmp/auth.log`
|
| 134 |
+
- `/tmp/worker.log`
|
| 135 |
+
|
| 136 |
+
Common fields:
|
| 137 |
+
- `ts`, `level`, `service`, `event`, `pattern`
|
| 138 |
+
|
| 139 |
+
Example investigation commands:
|
| 140 |
+
```bash
|
| 141 |
+
tail -30 /tmp/worker.log
|
| 142 |
+
jq 'select(.level=="ERROR")' /tmp/worker.log
|
| 143 |
+
redis-cli LLEN job_queue
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
## Baseline scores
|
| 147 |
+
Baseline scores depend on endpoint/model latency and quality. Reproduce with:
|
| 148 |
+
|
| 149 |
+
```bash
|
| 150 |
+
HF_TOKEN=<token> API_BASE_URL=<endpoint> MODEL_NAME=<model> python inference.py
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
## Run this locally
|
| 155 |
+
Use this checklist when running the full baseline end-to-end on your machine.
|
| 156 |
+
|
| 157 |
+
1. Install dependencies and validate project setup:
|
| 158 |
+
```bash
|
| 159 |
+
./setup-dev.sh
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
2. Activate the project virtual environment (required so `uvicorn` and Python deps are on PATH):
|
| 163 |
+
```bash
|
| 164 |
+
source .venv/bin/activate
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
3. Start the environment API (leave this terminal running):
|
| 168 |
+
```bash
|
| 169 |
+
APP_ROOT=$(pwd) MESH_ROOT=$(pwd)/mesh ./start.sh
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
4. In another terminal, activate venv again and export required inference variables:
|
| 173 |
+
```bash
|
| 174 |
+
source .venv/bin/activate
|
| 175 |
+
export API_BASE_URL="https://openrouter.ai/api/v1"
|
| 176 |
+
export MODEL_NAME="<your-model>"
|
| 177 |
+
export HF_TOKEN="<your-api-key>"
|
| 178 |
+
|
| 179 |
+
# Optional override; default already runs 3 baseline tasks
|
| 180 |
+
export TASKS_CSV="cascading-timeout,byzantine-queue-fault,distributed-lock-starvation"
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
If you have a `.env` file, you can load the variables from it with these commands:
|
| 184 |
+
|
| 185 |
+
```bash
|
| 186 |
+
set -a
|
| 187 |
+
source .env
|
| 188 |
+
set +a
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
5. Run inference with a 20 minute cap and capture output:
|
| 192 |
+
```bash
|
| 193 |
+
# macOS (coreutils): gtimeout ; Linux: timeout
|
| 194 |
+
gtimeout 1200 python inference.py | tee inference.out
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
6. Validate structured stdout format quickly:
|
| 198 |
+
```bash
|
| 199 |
+
python - <<'PY'
|
| 200 |
+
# Validate that every [START]/[STEP]/[END] line in inference.out follows the
# structured log format. Exits with status 1 on the first malformed line.
import re
from pathlib import Path

lines = Path("inference.out").read_text(encoding="utf-8").splitlines()
if not lines:
    print("FAIL: no output")
    raise SystemExit(1)

# The documented log contract shows a single space after each tag, so accept
# any run of whitespace after the tag instead of an exact count; this keeps
# both single-space and column-aligned output valid.
start_re = re.compile(r'^\[START\]\s+task=\S+ env=\S+ model=.+$')
step_re = re.compile(r'^\[STEP\]\s+step=\d+ action=.* reward=\d+\.\d{2} done=(true|false) error=.*$')
end_re = re.compile(r'^\[END\]\s+success=(true|false) steps=\d+ score=\d+\.\d{2} rewards=.*$')

for i, line in enumerate(lines, 1):
    if line.startswith("[START]") and not start_re.match(line):
        print(f"FAIL: bad START line {i}: {line}")
        raise SystemExit(1)
    if line.startswith("[STEP]") and not step_re.match(line):
        print(f"FAIL: bad STEP line {i}: {line}")
        raise SystemExit(1)
    if line.startswith("[END]") and not end_re.match(line):
        print(f"FAIL: bad END line {i}: {line}")
        raise SystemExit(1)

print("PASS: stdout format valid")
|
| 224 |
+
PY
|
| 225 |
+
```
|
| 226 |
+
|
| 227 |
+
7. Re-run required submission gates:
|
| 228 |
+
```bash
|
| 229 |
+
openenv validate .
|
| 230 |
+
docker build -t dist-debug-env:local .
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
## Benchmarks between Models
|
| 238 |
+
|
| 239 |
+
### 3 Tasks Benchmark
|
| 240 |
+
<img width="1177" height="752" alt="Screenshot 2026-04-04 at 11 54 25 PM" src="https://github.com/user-attachments/assets/3dbfa87a-6696-4589-a908-baa3f498bda8" />
|
| 241 |
+
|
| 242 |
+
### 7 Tasks Benchmark
|
| 243 |
+
<img width="1294" height="240" alt="Screenshot 2026-04-05 at 12 30 45 AM" src="https://github.com/user-attachments/assets/1d0d3847-212e-46ba-967f-f79be3f9067c" />
|
| 244 |
|
|
|
inference.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from typing import Any, Iterator
|
| 5 |
+
|
| 6 |
+
import httpx
|
| 7 |
+
from server.constants import (
|
| 8 |
+
DEFAULT_BASELINE_TASK_ENUMS,
|
| 9 |
+
NO_COMMAND_PROVIDED_SENTINEL,
|
| 10 |
+
TASK_MAX_STEPS,
|
| 11 |
+
TaskName,
|
| 12 |
+
)
|
| 13 |
+
from server.models import Action, Observation, StepResult
|
| 14 |
+
|
| 15 |
+
def _env_or_default(name: str, default: str) -> str:
    """Return environment variable *name*, or *default* when unset or blank.

    A variable that is exported but empty (for example `MAX_STEPS=` in a
    sourced .env file) would otherwise reach `int()`/`float()` as "" and
    raise ValueError at import time.
    """
    return (os.getenv(name) or "").strip() or default


# Endpoint, model and credentials for the chat-completions API.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# HF_TOKEN wins over API_KEY; the result is None when neither is set.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
# Base URL of the environment API (the /reset and /step server).
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
BENCHMARK = "distributed-systems-debug-env"
# Numeric tunables; blank values fall back to the defaults shown here.
MAX_STEPS_CAP = int(_env_or_default("MAX_STEPS", "0"))
TEMPERATURE = float(_env_or_default("TEMPERATURE", "0.2"))
MAX_COMPLETION_TOKENS = int(_env_or_default("MAX_COMPLETION_TOKENS", "2048"))
# Shared decoder so raw_decode() can scan for JSON embedded in free text.
_JSON_DECODER = json.JSONDecoder()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _chat_token_limit_kwargs() -> dict[str, int]:
    """Build the token-limit keyword argument for the chat-completions call.

    Per the original author's note, OpenAI `gpt-5.*` and some other models
    require `max_completion_tokens` rather than `max_tokens`.

    Resolution order:
      1. `CHAT_TOKEN_LIMIT_PARAM` env var, when it is exactly `max_tokens`
         or `max_completion_tokens` (case-insensitive, surrounding
         whitespace ignored).
      2. `max_completion_tokens` when the base URL contains
         "api.openai.com".
      3. `max_tokens` otherwise.

    Returns:
        A one-entry dict mapping the chosen parameter name to
        MAX_COMPLETION_TOKENS.
    """
    explicit = os.getenv("CHAT_TOKEN_LIMIT_PARAM", "").strip().lower()
    if explicit in ("max_tokens", "max_completion_tokens"):
        return {explicit: MAX_COMPLETION_TOKENS}

    # Any other override value is ignored and the endpoint decides.
    if "api.openai.com" in (API_BASE_URL or ""):
        param = "max_completion_tokens"
    else:
        param = "max_tokens"
    return {param: MAX_COMPLETION_TOKENS}
|
| 37 |
+
|
| 38 |
+
SYSTEM_PROMPT = """You have bash access to a distributed job processing pipeline that is experiencing a failure.
|
| 39 |
+
Use bash commands to investigate system behavior and narrow down likely fault conditions.
|
| 40 |
+
Standard Unix tools are available: ps, ls, cat, grep, tail, curl, jq, redis-cli, kill, sed.
|
| 41 |
+
Work iteratively across multiple steps; each response must provide the next bash command only.
|
| 42 |
+
Respond with compact JSON where `command` is required: {"command":"<bash command>","reasoning":"optional concise reason"}.
|
| 43 |
+
No markdown. No explanation outside JSON."""
|
| 44 |
+
|
| 45 |
+
TASK_SYMPTOMS: dict[TaskName, tuple[str, ...]] = {
|
| 46 |
+
TaskName.CASCADING_TIMEOUT: (
|
| 47 |
+
"Requests intermittently fail even when services appear up.",
|
| 48 |
+
"Latency spikes sharply during traffic bursts.",
|
| 49 |
+
),
|
| 50 |
+
TaskName.BYZANTINE_QUEUE_FAULT: (
|
| 51 |
+
"Worker throughput degrades after specific jobs enter the queue.",
|
| 52 |
+
"Queue backlog grows despite workers being alive.",
|
| 53 |
+
),
|
| 54 |
+
TaskName.DISTRIBUTED_LOCK_STARVATION: (
|
| 55 |
+
"One or more workers appear blocked for extended periods.",
|
| 56 |
+
"Work completion remains low without full service outage.",
|
| 57 |
+
),
|
| 58 |
+
TaskName.BACKPRESSURE_CASCADE: (
|
| 59 |
+
"Queue depth trends upward over time under steady load.",
|
| 60 |
+
),
|
| 61 |
+
TaskName.ROUTE_PARTITION: (
|
| 62 |
+
"Gateway requests intermittently fail despite local process health.",
|
| 63 |
+
"Signals point to a connectivity path issue rather than a full service outage.",
|
| 64 |
+
),
|
| 65 |
+
TaskName.REGISTRY_CORRUPTION: (
|
| 66 |
+
"Gateway requests fail even though the gateway process is still healthy.",
|
| 67 |
+
"Logs and config inspection suggest a bad upstream registry entry.",
|
| 68 |
+
),
|
| 69 |
+
TaskName.JOB_GENERATOR_RUNAWAY: (
|
| 70 |
+
"Queue backlog grows while the worker stays alive.",
|
| 71 |
+
"Producer pressure appears higher than the system can sustainably drain.",
|
| 72 |
+
),
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class DistributedDebugEnvClient:
    """Small synchronous HTTP client for the environment's /reset and /step endpoints."""

    def __init__(self, base_url: str) -> None:
        """Create the underlying httpx client with a 45 second timeout."""
        self._client = httpx.Client(base_url=base_url, timeout=45.0)

    def close(self) -> None:
        """Close the underlying httpx client."""
        self._client.close()

    def reset(self, task_name: str) -> Observation:
        """POST /reset with *task_name* as a query parameter.

        Returns:
            The response body validated as an Observation.

        Raises:
            httpx.HTTPStatusError: via raise_for_status() on an error status.
        """
        reply = self._client.post("/reset", params={"task_name": task_name})
        reply.raise_for_status()
        body = reply.json()
        return Observation.model_validate(body)

    def step(self, action: Action) -> StepResult:
        """POST /step with the action serialized as the JSON body.

        Returns:
            The response body validated as a StepResult.

        Raises:
            httpx.HTTPStatusError: via raise_for_status() on an error status.
        """
        reply = self._client.post("/step", json=action.model_dump())
        reply.raise_for_status()
        body = reply.json()
        return StepResult.model_validate(body)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _parse_tasks() -> list[TaskName]:
    """Resolve the task list to run from the `TASKS_CSV` environment variable.

    Returns:
        A copy of DEFAULT_BASELINE_TASK_ENUMS when `TASKS_CSV` is unset or
        blank; otherwise one `TaskName.parse(...)` result per non-empty
        comma-separated entry, in order. Errors raised by `TaskName.parse`
        propagate to the caller.
    """
    raw = os.getenv("TASKS_CSV", "").strip()
    if raw:
        # Empty entries (e.g. from a trailing comma) are skipped.
        names = (piece.strip() for piece in raw.split(","))
        return [TaskName.parse(name) for name in names if name]
    return list(DEFAULT_BASELINE_TASK_ENUMS)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _bool(value: bool) -> str:
    """Render *value* as the lowercase literal "true" or "false" for log lines."""
    if value:
        return "true"
    return "false"
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _single_line(text: str) -> str:
    """Flatten *text* onto one line.

    Tabs become spaces, line breaks are replaced by a single space each,
    and leading/trailing whitespace is stripped. Runs of spaces inside the
    text are not collapsed.
    """
    untabbed = text.replace("\t", " ")
    pieces = untabbed.splitlines()
    return " ".join(pieces).strip()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _command_from_dict(payload: dict[str, Any]) -> tuple[str | None, str | None]:
|
| 118 |
+
command_value = payload.get("command")
|
| 119 |
+
command = command_value.strip() if isinstance(command_value, str) else ""
|
| 120 |
+
if not command:
|
| 121 |
+
return None, None
|
| 122 |
+
|
| 123 |
+
reasoning_value = payload.get("reasoning")
|
| 124 |
+
reasoning = reasoning_value.strip() if isinstance(reasoning_value, str) else ""
|
| 125 |
+
return command, (reasoning or None)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _parse_action_payload(text: str) -> tuple[str | None, str | None]:
    """Parse *text* as a complete JSON document and extract the action.

    Returns:
        The `_command_from_dict` result when *text* decodes to a JSON
        object; `(None, None)` when it is not valid JSON or decodes to
        anything other than an object.
    """
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return None, None

    if isinstance(decoded, dict):
        return _command_from_dict(decoded)
    return None, None
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def _iter_decoded_json_objects(text: str) -> Iterator[Any]:
    """Yield every JSON value that can be decoded starting at a `{` in *text*.

    Scans left to right. A `{` that does not start valid JSON is skipped
    and scanning resumes at the next character. After a successful decode,
    scanning resumes at the end of the decoded value, so objects nested
    inside an already-yielded object are not yielded separately.
    """
    cursor = text.find("{")
    while cursor != -1:
        try:
            value, stop = _JSON_DECODER.raw_decode(text, cursor)
        except json.JSONDecodeError:
            # Not a JSON object here; try the next opening brace.
            cursor = text.find("{", cursor + 1)
            continue
        yield value
        cursor = text.find("{", stop)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _assistant_message_text(message: Any) -> str:
    """Return the textual content of a chat message object.

    Reads `message.content` and handles these shapes:
      - missing or None: returns "".
      - str: returned unchanged.
      - list: each part contributes its `text` (dict key for dict parts,
        attribute otherwise) when that is not None; the parts are joined
        with newlines.
      - anything else: `str(content)`.
    """
    content = getattr(message, "content", None)
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    fragments: list[str] = []
    for item in content:
        if isinstance(item, dict):
            fragment = item.get("text")
        else:
            fragment = getattr(item, "text", None)
        if fragment is not None:
            fragments.append(str(fragment))
    return "\n".join(fragments)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def extract_action_payload(llm_response: str) -> tuple[str | None, str | None]:
    """Extract `(command, reasoning)` from a raw model reply.

    The reply is stripped; if it starts with a code fence and has more than
    two lines, the first and last lines are dropped. The remaining text is
    then tried, in order, as:
      1. a complete JSON object;
      2. any JSON object decodable from a `{` somewhere in the text;
      3. any brace-delimited span without nested braces;
      4. the first line on its own.
    The first attempt that yields a non-empty command wins.

    Returns:
        `(command, reasoning)`; `(None, None)` when no command is found.
        `reasoning` may be None even when a command is found.
    """
    body = llm_response.strip()
    if not body:
        return None, None

    if body.startswith("```"):
        fenced = body.split("\n")
        # A fence with nothing between opening and closing lines is left as-is.
        if len(fenced) > 2:
            body = "\n".join(fenced[1:-1]).strip()

    parsed = _parse_action_payload(body)
    if parsed[0]:
        return parsed

    for candidate in _iter_decoded_json_objects(body):
        if not isinstance(candidate, dict):
            continue
        parsed = _command_from_dict(candidate)
        if parsed[0]:
            return parsed

    # Catches flat objects nested inside an outer object that decoded
    # successfully but carried no command of its own.
    for flat in re.finditer(r"\{[^{}]*\}", body, flags=re.DOTALL):
        parsed = _parse_action_payload(flat.group(0))
        if parsed[0]:
            return parsed

    opening_line = body.split("\n")[0].strip()
    return _parse_action_payload(opening_line)
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def extract_command(llm_response: str) -> str | None:
    """Return only the command from `extract_action_payload`, or None if absent."""
    command, _reasoning = extract_action_payload(llm_response)
    return command
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def extract_reasoning(llm_response: str) -> str | None:
    """Return only the reasoning from `extract_action_payload`, or None if absent."""
    _command, reasoning = extract_action_payload(llm_response)
    return reasoning
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def _sanitize_reasoning_for_step(reasoning: str) -> str:
    """Make *reasoning* safe to embed in a `[STEP]` log line.

    The text is flattened to one line, then ` reward=`, ` done=` and
    ` error=` are rewritten with `:` in place of `=` so the reasoning
    cannot be mistaken for the log line's own fields. The result is
    truncated to 160 characters.
    """
    flattened = _single_line(reasoning)
    for field in ("reward", "done", "error"):
        flattened = flattened.replace(f" {field}=", f" {field}:")
    return flattened[:160]
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def _format_step_action(command: str, reasoning: str | None) -> str:
    """Build the `action=` value for a `[STEP]` log line.

    Returns:
        The single-line command, followed by ` | reasoning=<text>` when
        *reasoning* is given and is non-empty after sanitizing.
    """
    action = _single_line(command)
    note = _sanitize_reasoning_for_step(reasoning) if reasoning else ""
    if note:
        return f"{action} | reasoning={note}"
    return action
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _episode_score(rewards: list[float]) -> float:
|
| 233 |
+
# Score is terminal task progress signal and must stay normalized for evaluator checks.
|
| 234 |
+
if not rewards:
|
| 235 |
+
return 0.0
|
| 236 |
+
return max(0.0, min(1.0, float(rewards[-1])))
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _format_end_line(
    *, success: bool, steps: int, score: float, rewards: list[float]
) -> str:
    """Render the closing ``[END]`` log line for an episode.

    The line has the form
    ``[END] success=<bool> steps=<n> score=<x.xx> rewards=<r1,r2,...>``,
    with the score and every reward formatted to two decimal places and the
    success flag rendered by ``_bool``.
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    fields = (
        f"success={_bool(success)}",
        f"steps={steps}",
        f"score={score:.2f}",
        f"rewards={','.join(formatted_rewards)}",
    )
    return "[END] " + " ".join(fields)
|
| 247 |
+
|
| 248 |
+
def _task_symptom_block(task_name: TaskName) -> str:
    """Render the symptoms registered for ``task_name`` as a bulleted list.

    Each entry of ``TASK_SYMPTOMS[task_name]`` becomes one ``- <symptom>``
    line; the lines are joined with newlines.
    """
    bullets = [f"- {symptom}" for symptom in TASK_SYMPTOMS[task_name]]
    return "\n".join(bullets)
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _attempt_history_block(attempt_history: list[dict[str, Any]]) -> str:
    """Summarise earlier attempts as one bullet line per step.

    Each line starts with ``- step <n>: command=<...>; error=<...>`` and
    gains ``; reasoning=<...>`` and ``; output=<...>`` segments only when
    those values are non-empty. Values are passed through ``_single_line``
    and truncated (command/reasoning 120 chars, output 140, error 80).
    A missing or falsy error is shown as ``none``; an empty history yields
    the single line ``- none``.
    """
    if not attempt_history:
        return "- none"

    def _clip(value: Any, limit: int) -> str:
        # Stringify, flatten to one line, then truncate.
        return _single_line(str(value))[:limit]

    rendered: list[str] = []
    for entry in attempt_history:
        command = _clip(entry["command"], 120)
        reasoning = _clip(entry.get("reasoning") or "", 120)
        output_preview = _clip(entry.get("output") or "", 140)
        raw_error = entry.get("error")
        error_text = _clip(raw_error, 80) if raw_error else "none"

        segments = [
            f"- step {entry['step']}: command={command}",
            f"error={error_text}",
        ]
        if reasoning:
            segments.append(f"reasoning={reasoning}")
        if output_preview:
            segments.append(f"output={output_preview}")
        rendered.append("; ".join(segments))

    return "\n".join(rendered)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def build_prompt(
    obs: Observation,
    step_num: int,
    task_name: TaskName,
    attempt_history: list[dict[str, Any]],
) -> str:
    """Assemble the user prompt sent to the model for one step.

    The prompt is a sequence of blank-line-separated sections: a step
    header, the task symptoms, the previous attempts, the current metrics,
    the service status, the latest command output (first 2000 characters)
    and the closing instructions asking for a compact JSON action.
    """
    symptoms = _task_symptom_block(task_name)
    history = _attempt_history_block(attempt_history)

    metrics = obs.metrics
    metric_lines = [
        f"- Gateway success rate: {metrics.gateway_success_rate:.1%}",
        f"- Gateway P99 latency: {metrics.gateway_p99_latency_ms:.0f}ms",
        f"- Queue depth: {metrics.queue_depth}",
        f"- Worker restarts: {metrics.worker_restart_count}",
        f"- Consumer stall count: {metrics.consumer_stall_count}",
    ]

    instructions = (
        "Solve this over multiple steps as needed. For this step, return only the single next bash command.\n"
        'Respond with compact JSON where command is required: {"command":"<bash command>","reasoning":"optional concise reason"}.'
    )

    sections = [
        f"Step {step_num}. Current system state:",
        f"TASK SYMPTOMS:\n{symptoms}",
        f"PREVIOUS ATTEMPTS:\n{history}",
        "METRICS:\n" + "\n".join(metric_lines),
        f"SERVICE STATUS:\n{obs.process_status}",
        f"LATEST COMMAND OUTPUT:\n{obs.command_output[:2000]}",
        instructions,
    ]
    return "\n\n".join(sections)
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _run_episode(
    client: Any, env: DistributedDebugEnvClient, task_name: TaskName
) -> None:
    """Drive one task episode and print its ``[START]``/``[STEP]``/``[END]`` log.

    Each iteration builds a prompt from the latest observation, asks the
    chat model for the next action, sends the extracted command to the
    environment and records the reward. When the reply contains no command
    the raw reply plus a corrective user message are added to the
    conversation and ``NO_COMMAND_PROVIDED_SENTINEL`` is sent instead. The
    loop ends when the environment reports ``done`` or the step budget
    (``TASK_MAX_STEPS`` for the task, optionally capped by a positive
    ``MAX_STEPS_CAP``) is used up.

    Any exception raised while resetting or stepping is printed as an
    ``[ERROR]`` line and not re-raised; the ``[END]`` line is always
    printed. An episode counts as successful only when it finished with
    ``done`` set and a score of at least 0.95.
    """
    conversation: list[dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    reward_log: list[float] = []
    history: list[dict[str, Any]] = []

    finished = False
    completed_steps = 0
    last_error: str | None = None

    print(
        f"[START] task={task_name.value} env={BENCHMARK} model={MODEL_NAME}", flush=True
    )

    step_limit = TASK_MAX_STEPS[task_name]
    if MAX_STEPS_CAP > 0:
        step_limit = min(step_limit, MAX_STEPS_CAP)

    try:
        obs = env.reset(task_name=task_name.value)
        while not finished and completed_steps < step_limit:
            step_number = completed_steps + 1
            conversation.append(
                {
                    "role": "user",
                    "content": build_prompt(obs, step_number, task_name, history),
                }
            )

            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=conversation,
                temperature=TEMPERATURE,
                **_chat_token_limit_kwargs(),
            )

            raw_response = _assistant_message_text(completion.choices[0].message)
            command, reasoning = extract_action_payload(raw_response)
            if command:
                # Store the normalized action rather than the raw reply.
                echoed_action: dict[str, str] = {"command": command}
                if reasoning:
                    echoed_action["reasoning"] = reasoning
                conversation.append(
                    {"role": "assistant", "content": json.dumps(echoed_action)}
                )
            else:
                # Keep the unusable reply in context and ask for the JSON format again;
                # the environment still receives a step with the sentinel command.
                conversation.append({"role": "assistant", "content": raw_response})
                conversation.append(
                    {
                        "role": "user",
                        "content": (
                            "No command was provided. Respond with compact JSON where command is required: "
                            '{"command":"<bash command>","reasoning":"optional concise reason"}.'
                        ),
                    }
                )
                command = NO_COMMAND_PROVIDED_SENTINEL
                reasoning = None

            result = env.step(Action(command=command))
            obs = result.observation
            reward_log.append(result.reward)
            finished = result.done

            reported_error = result.info.get("error")
            if reported_error in (None, "", "None"):
                last_error = None
            else:
                last_error = str(reported_error)
            error_field = "null" if last_error is None else _single_line(last_error)

            history.append(
                {
                    "step": step_number,
                    "command": command,
                    "reasoning": reasoning,
                    "output": obs.command_output,
                    "error": last_error,
                }
            )

            print(
                f"[STEP] step={step_number} action={_format_step_action(command, reasoning)} "
                f"reward={result.reward:.2f} done={_bool(finished)} error={error_field}",
                flush=True,
            )
            completed_steps = step_number

    except Exception as exc:
        last_error = str(exc)
        print(
            f"[ERROR] task={task_name.value} {type(exc).__name__}: {exc}",
            flush=True,
        )
    finally:
        score = _episode_score(reward_log)
        succeeded = bool(finished and score >= 0.95)
        print(
            _format_end_line(
                success=succeeded,
                steps=completed_steps,
                score=score,
                rewards=reward_log,
            ),
            flush=True,
        )
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def main() -> None:
    """Run every configured task once against the debug environment.

    Raises:
        RuntimeError: If ``API_KEY`` is empty.

    The OpenAI-compatible client is created with a 30 second timeout and two
    retries. The environment client is closed even when an episode raises.
    """
    if not API_KEY:
        raise RuntimeError("HF_TOKEN (or API_KEY) must be set")

    task_queue = _parse_tasks()

    # Kept as a function-local import, as in the original.
    from openai import OpenAI

    llm_options = {
        "api_key": API_KEY,
        "base_url": API_BASE_URL,
        "timeout": 30.0,
        "max_retries": 2,
    }
    llm_client = OpenAI(**llm_options)
    environment = DistributedDebugEnvClient(base_url=ENV_URL)

    try:
        for current_task in task_queue:
            _run_episode(llm_client, environment, current_task)
    finally:
        environment.close()


if __name__ == "__main__":
    main()
|
mesh/auth/bun.lock
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"lockfileVersion": 1,
|
| 3 |
+
"configVersion": 1,
|
| 4 |
+
"workspaces": {
|
| 5 |
+
"": {
|
| 6 |
+
"name": "auth",
|
| 7 |
+
"devDependencies": {
|
| 8 |
+
"bun-types": "latest",
|
| 9 |
+
},
|
| 10 |
+
},
|
| 11 |
+
},
|
| 12 |
+
"packages": {
|
| 13 |
+
"@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="],
|
| 14 |
+
|
| 15 |
+
"bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
|
| 16 |
+
|
| 17 |
+
"undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
|
| 18 |
+
}
|
| 19 |
+
}
|
mesh/auth/config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"delay_ms": 200
|
| 3 |
+
}
|
mesh/auth/index.ts
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { existsSync } from "node:fs";
|
| 2 |
+
|
| 3 |
+
type AuthConfig = {
|
| 4 |
+
delay_ms: number;
|
| 5 |
+
};
|
| 6 |
+
|
| 7 |
+
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
|
| 8 |
+
const CONFIG_PATH = `${MESH_ROOT}/auth/config.json`;
|
| 9 |
+
const CURRENT_TASK_PATH = "/tmp/current_task";
|
| 10 |
+
const PORT = 3001;
|
| 11 |
+
|
| 12 |
+
let config: AuthConfig;
|
| 13 |
+
|
| 14 |
+
/** Return a promise that resolves once `ms` milliseconds have elapsed. */
const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
|
| 15 |
+
|
| 16 |
+
/**
 * Read and parse the JSON file at `path`.
 * Any failure while reading or parsing yields `fallback` instead of throwing.
 */
const readJson = async <T>(path: string, fallback: T): Promise<T> => {
  let parsed: T;
  try {
    parsed = (await Bun.file(path).json()) as T;
  } catch {
    parsed = fallback;
  }
  return parsed;
};
|
| 23 |
+
|
| 24 |
+
/**
 * Return the trimmed contents of the current-task marker file.
 * Yields "unknown" when the file is missing, unreadable, or blank.
 */
const currentPattern = async (): Promise<string> => {
  if (existsSync(CURRENT_TASK_PATH)) {
    try {
      const contents = await Bun.file(CURRENT_TASK_PATH).text();
      const trimmed = contents.trim();
      if (trimmed) {
        return trimmed;
      }
    } catch {
      // Unreadable marker: fall through to the default below.
    }
  }
  return "unknown";
};
|
| 32 |
+
|
| 33 |
+
/**
 * Print one structured JSON log line via console.log.
 * Fields, in order: ts, level, service ("auth"), event, pattern (from
 * currentPattern), followed by any caller-supplied details.
 */
const log = async (event: string, level: "INFO" | "WARN" | "ERROR", details: Record<string, unknown> = {}) => {
  const entry = {
    ts: new Date().toISOString(),
    level,
    service: "auth",
    event,
    pattern: await currentPattern(),
    ...details,
  };
  console.log(JSON.stringify(entry));
};
|
| 45 |
+
|
| 46 |
+
/** (Re)load the auth config from disk, falling back to a 200 ms delay. */
const loadConfig = async (): Promise<void> => {
  const defaults: AuthConfig = { delay_ms: 200 };
  config = await readJson<AuthConfig>(CONFIG_PATH, defaults);
};

// Load the configuration once before the server starts handling requests.
await loadConfig();

/** SIGHUP handler: reload the config file, then log the refreshed values. */
const reloadOnSighup = async (): Promise<void> => {
  await loadConfig();
  await log("config_reloaded", "INFO", { config });
};

process.on("SIGHUP", reloadOnSighup);
|
| 56 |
+
|
| 57 |
+
// HTTP entry point of the auth service.
//   GET  /health -> status plus the currently configured delay
//   POST /verify -> waits `config.delay_ms` (never negative), then reports success
//   anything else -> 404
Bun.serve({
  port: PORT,
  async fetch(req) {
    const { pathname } = new URL(req.url);
    const isHealthCheck = req.method === "GET" && pathname === "/health";
    const isVerify = req.method === "POST" && pathname === "/verify";

    if (isHealthCheck) {
      return Response.json({ status: "ok", delay_ms: config.delay_ms });
    }

    if (!isVerify) {
      return new Response("not found", { status: 404 });
    }

    const startedAt = Date.now();
    await log("verify_start", "INFO", { delay_ms: config.delay_ms });

    // The configured delay simulates verification latency.
    await sleep(Math.max(0, config.delay_ms));

    const elapsedMs = Date.now() - startedAt;
    await log("verify_complete", "INFO", {
      delay_ms: config.delay_ms,
      elapsed_ms: elapsedMs,
    });

    return Response.json({ verified: true, elapsed_ms: elapsedMs });
  },
});
|
mesh/auth/package.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "auth",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"scripts": {
|
| 5 |
+
"start": "bun run index.ts"
|
| 6 |
+
},
|
| 7 |
+
"devDependencies": {
|
| 8 |
+
"bun-types": "latest"
|
| 9 |
+
}
|
| 10 |
+
}
|
mesh/auth/tsconfig.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2022",
|
| 4 |
+
"module": "ESNext",
|
| 5 |
+
"moduleResolution": "Bundler",
|
| 6 |
+
"strict": true,
|
| 7 |
+
"skipLibCheck": true,
|
| 8 |
+
"types": ["bun-types"]
|
| 9 |
+
}
|
| 10 |
+
}
|
mesh/gateway/blocked_routes.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"blocked": []
|
| 3 |
+
}
|
mesh/gateway/bun.lock
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"lockfileVersion": 1,
|
| 3 |
+
"configVersion": 1,
|
| 4 |
+
"workspaces": {
|
| 5 |
+
"": {
|
| 6 |
+
"name": "gateway",
|
| 7 |
+
"dependencies": {
|
| 8 |
+
"ioredis": "^5.4.1",
|
| 9 |
+
},
|
| 10 |
+
"devDependencies": {
|
| 11 |
+
"bun-types": "latest",
|
| 12 |
+
},
|
| 13 |
+
},
|
| 14 |
+
},
|
| 15 |
+
"packages": {
|
| 16 |
+
"@ioredis/commands": ["@ioredis/commands@1.5.1", "", {}, "sha512-JH8ZL/ywcJyR9MmJ5BNqZllXNZQqQbnVZOqpPQqE1vHiFgAw4NHbvE0FOduNU8IX9babitBT46571OnPTT0Zcw=="],
|
| 17 |
+
|
| 18 |
+
"@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="],
|
| 19 |
+
|
| 20 |
+
"bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
|
| 21 |
+
|
| 22 |
+
"cluster-key-slot": ["cluster-key-slot@1.1.2", "", {}, "sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA=="],
|
| 23 |
+
|
| 24 |
+
"debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
|
| 25 |
+
|
| 26 |
+
"denque": ["denque@2.1.0", "", {}, "sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw=="],
|
| 27 |
+
|
| 28 |
+
"ioredis": ["ioredis@5.10.1", "", { "dependencies": { "@ioredis/commands": "1.5.1", "cluster-key-slot": "^1.1.0", "debug": "^4.3.4", "denque": "^2.1.0", "lodash.defaults": "^4.2.0", "lodash.isarguments": "^3.1.0", "redis-errors": "^1.2.0", "redis-parser": "^3.0.0", "standard-as-callback": "^2.1.0" } }, "sha512-HuEDBTI70aYdx1v6U97SbNx9F1+svQKBDo30o0b9fw055LMepzpOOd0Ccg9Q6tbqmBSJaMuY0fB7yw9/vjBYCA=="],
|
| 29 |
+
|
| 30 |
+
"lodash.defaults": ["lodash.defaults@4.2.0", "", {}, "sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ=="],
|
| 31 |
+
|
| 32 |
+
"lodash.isarguments": ["lodash.isarguments@3.1.0", "", {}, "sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg=="],
|
| 33 |
+
|
| 34 |
+
"ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
|
| 35 |
+
|
| 36 |
+
"redis-errors": ["redis-errors@1.2.0", "", {}, "sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w=="],
|
| 37 |
+
|
| 38 |
+
"redis-parser": ["redis-parser@3.0.0", "", { "dependencies": { "redis-errors": "^1.0.0" } }, "sha512-DJnGAeenTdpMEH6uAJRK/uiyEIH9WVsUmoLwzudwGJUwZPp80PDBWPHXSAGNPwNvIXAbe7MSUB1zQFugFml66A=="],
|
| 39 |
+
|
| 40 |
+
"standard-as-callback": ["standard-as-callback@2.1.0", "", {}, "sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A=="],
|
| 41 |
+
|
| 42 |
+
"undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
|
| 43 |
+
}
|
| 44 |
+
}
|
mesh/gateway/config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auth_timeout_ms": 500
|
| 3 |
+
}
|
mesh/gateway/index.ts
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { existsSync } from "node:fs";
|
| 2 |
+
|
| 3 |
+
import Redis from "ioredis";
|
| 4 |
+
|
| 5 |
+
type GatewayConfig = {
|
| 6 |
+
auth_timeout_ms: number;
|
| 7 |
+
};
|
| 8 |
+
|
| 9 |
+
type Registry = {
|
| 10 |
+
services: {
|
| 11 |
+
auth: { host: string; port: number; protocol: string };
|
| 12 |
+
redis: { host: string; port: number; protocol: string };
|
| 13 |
+
worker: { host: string; port: number | null; protocol: string };
|
| 14 |
+
};
|
| 15 |
+
};
|
| 16 |
+
|
| 17 |
+
type BlockedRoutes = {
|
| 18 |
+
blocked: string[];
|
| 19 |
+
};
|
| 20 |
+
|
| 21 |
+
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
|
| 22 |
+
const CONFIG_PATH = `${MESH_ROOT}/gateway/config.json`;
|
| 23 |
+
const BLOCKED_ROUTES_PATH = `${MESH_ROOT}/gateway/blocked_routes.json`;
|
| 24 |
+
const REGISTRY_PATH = `${MESH_ROOT}/registry.json`;
|
| 25 |
+
const CURRENT_TASK_PATH = "/tmp/current_task";
|
| 26 |
+
const PORT = 3000;
|
| 27 |
+
|
| 28 |
+
let config: GatewayConfig;
|
| 29 |
+
let registry: Registry;
|
| 30 |
+
let redisClient: Redis;
|
| 31 |
+
|
| 32 |
+
const successWindow: number[] = [];
|
| 33 |
+
const latencyWindow: number[] = [];
|
| 34 |
+
const WINDOW_SIZE = 20;
|
| 35 |
+
|
| 36 |
+
/**
 * Load a JSON document from `path`.
 * Returns `fallback` whenever the file cannot be read or parsed.
 */
const readJson = async <T>(path: string, fallback: T): Promise<T> => {
  try {
    const document = await Bun.file(path).json();
    return document as T;
  } catch {
    // Missing or malformed file: use the caller's default.
    return fallback;
  }
};
|
| 43 |
+
|
| 44 |
+
/**
 * Name of the currently active task, read from the marker file.
 * Falls back to "unknown" if the file is absent, unreadable, or blank.
 */
const currentPattern = async (): Promise<string> => {
  const fallbackName = "unknown";
  if (!existsSync(CURRENT_TASK_PATH)) {
    return fallbackName;
  }
  try {
    const taskName = (await Bun.file(CURRENT_TASK_PATH).text()).trim();
    return taskName ? taskName : fallbackName;
  } catch {
    return fallbackName;
  }
};
|
| 52 |
+
|
| 53 |
+
/**
 * Print one structured JSON log line via console.log.
 * Fields, in order: ts, level, service ("gateway"), event, pattern (from
 * currentPattern), followed by any caller-supplied details.
 */
const log = async (event: string, level: "INFO" | "WARN" | "ERROR", details: Record<string, unknown> = {}) => {
  // The timestamp is taken before awaiting the pattern lookup.
  const ts = new Date().toISOString();
  const pattern = await currentPattern();
  console.log(JSON.stringify({ ts, level, service: "gateway", event, pattern, ...details }));
};
|
| 64 |
+
|
| 65 |
+
/**
 * (Re)load the gateway config and service registry from disk, then replace
 * the Redis client with one pointing at the registry's redis entry.
 * Missing or unreadable files fall back to built-in localhost defaults.
 */
const loadRuntimeState = async () => {
  const defaultConfig: GatewayConfig = { auth_timeout_ms: 500 };
  const defaultRegistry: Registry = {
    services: {
      auth: { host: "localhost", port: 3001, protocol: "http" },
      redis: { host: "localhost", port: 6379, protocol: "tcp" },
      worker: { host: "localhost", port: null, protocol: "internal" },
    },
  };

  config = await readJson<GatewayConfig>(CONFIG_PATH, defaultConfig);
  registry = await readJson<Registry>(REGISTRY_PATH, defaultRegistry);

  // Drop the previous connection (if any) before opening a new one.
  redisClient?.disconnect();

  redisClient = new Redis({
    host: registry.services.redis.host,
    port: registry.services.redis.port,
    maxRetriesPerRequest: 1,
    lazyConnect: false,
  });
};
|
| 86 |
+
|
| 87 |
+
/**
 * Perform `fetch(url, init)` and abort it if it has not settled within
 * `timeoutMs` milliseconds. The abort timer is always cleared afterwards.
 * Any `signal` in `init` is replaced by the internal abort signal.
 */
const fetchWithTimeout = async (
  url: string,
  init: RequestInit,
  timeoutMs: number,
): Promise<Response> => {
  const abortController = new AbortController();
  const abortTimer = setTimeout(() => {
    abortController.abort();
  }, timeoutMs);

  try {
    const response = await fetch(url, { ...init, signal: abortController.signal });
    return response;
  } finally {
    clearTimeout(abortTimer);
  }
};
|
| 100 |
+
|
| 101 |
+
/**
 * Return the 99th percentile of `values` using the nearest-rank method,
 * or 0 for an empty array. The input array is not modified.
 *
 * The previous index, floor(0.99 * n), pointed at the maximum whenever n was
 * a multiple of 100. Nearest-rank uses the 1-based rank ceil(0.99 * n), which
 * gives the same result for every n below 100.
 */
const percentile99 = (values: number[]): number => {
  if (!values.length) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  // Integer arithmetic keeps exact multiples of 100 free of float rounding.
  const rank = Math.ceil((99 * sorted.length) / 100);
  return sorted[Math.max(0, rank - 1)];
};
|
| 107 |
+
|
| 108 |
+
/**
 * Append one request outcome to the rolling success and latency windows,
 * dropping the oldest sample once a window exceeds WINDOW_SIZE entries.
 */
const recordOutcome = (success: boolean, elapsedMs: number) => {
  const updates: Array<[number[], number]> = [
    [successWindow, success ? 1 : 0],
    [latencyWindow, elapsedMs],
  ];

  for (const [samples, value] of updates) {
    samples.push(value);
    if (samples.length > WINDOW_SIZE) {
      samples.shift();
    }
  }
};
|
| 119 |
+
|
| 120 |
+
/**
 * Fraction of successful requests in the rolling window.
 * Returns 1 while no request has been recorded yet.
 */
const getSuccessRate = (): number => {
  const total = successWindow.length;
  if (total === 0) return 1;

  let successes = 0;
  for (const outcome of successWindow) {
    successes += outcome;
  }
  return successes / total;
};
|
| 125 |
+
|
| 126 |
+
// Load config, registry and the Redis client once before serving requests.
await loadRuntimeState();

/** SIGHUP handler: reload runtime state, then log the refreshed config. */
const reloadOnSighup = async (): Promise<void> => {
  await loadRuntimeState();
  await log("config_reloaded", "INFO", { config });
};

process.on("SIGHUP", reloadOnSighup);
|
| 132 |
+
|
| 133 |
+
// HTTP entry point of the gateway.
//   GET  /health  -> rolling success rate and p99 latency
//   POST /process -> block-list check, auth verification, then enqueue a job in Redis
//   anything else -> 404
Bun.serve({
  port: PORT,
  async fetch(req) {
    const { pathname } = new URL(req.url);
    const isHealthCheck = req.method === "GET" && pathname === "/health";
    const isProcess = req.method === "POST" && pathname === "/process";

    if (isHealthCheck) {
      return Response.json({
        status: "ok",
        success_rate: getSuccessRate(),
        p99_latency_ms: percentile99(latencyWindow),
      });
    }

    if (!isProcess) {
      return new Response("not found", { status: 404 });
    }

    const start = Date.now();

    try {
      // The block list is re-read on every request so edits apply immediately.
      const blockedRoutes = await readJson<BlockedRoutes>(BLOCKED_ROUTES_PATH, { blocked: [] });
      if (blockedRoutes.blocked.includes("gateway->redis")) {
        await log("route_blocked", "ERROR", {
          route: "gateway->redis",
          policy_file: BLOCKED_ROUTES_PATH,
        });
        throw new Error("redis_unreachable");
      }

      const authUrl = `http://${registry.services.auth.host}:${registry.services.auth.port}/verify`;
      const authRequest: RequestInit = {
        method: "POST",
        headers: { "content-type": "application/json" },
        body: JSON.stringify({ token: "allow" }),
      };
      const authResponse = await fetchWithTimeout(authUrl, authRequest, config.auth_timeout_ms);
      if (!authResponse.ok) {
        throw new Error(`auth_status_${authResponse.status}`);
      }

      const job = JSON.stringify({ id: crypto.randomUUID(), payload: "ok" });
      await redisClient.rpush("job_queue", job);

      const elapsedMs = Date.now() - start;
      recordOutcome(true, elapsedMs);

      const queueDepthHint = await redisClient.llen("job_queue");
      await log("request_complete", "INFO", {
        elapsed_ms: elapsedMs,
        upstream: "auth",
        queue_depth_hint: queueDepthHint,
      });

      return Response.json({ ok: true, elapsed_ms: elapsedMs });
    } catch (error) {
      // Every failure on this path counts against the rolling success rate.
      const elapsedMs = Date.now() - start;
      recordOutcome(false, elapsedMs);

      const reason = error instanceof Error ? error.message : String(error);
      const looksLikeTimeout =
        reason === "AbortError" || reason.includes("aborted") || reason.includes("timeout");
      if (looksLikeTimeout) {
        await log("upstream_timeout", "ERROR", {
          elapsed_ms: elapsedMs,
          upstream: "auth",
          threshold_ms: config.auth_timeout_ms,
        });
      }

      await log("request_failed", "ERROR", {
        path: "/process",
        status: 500,
        reason,
      });

      return new Response(JSON.stringify({ error: reason }), {
        status: 500,
        headers: { "content-type": "application/json" },
      });
    }
  },
});
|
mesh/gateway/package.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "gateway",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"scripts": {
|
| 5 |
+
"start": "bun run index.ts"
|
| 6 |
+
},
|
| 7 |
+
"dependencies": {
|
| 8 |
+
"ioredis": "^5.4.1"
|
| 9 |
+
},
|
| 10 |
+
"devDependencies": {
|
| 11 |
+
"bun-types": "latest"
|
| 12 |
+
}
|
| 13 |
+
}
|
mesh/gateway/tsconfig.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2022",
|
| 4 |
+
"module": "ESNext",
|
| 5 |
+
"moduleResolution": "Bundler",
|
| 6 |
+
"strict": true,
|
| 7 |
+
"skipLibCheck": true,
|
| 8 |
+
"types": ["bun-types"]
|
| 9 |
+
}
|
| 10 |
+
}
|
mesh/registry.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"services": {
|
| 3 |
+
"auth": {
|
| 4 |
+
"host": "localhost",
|
| 5 |
+
"port": 3001,
|
| 6 |
+
"protocol": "http"
|
| 7 |
+
},
|
| 8 |
+
"redis": {
|
| 9 |
+
"host": "localhost",
|
| 10 |
+
"port": 6379,
|
| 11 |
+
"protocol": "tcp"
|
| 12 |
+
},
|
| 13 |
+
"worker": {
|
| 14 |
+
"host": "localhost",
|
| 15 |
+
"port": null,
|
| 16 |
+
"protocol": "internal"
|
| 17 |
+
}
|
| 18 |
+
}
|
| 19 |
+
}
|
mesh/worker/bun.lock
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"lockfileVersion": 1,
|
| 3 |
+
"configVersion": 1,
|
| 4 |
+
"workspaces": {
|
| 5 |
+
"": {
|
| 6 |
+
"name": "worker",
|
| 7 |
+
"dependencies": {
|
| 8 |
+
"ioredis": "^5.4.1",
|
| 9 |
+
},
|
| 10 |
+
"devDependencies": {
|
| 11 |
+
"bun-types": "latest",
|
| 12 |
+
},
|
| 13 |
+
},
|
| 14 |
+
},
|
| 15 |
+
"packages": {
|
| 16 |
+
"@ioredis/commands": ["@ioredis/commands@1.5.1", "", {}, "sha512-JH8ZL/ywcJyR9MmJ5BNqZllXNZQqQbnVZOqpPQqE1vHiFgAw4NHbvE0FOduNU8IX9babitBT46571OnPTT0Zcw=="],
|
| 17 |
+
|
| 18 |
+
"@types/node": ["@types/node@25.5.2", "", { "dependencies": { "undici-types": "~7.18.0" } }, "sha512-tO4ZIRKNC+MDWV4qKVZe3Ql/woTnmHDr5JD8UI5hn2pwBrHEwOEMZK7WlNb5RKB6EoJ02gwmQS9OrjuFnZYdpg=="],
|
| 19 |
+
|
| 20 |
+
"bun-types": ["bun-types@1.3.11", "", { "dependencies": { "@types/node": "*" } }, "sha512-1KGPpoxQWl9f6wcZh57LvrPIInQMn2TQ7jsgxqpRzg+l0QPOFvJVH7HmvHo/AiPgwXy+/Thf6Ov3EdVn1vOabg=="],
|
| 21 |
+
|
| 22 |
+
"cluster-key-slot": ["cluster-key-slot@1.1.2", "", {}, "sha512-RMr0FhtfXemyinomL4hrWcYJxmX6deFdCxpJzhDttxgO1+bcCnkk+9drydLVDmAMG7NE6aN/fl4F7ucU/90gAA=="],
|
| 23 |
+
|
| 24 |
+
"debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
|
| 25 |
+
|
| 26 |
+
"denque": ["denque@2.1.0", "", {}, "sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw=="],
|
| 27 |
+
|
| 28 |
+
"ioredis": ["ioredis@5.10.1", "", { "dependencies": { "@ioredis/commands": "1.5.1", "cluster-key-slot": "^1.1.0", "debug": "^4.3.4", "denque": "^2.1.0", "lodash.defaults": "^4.2.0", "lodash.isarguments": "^3.1.0", "redis-errors": "^1.2.0", "redis-parser": "^3.0.0", "standard-as-callback": "^2.1.0" } }, "sha512-HuEDBTI70aYdx1v6U97SbNx9F1+svQKBDo30o0b9fw055LMepzpOOd0Ccg9Q6tbqmBSJaMuY0fB7yw9/vjBYCA=="],
|
| 29 |
+
|
| 30 |
+
"lodash.defaults": ["lodash.defaults@4.2.0", "", {}, "sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ=="],
|
| 31 |
+
|
| 32 |
+
"lodash.isarguments": ["lodash.isarguments@3.1.0", "", {}, "sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg=="],
|
| 33 |
+
|
| 34 |
+
"ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
|
| 35 |
+
|
| 36 |
+
"redis-errors": ["redis-errors@1.2.0", "", {}, "sha512-1qny3OExCf0UvUV/5wpYKf2YwPcOqXzkwKKSmKHiE6ZMQs5heeE/c8eXK+PNllPvmjgAbfnsbpkGZWy8cBpn9w=="],
|
| 37 |
+
|
| 38 |
+
"redis-parser": ["redis-parser@3.0.0", "", { "dependencies": { "redis-errors": "^1.0.0" } }, "sha512-DJnGAeenTdpMEH6uAJRK/uiyEIH9WVsUmoLwzudwGJUwZPp80PDBWPHXSAGNPwNvIXAbe7MSUB1zQFugFml66A=="],
|
| 39 |
+
|
| 40 |
+
"standard-as-callback": ["standard-as-callback@2.1.0", "", {}, "sha512-qoRRSyROncaz1z0mvYqIE4lCd9p2R90i6GxW3uZv5ucSu8tU7B5HXUP1gG8pVZsYNVaXjk8ClXHPttLyxAL48A=="],
|
| 41 |
+
|
| 42 |
+
"undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
|
| 43 |
+
}
|
| 44 |
+
}
|
mesh/worker/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"db_pool_size": 10,
|
| 3 |
+
"db_write_delay_ms": 0
|
| 4 |
+
}
|
mesh/worker/index.ts
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { existsSync } from "node:fs";
|
| 2 |
+
|
| 3 |
+
import Redis from "ioredis";
|
| 4 |
+
import { Database } from "bun:sqlite";
|
| 5 |
+
|
| 6 |
+
type WorkerConfig = {
|
| 7 |
+
db_pool_size: number;
|
| 8 |
+
db_write_delay_ms: number;
|
| 9 |
+
};
|
| 10 |
+
|
| 11 |
+
// Root directory of the mesh services; overridable through the MESH_ROOT environment variable.
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
const CONFIG_PATH = `${MESH_ROOT}/worker/config.json`;
// File whose trimmed contents are attached to every log line as `pattern`.
const CURRENT_TASK_PATH = "/tmp/current_task";

// Redis key used as a lock around each dequeue/process cycle.
const LOCK_KEY = "LOCK:job_processor";
// Enqueue rate (jobs per second) that the estimated processing rate is compared against.
const ENQUEUE_RATE_PER_S = Number(process.env.ENQUEUE_RATE_PER_S || "3.0");

// Assigned by loadConfig().
let config: WorkerConfig;
// Cleared by the SIGTERM/SIGINT handlers to end the process loop.
let running = true;
// Delay applied after a parse failure; doubled up to 10 s and reset to 1 s after a successful write.
let backoffMs = 1000;

const redis = new Redis({ host: "localhost", port: 6379, maxRetriesPerRequest: 1 });
const db = new Database("/tmp/worker_jobs.sqlite");
// Create the destination table on first run.
db.exec(
  "CREATE TABLE IF NOT EXISTS jobs (id TEXT PRIMARY KEY, payload TEXT, created_at TEXT DEFAULT CURRENT_TIMESTAMP)",
);
|
| 27 |
+
|
| 28 |
+
/** Return a promise that resolves once a `ms`-millisecond timer fires. */
const sleep = (ms: number) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
|
| 29 |
+
|
| 30 |
+
/**
 * Read the file at `path` and parse it as JSON.
 *
 * Any failure while reading or parsing is swallowed and `fallback` is
 * returned instead.
 */
const readJson = async <T>(path: string, fallback: T): Promise<T> => {
  let result: T = fallback;
  try {
    result = (await Bun.file(path).json()) as T;
  } catch {
    // Keep the fallback value.
  }
  return result;
};
|
| 37 |
+
|
| 38 |
+
/**
 * Return the trimmed contents of CURRENT_TASK_PATH.
 *
 * Yields "unknown" when the file is missing, unreadable, or blank.
 */
const currentPattern = async (): Promise<string> => {
  const placeholder = "unknown";
  if (existsSync(CURRENT_TASK_PATH)) {
    try {
      const contents = await Bun.file(CURRENT_TASK_PATH).text();
      const label = contents.trim();
      return label === "" ? placeholder : label;
    } catch {
      return placeholder;
    }
  }
  return placeholder;
};
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
/**
 * Add one to the integer stored in the text file at `path` and return the new value.
 *
 * A missing, unreadable, or non-numeric file counts as 0, so the first call writes "1".
 */
const incrementCounter = async (path: string): Promise<number> => {
  const readStored = async (): Promise<number> => {
    try {
      const text = await Bun.file(path).text();
      return Number(text) || 0;
    } catch {
      return 0;
    }
  };

  const updated = (await readStored()) + 1;
  await Bun.write(path, `${updated}`);
  return updated;
};
|
| 60 |
+
|
| 61 |
+
/**
 * Print one JSON log line to stdout.
 *
 * The line carries the timestamp, level, the fixed service name "worker",
 * the event name, the current pattern label, and then every key of `details`.
 */
const log = async (event: string, level: "INFO" | "WARN" | "ERROR", details: Record<string, unknown> = {}) => {
  const ts = new Date().toISOString();
  const pattern = await currentPattern();
  const entry = { ts, level, service: "worker", event, pattern, ...details };
  console.log(JSON.stringify(entry));
};
|
| 73 |
+
|
| 74 |
+
/**
 * (Re)load the worker configuration from CONFIG_PATH into `config`.
 *
 * The parsed document is normalised field by field instead of being trusted
 * as-is: a document that is not a JSON object, or a field that is missing,
 * blank, non-numeric, non-finite, or negative, falls back to the built-in
 * default for that field. Numeric strings are converted to numbers so later
 * arithmetic (`db_write_delay_ms + 120`) never concatenates strings.
 */
const loadConfig = async () => {
  const defaults: WorkerConfig = { db_pool_size: 10, db_write_delay_ms: 0 };

  // Convert a raw JSON value to a usable non-negative number, else use the default.
  const toSetting = (value: unknown, fallback: number): number => {
    if (typeof value !== "number" && typeof value !== "string") return fallback;
    if (typeof value === "string" && value.trim() === "") return fallback;
    const parsed = Number(value);
    return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
  };

  const raw = await readJson<unknown>(CONFIG_PATH, defaults);
  // `null`, arrays and primitives are valid JSON but not a settings object.
  const source: Record<string, unknown> =
    raw !== null && typeof raw === "object" && !Array.isArray(raw) ? (raw as Record<string, unknown>) : {};

  config = {
    db_pool_size: toSetting(source.db_pool_size, defaults.db_pool_size),
    db_write_delay_ms: toSetting(source.db_write_delay_ms, defaults.db_write_delay_ms),
  };
};
|
| 80 |
+
|
| 81 |
+
/**
 * Estimate throughput in jobs per second from the current config.
 *
 * The per-job cost is the configured write delay plus 120 ms, never below
 * 50 ms; the result is the pool size times the jobs one slot completes per second.
 */
const estimatedProcessingRate = (): number => {
  const { db_pool_size: poolSize, db_write_delay_ms: writeDelayMs } = config;
  const perJobMs = Math.max(50, writeDelayMs + 120);
  const jobsPerSecondPerSlot = 1000 / perJobMs;
  return poolSize * jobsPerSecondPerSlot;
};
|
| 85 |
+
|
| 86 |
+
/**
 * Main consumer loop; runs until `running` is cleared.
 *
 * Each iteration:
 *  1. Tries to take LOCK_KEY (SET NX with a 30 s expiry). On failure it bumps
 *     the stall counter, logs `lock_acquire_failed`, and retries after 120 ms.
 *  2. Pops one entry from the head of `job_queue`; an empty queue releases the
 *     lock and retries after 120 ms.
 *  3. Parses the entry. On a parse failure it bumps the restart counter, logs
 *     the failure, pushes the raw entry back onto the head of the queue,
 *     releases the lock, and sleeps for the current backoff (doubling, capped
 *     at 10 s).
 *  4. Otherwise waits the configured write delay, upserts the job into SQLite,
 *     resets the backoff, logs progress (plus `throughput_lag` when the
 *     estimated rate is below the enqueue rate or the queue is deeper than 10),
 *     releases the lock, and sleeps 80 ms.
 *
 * Any unexpected error is logged as `loop_error` and followed by a 250 ms
 * pause. If the error occurred while this iteration owned the lock, the lock
 * is released first so the worker does not stall on its own lock until the
 * 30 s expiry.
 */
const processLoop = async () => {
  while (running) {
    // True only while this iteration owns LOCK_KEY.
    let lockHeld = false;
    try {
      const acquired = await redis.set(LOCK_KEY, String(process.pid), "EX", 30, "NX");
      if (!acquired) {
        const stallCount = await incrementCounter("/tmp/consumer_stall_count");
        await log("lock_acquire_failed", "WARN", {
          lock_key: LOCK_KEY,
          stall_count: stallCount,
        });
        await sleep(120);
        continue;
      }
      lockHeld = true;

      const raw = await redis.lpop("job_queue");
      if (!raw) {
        await redis.del(LOCK_KEY);
        lockHeld = false;
        await sleep(120);
        continue;
      }

      let parsed: { id?: string; payload?: unknown };
      try {
        parsed = JSON.parse(raw) as { id?: string; payload?: unknown };
      } catch (error) {
        const restartCount = await incrementCounter("/tmp/worker_restart_count");
        await log("job_dequeued", "INFO", { raw });
        await log("parse_failed", "ERROR", {
          error: error instanceof Error ? error.message : String(error),
          raw,
        });
        await log("consumer_backoff", "WARN", {
          restart_count: restartCount,
          backoff_ms: backoffMs,
        });

        // The unparseable entry goes back to the head of the queue.
        await redis.lpush("job_queue", raw);
        await redis.del(LOCK_KEY);
        lockHeld = false;
        await sleep(backoffMs);
        backoffMs = Math.min(10000, backoffMs * 2);
        continue;
      }

      const start = Date.now();
      if (config.db_write_delay_ms > 0) {
        await sleep(config.db_write_delay_ms);
      }

      db.query("INSERT OR REPLACE INTO jobs (id, payload) VALUES (?, ?)").run(
        parsed.id || crypto.randomUUID(),
        JSON.stringify(parsed.payload ?? null),
      );

      backoffMs = 1000;
      const elapsedMs = Date.now() - start;
      const queueDepth = Number(await redis.llen("job_queue"));
      await log("db_write_complete", "INFO", {
        elapsed_ms: elapsedMs,
        pool_size: config.db_pool_size,
      });

      const processingRate = estimatedProcessingRate();
      if (processingRate < ENQUEUE_RATE_PER_S || queueDepth > 10) {
        await log("throughput_lag", "WARN", {
          processing_rate_per_s: Number(processingRate.toFixed(2)),
          enqueue_rate_per_s: ENQUEUE_RATE_PER_S,
          queue_depth: queueDepth,
        });
      }

      await log("job_processed", "INFO", {
        job_id: parsed.id || null,
        queue_depth: queueDepth,
      });

      await redis.del(LOCK_KEY);
      lockHeld = false;
      await sleep(80);
    } catch (error) {
      if (lockHeld) {
        // Best effort: if Redis itself is failing, the 30 s expiry frees the lock.
        try {
          await redis.del(LOCK_KEY);
        } catch {
          // Ignored on purpose; see comment above.
        }
      }
      await log("loop_error", "ERROR", {
        error: error instanceof Error ? error.message : String(error),
      });
      await sleep(250);
    }
  }
};
|
| 171 |
+
|
| 172 |
+
// Create the counter files with an initial value of 0 if they do not exist yet.
if (!existsSync("/tmp/worker_restart_count")) await Bun.write("/tmp/worker_restart_count", "0");
if (!existsSync("/tmp/consumer_stall_count")) await Bun.write("/tmp/consumer_stall_count", "0");
// Record this process's PID in /tmp/worker.pid.
await Bun.write("/tmp/worker.pid", String(process.pid));

await loadConfig();

// SIGHUP: re-read the config file and log the values now in effect.
process.on("SIGHUP", async () => {
  await loadConfig();
  await log("config_reloaded", "INFO", { config });
});

// SIGTERM / SIGINT: clear the flag so the loop exits after the current iteration.
process.on("SIGTERM", () => {
  running = false;
});

process.on("SIGINT", () => {
  running = false;
});

await processLoop();

// Shutdown: try a graceful Redis quit, fall back to a hard disconnect, then close SQLite.
try {
  await redis.quit();
} catch {
  redis.disconnect();
}
db.close();
|
mesh/worker/job_generator.ts
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Redis from "ioredis";
|
| 2 |
+
|
| 3 |
+
/** Shape of the job generator settings object loaded from CONFIG_PATH. */
type JobGeneratorConfig = {
  // Requested pause between enqueues in milliseconds; loadConfig() enforces a 10 ms minimum.
  interval_ms: number;
};
|
| 6 |
+
|
| 7 |
+
const redis = new Redis({ host: "localhost", port: 6379, maxRetriesPerRequest: 1 });
// Root directory of the mesh services; overridable through the MESH_ROOT environment variable.
const MESH_ROOT = process.env.MESH_ROOT || "/mesh";
const CONFIG_PATH = `${MESH_ROOT}/worker/job_generator_config.json`;

// Cleared by the SIGTERM/SIGINT handlers to end the enqueue loop.
let running = true;
// Pause between enqueues in milliseconds; overwritten by loadConfig().
let intervalMs = 333;
|
| 13 |
+
|
| 14 |
+
/** Return a promise that resolves once a `ms`-millisecond timer fires. */
const sleep = (ms: number) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
|
| 15 |
+
|
| 16 |
+
/**
 * Read the file at `path` and parse it as JSON.
 *
 * Any failure while reading or parsing is swallowed and `fallback` is
 * returned instead.
 */
const readJson = async <T>(path: string, fallback: T): Promise<T> => {
  let result: T = fallback;
  try {
    result = (await Bun.file(path).json()) as T;
  } catch {
    // Keep the fallback value.
  }
  return result;
};
|
| 23 |
+
|
| 24 |
+
/**
 * (Re)load the enqueue interval from CONFIG_PATH into `intervalMs`.
 *
 * A missing or unparseable file, a missing/zero/non-numeric `interval_ms`, or
 * a document that is JSON `null` all result in 333 ms. The effective value is
 * never below 10 ms.
 */
const loadConfig = async () => {
  const config = await readJson<JobGeneratorConfig | null>(CONFIG_PATH, { interval_ms: 333 });
  // Optional chaining: a literal `null` document must not throw here, because this
  // also runs inside the SIGHUP handler where a rejection would go unhandled.
  intervalMs = Math.max(10, Number(config?.interval_ms) || 333);
};
|
| 28 |
+
|
| 29 |
+
/**
 * Enqueue loop; runs until `running` is cleared.
 *
 * Every pass appends one freshly generated "normal" job to the tail of
 * `job_queue`, prints a `job_enqueued` line on success or an `enqueue_failed`
 * line (with the error text) on failure, and then waits `intervalMs`.
 */
const loop = async () => {
  // Print one JSON log line for this service.
  const emit = (level: "INFO" | "ERROR", event: string, extra: Record<string, unknown> = {}) => {
    console.log(
      JSON.stringify({
        ts: new Date().toISOString(),
        level,
        service: "job_generator",
        event,
        ...extra,
      }),
    );
  };

  while (running) {
    const id = crypto.randomUUID();
    const ts = new Date().toISOString();
    const job = JSON.stringify({ id, payload: { kind: "normal", ts } });

    try {
      await redis.rpush("job_queue", job);
      emit("INFO", "job_enqueued");
    } catch (error) {
      emit("ERROR", "enqueue_failed", {
        error: error instanceof Error ? error.message : String(error),
      });
    }

    await sleep(intervalMs);
  }
};
|
| 64 |
+
|
| 65 |
+
await loadConfig();

// SIGHUP: re-read the config file and report the interval now in effect.
process.on("SIGHUP", async () => {
  await loadConfig();
  console.log(
    JSON.stringify({
      ts: new Date().toISOString(),
      level: "INFO",
      service: "job_generator",
      event: "config_reloaded",
      interval_ms: intervalMs,
    }),
  );
});

// SIGTERM / SIGINT: clear the flag so the loop exits after the current pass.
process.on("SIGTERM", () => {
  running = false;
});

process.on("SIGINT", () => {
  running = false;
});

await loop();

// Shutdown: try a graceful Redis quit, fall back to a hard disconnect.
try {
  await redis.quit();
} catch {
  redis.disconnect();
}
|
mesh/worker/job_generator_config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"interval_ms": 10
|
| 3 |
+
}
|
mesh/worker/package.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "worker",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"scripts": {
|
| 5 |
+
"start": "bun run index.ts",
|
| 6 |
+
"job-generator": "bun run job_generator.ts"
|
| 7 |
+
},
|
| 8 |
+
"dependencies": {
|
| 9 |
+
"ioredis": "^5.4.1"
|
| 10 |
+
},
|
| 11 |
+
"devDependencies": {
|
| 12 |
+
"bun-types": "latest"
|
| 13 |
+
}
|
| 14 |
+
}
|
mesh/worker/tsconfig.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2022",
|
| 4 |
+
"module": "ESNext",
|
| 5 |
+
"moduleResolution": "Bundler",
|
| 6 |
+
"strict": true,
|
| 7 |
+
"skipLibCheck": true,
|
| 8 |
+
"types": ["bun-types"]
|
| 9 |
+
}
|
| 10 |
+
}
|
openenv.yaml
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: distributed-systems-debug-env
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
An RL environment for debugging a distributed job processing pipeline with
|
| 5 |
+
seven deterministic fault scenarios: cascading timeout, byzantine queue fault,
|
| 6 |
+
distributed lock starvation, backpressure cascade, route partition, registry corruption, and job generator runaway.
|
| 7 |
+
author: Team Rocket
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
- distributed-systems
|
| 11 |
+
- debugging
|
| 12 |
+
- reinforcement-learning
|
| 13 |
+
|
| 14 |
+
reward_range: [0.0, 1.0]
|
| 15 |
+
|
| 16 |
+
tasks:
|
| 17 |
+
- name: cascading-timeout
|
| 18 |
+
description: Synchronous upstream delay exceeds gateway timeout.
|
| 19 |
+
difficulty: easy
|
| 20 |
+
max_steps: 15
|
| 21 |
+
|
| 22 |
+
- name: byzantine-queue-fault
|
| 23 |
+
description: Poison pill message causes worker parse crash-loop.
|
| 24 |
+
difficulty: medium
|
| 25 |
+
max_steps: 18
|
| 26 |
+
|
| 27 |
+
- name: distributed-lock-starvation
|
| 28 |
+
description: Stale distributed lock prevents queue consumption.
|
| 29 |
+
difficulty: hard
|
| 30 |
+
max_steps: 20
|
| 31 |
+
|
| 32 |
+
- name: backpressure-cascade
|
| 33 |
+
description: Consumer throughput lower than producer enqueue rate.
|
| 34 |
+
difficulty: hard
|
| 35 |
+
max_steps: 20
|
| 36 |
+
|
| 37 |
+
- name: route-partition
|
| 38 |
+
description: Route policy blocks gateway to redis communication.
|
| 39 |
+
difficulty: hard
|
| 40 |
+
max_steps: 20
|
| 41 |
+
|
| 42 |
+
- name: registry-corruption
|
| 43 |
+
description: Gateway reloads a corrupted auth registry entry and fails requests.
|
| 44 |
+
difficulty: medium
|
| 45 |
+
max_steps: 18
|
| 46 |
+
|
| 47 |
+
- name: job-generator-runaway
|
| 48 |
+
description: Runaway enqueue rate overwhelms worker throughput and grows backlog.
|
| 49 |
+
difficulty: hard
|
| 50 |
+
max_steps: 20
|
| 51 |
+
|
| 52 |
+
observation_space:
|
| 53 |
+
type: object
|
| 54 |
+
properties:
|
| 55 |
+
command_output:
|
| 56 |
+
type: string
|
| 57 |
+
metrics:
|
| 58 |
+
type: object
|
| 59 |
+
properties:
|
| 60 |
+
gateway_success_rate:
|
| 61 |
+
type: number
|
| 62 |
+
minimum: 0.0
|
| 63 |
+
maximum: 1.0
|
| 64 |
+
gateway_p99_latency_ms:
|
| 65 |
+
type: number
|
| 66 |
+
minimum: 0.0
|
| 67 |
+
queue_depth:
|
| 68 |
+
type: integer
|
| 69 |
+
minimum: 0
|
| 70 |
+
worker_restart_count:
|
| 71 |
+
type: integer
|
| 72 |
+
minimum: 0
|
| 73 |
+
consumer_stall_count:
|
| 74 |
+
type: integer
|
| 75 |
+
minimum: 0
|
| 76 |
+
process_status:
|
| 77 |
+
type: object
|
| 78 |
+
additionalProperties:
|
| 79 |
+
type: string
|
| 80 |
+
|
| 81 |
+
action_space:
|
| 82 |
+
type: object
|
| 83 |
+
properties:
|
| 84 |
+
command:
|
| 85 |
+
type: string
|
| 86 |
+
description: Single bash command executed in the debug sandbox.
|
pyproject.toml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "distributed-systems-debug-env"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "OpenEnv distributed systems debugging environment"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = "==3.12.*"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"fastapi==0.115.0",
|
| 9 |
+
"uvicorn[standard]==0.30.0",
|
| 10 |
+
"pydantic>=2.11.0",
|
| 11 |
+
"openenv-core>=0.2.0",
|
| 12 |
+
"openai>=1.30.0",
|
| 13 |
+
"httpx>=0.27.0"
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
[project.scripts]
|
| 17 |
+
server = "server.app:main"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
[tool.pytest.ini_options]
|
| 21 |
+
testpaths = ["tests"]
|
| 22 |
+
pythonpath = ["."]
|
| 23 |
+
|
| 24 |
+
[build-system]
|
| 25 |
+
requires = ["setuptools>=68", "wheel"]
|
| 26 |
+
build-backend = "setuptools.build_meta"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
[tool.setuptools.packages.find]
|
| 30 |
+
where = ["."]
|
| 31 |
+
include = ["server*"]
|
| 32 |
+
exclude = ["mesh*", "tests*"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn[standard]==0.30.0
|
| 3 |
+
pydantic>=2.11.0
|
| 4 |
+
openenv-core>=0.2.0
|
| 5 |
+
openai>=1.30.0
|
| 6 |
+
httpx>=0.27.0
|
| 7 |
+
pytest>=8.0.0
|
server/__init__.py
ADDED
|
File without changes
|
server/api.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
from contextlib import asynccontextmanager
|
| 4 |
+
|
| 5 |
+
from fastapi import FastAPI, HTTPException
|
| 6 |
+
|
| 7 |
+
from .constants import TaskName
|
| 8 |
+
from .env import DistributedDebugEnv
|
| 9 |
+
from .models import Action, Observation, StepResult
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Own the environment for the application's lifetime.

    A ``DistributedDebugEnv`` is created and started before the app begins
    serving, published as ``app.state.env``, and closed again on shutdown.
    """
    debug_env = DistributedDebugEnv()
    debug_env.start()
    app.state.env = debug_env
    try:
        yield
    finally:
        debug_env.close()


app = FastAPI(
    title="Distributed Systems Debug Environment",
    version="1.0.0",
    lifespan=lifespan,
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@app.post("/reset", response_model=Observation)
async def reset(task_name: str | None = None) -> Observation:
    """Start a new episode and return its first observation.

    An absent or empty ``task_name`` selects the cascading-timeout task.
    Responds with HTTP 400 when the name is not a known task and with
    HTTP 500 when the environment fails to reset.
    """
    # Validator and sample inference call /reset without task input.
    # Use a deterministic default task for reproducible episode bootstrapping.
    requested_name = task_name if task_name else TaskName.CASCADING_TIMEOUT.value
    try:
        task = TaskName.parse(requested_name)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        env: DistributedDebugEnv = app.state.env
        observation = env.reset(task_name=task)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return observation
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@app.post("/step", response_model=StepResult)
async def step(action: Action) -> StepResult:
    """Apply one action to the environment and return the step result.

    Any exception raised while stepping is reported as HTTP 500 with the
    exception text as the detail.
    """
    try:
        env: DistributedDebugEnv = app.state.env
        outcome = env.step(action)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return outcome
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@app.get("/state")
async def state() -> dict:
    """Return the environment's current state dictionary.

    Any exception raised while reading the state is reported as HTTP 500
    with the exception text as the detail.
    """
    try:
        env: DistributedDebugEnv = app.state.env
        snapshot = env.state()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return snapshot
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@app.get("/health")
async def health() -> dict:
    """Liveness probe: report a fixed status and the API version."""
    body = {"status": "ok", "version": "1.0.0"}
    return body
|
server/app.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .api import app
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def main() -> None:
    """Serve the API with uvicorn on all interfaces, port 8000."""
    import uvicorn

    bind_host, bind_port = "0.0.0.0", 8000
    uvicorn.run(app, host=bind_host, port=bind_port)


if __name__ == "__main__":
    main()
|
server/constants.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class TaskName(str, Enum):
    """Identifiers of the fault scenarios the environment can run."""

    CASCADING_TIMEOUT = "cascading-timeout"
    BYZANTINE_QUEUE_FAULT = "byzantine-queue-fault"
    DISTRIBUTED_LOCK_STARVATION = "distributed-lock-starvation"
    BACKPRESSURE_CASCADE = "backpressure-cascade"
    ROUTE_PARTITION = "route-partition"
    REGISTRY_CORRUPTION = "registry-corruption"
    JOB_GENERATOR_RUNAWAY = "job-generator-runaway"

    @classmethod
    def parse(cls, value: str) -> "TaskName":
        """Return the member whose value equals *value*.

        Raises:
            ValueError: with the message ``Unknown task: <value>`` when no
                member matches.
        """
        try:
            member = cls(value)
        except ValueError as exc:
            message = f"Unknown task: {value}"
            raise ValueError(message) from exc
        return member
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Every task identifier, in enum declaration order.
ALL_TASKS = [member.value for member in TaskName]

# Baseline default: easy + medium + hard (deadlock/starvation included).
DEFAULT_BASELINE_TASK_ENUMS = [
    TaskName.CASCADING_TIMEOUT,
    TaskName.BYZANTINE_QUEUE_FAULT,
    TaskName.DISTRIBUTED_LOCK_STARVATION,
]

# The same baseline expressed as plain string identifiers.
DEFAULT_BASELINE_TASKS = [member.value for member in DEFAULT_BASELINE_TASK_ENUMS]

# Placeholder command string; the environment answers it with a
# "no command provided" message instead of executing it.
NO_COMMAND_PROVIDED_SENTINEL = "__NO_COMMAND_PROVIDED__"

# Step budget per task.
TASK_MAX_STEPS = {
    TaskName.CASCADING_TIMEOUT: 15,
    TaskName.BYZANTINE_QUEUE_FAULT: 18,
    TaskName.DISTRIBUTED_LOCK_STARVATION: 20,
    TaskName.BACKPRESSURE_CASCADE: 20,
    TaskName.ROUTE_PARTITION: 20,
    TaskName.REGISTRY_CORRUPTION: 18,
    TaskName.JOB_GENERATOR_RUNAWAY: 20,
}

# Known-good contents of each mesh configuration file.
DEFAULT_CONFIGS = {
    "auth": {
        "delay_ms": 200,
    },
    "gateway": {
        "auth_timeout_ms": 500,
    },
    "worker": {
        "db_pool_size": 10,
        "db_write_delay_ms": 0,
    },
    "job_generator": {
        "interval_ms": 333,
    },
    "blocked_routes": {
        "blocked": [],
    },
}
|
server/env.py
ADDED
|
@@ -0,0 +1,467 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from .constants import (
|
| 9 |
+
DEFAULT_CONFIGS,
|
| 10 |
+
NO_COMMAND_PROVIDED_SENTINEL,
|
| 11 |
+
TASK_MAX_STEPS,
|
| 12 |
+
TaskName,
|
| 13 |
+
)
|
| 14 |
+
from .fault_injector import inject_fault
|
| 15 |
+
from .graders import grade_task
|
| 16 |
+
from .metrics_poller import MetricsPoller
|
| 17 |
+
from .models import Action, Observation, StepResult
|
| 18 |
+
from .process_manager import ProcessManager
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class DistributedDebugEnv:
|
| 22 |
+
"""OpenEnv-compatible distributed systems debugging environment."""
|
| 23 |
+
|
| 24 |
+
def __init__(
    self, project_root: Path | None = None, mesh_root: Path | None = None
) -> None:
    """Resolve the working directories and initialise episode bookkeeping.

    Args:
        project_root: Repository root; defaults to the parent of the
            directory containing this module.
        mesh_root: Directory holding the mesh services; defaults to the
            ``MESH_ROOT`` environment variable, else ``<project_root>/mesh``.
    """
    if project_root:
        chosen_project_root = project_root
    else:
        chosen_project_root = Path(__file__).resolve().parent.parent
    self.project_root = chosen_project_root.resolve()

    if mesh_root:
        chosen_mesh_root = mesh_root
    else:
        chosen_mesh_root = Path(os.getenv("MESH_ROOT", self.project_root / "mesh"))
    self.mesh_root = chosen_mesh_root.resolve()

    self._process_manager = ProcessManager(
        project_root=self.project_root,
        mesh_root=self.mesh_root,
    )
    self._metrics_poller = MetricsPoller(poll_interval_s=2.0)

    # Per-episode state.
    self.current_task: TaskName | None = None
    self.max_steps: int = 0
    self.step_count: int = 0
    self.last_exit_code: int = 0
    self.prev_observation: Observation | None = None
    self._baselines: dict[str, int] = dict(
        baseline_worker_restart_count=0,
        baseline_consumer_stall_count=0,
    )
    self._seen_diagnostic_signatures: set[str] = set()
    self._command_counts: dict[str, int] = {}
    self._last_grader_score: float = 0.0
|
| 51 |
+
|
| 52 |
+
def start(self) -> None:
|
| 53 |
+
if not self._metrics_poller.is_alive():
|
| 54 |
+
self._metrics_poller.start()
|
| 55 |
+
|
| 56 |
+
def close(self) -> None:
|
| 57 |
+
self._metrics_poller.stop()
|
| 58 |
+
|
| 59 |
+
def _write_json(self, path: Path, payload: dict[str, Any]) -> None:
|
| 60 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 61 |
+
path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
|
| 62 |
+
|
| 63 |
+
def _restore_defaults(self) -> None:
    """Rewrite the registry and every service config file with default contents.

    Files are written in this order: registry, auth config, gateway config,
    gateway blocked routes, worker config, job generator config.
    """
    default_registry = {
        "services": {
            "auth": {"host": "localhost", "port": 3001, "protocol": "http"},
            "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
            "worker": {"host": "localhost", "port": None, "protocol": "internal"},
        }
    }
    self._write_json(self.mesh_root / "registry.json", default_registry)

    # (service directory, file name, key into DEFAULT_CONFIGS)
    config_targets = (
        ("auth", "config.json", "auth"),
        ("gateway", "config.json", "gateway"),
        ("gateway", "blocked_routes.json", "blocked_routes"),
        ("worker", "config.json", "worker"),
        ("worker", "job_generator_config.json", "job_generator"),
    )
    for service_dir, file_name, config_key in config_targets:
        destination = self.mesh_root / service_dir / file_name
        self._write_json(destination, DEFAULT_CONFIGS[config_key])
|
| 95 |
+
|
| 96 |
+
def _truncate_logs(self) -> None:
|
| 97 |
+
for service in ["gateway", "auth", "worker", "job_gen"]:
|
| 98 |
+
Path(f"/tmp/{service}.log").write_text("", encoding="utf-8")
|
| 99 |
+
|
| 100 |
+
def _reset_runtime_counters(self) -> None:
|
| 101 |
+
Path("/tmp/worker_restart_count").write_text("0", encoding="utf-8")
|
| 102 |
+
Path("/tmp/consumer_stall_count").write_text("0", encoding="utf-8")
|
| 103 |
+
|
| 104 |
+
def _redis_flush(self) -> None:
    """Run ``redis-cli FLUSHDB``.

    Raises:
        subprocess.CalledProcessError: when the command exits non-zero.
    """
    flush_argv = ["redis-cli", "FLUSHDB"]
    subprocess.run(flush_argv, check=True, capture_output=True, text=True)
|
| 108 |
+
|
| 109 |
+
def _read_float(self, value: str, default: float = 0.0) -> float:
|
| 110 |
+
try:
|
| 111 |
+
return float(value)
|
| 112 |
+
except (TypeError, ValueError):
|
| 113 |
+
return default
|
| 114 |
+
|
| 115 |
+
def _is_route_blocked(self) -> bool:
|
| 116 |
+
blocked_file = self.mesh_root / "gateway" / "blocked_routes.json"
|
| 117 |
+
try:
|
| 118 |
+
payload = json.loads(blocked_file.read_text(encoding="utf-8"))
|
| 119 |
+
blocked = payload.get("blocked", [])
|
| 120 |
+
return "gateway->redis" in blocked
|
| 121 |
+
except Exception:
|
| 122 |
+
return False
|
| 123 |
+
|
| 124 |
+
def _is_lock_present(self) -> bool:
|
| 125 |
+
result = subprocess.run(
|
| 126 |
+
["redis-cli", "EXISTS", "LOCK:job_processor"],
|
| 127 |
+
capture_output=True,
|
| 128 |
+
text=True,
|
| 129 |
+
timeout=2,
|
| 130 |
+
check=False,
|
| 131 |
+
)
|
| 132 |
+
return result.stdout.strip() == "1"
|
| 133 |
+
|
| 134 |
+
def _is_cascading_timeout_resolved(self) -> bool:
|
| 135 |
+
auth_config_file = self.mesh_root / "auth" / "config.json"
|
| 136 |
+
gateway_config_file = self.mesh_root / "gateway" / "config.json"
|
| 137 |
+
try:
|
| 138 |
+
auth_payload = json.loads(auth_config_file.read_text(encoding="utf-8"))
|
| 139 |
+
gateway_payload = json.loads(
|
| 140 |
+
gateway_config_file.read_text(encoding="utf-8")
|
| 141 |
+
)
|
| 142 |
+
except Exception:
|
| 143 |
+
return False
|
| 144 |
+
|
| 145 |
+
auth_delay_ms = self._read_float(auth_payload.get("delay_ms"), default=0.0)
|
| 146 |
+
auth_timeout_ms = self._read_float(
|
| 147 |
+
gateway_payload.get("auth_timeout_ms"), default=0.0
|
| 148 |
+
)
|
| 149 |
+
if auth_timeout_ms <= 0:
|
| 150 |
+
return False
|
| 151 |
+
return auth_delay_ms <= auth_timeout_ms
|
| 152 |
+
|
| 153 |
+
def _is_registry_auth_default(self) -> bool:
|
| 154 |
+
registry_file = self.mesh_root / "registry.json"
|
| 155 |
+
try:
|
| 156 |
+
payload = json.loads(registry_file.read_text(encoding="utf-8"))
|
| 157 |
+
auth_service = payload["services"]["auth"]
|
| 158 |
+
except Exception:
|
| 159 |
+
return False
|
| 160 |
+
|
| 161 |
+
return (
|
| 162 |
+
auth_service.get("host") == "localhost"
|
| 163 |
+
and int(auth_service.get("port", 0)) == 3001
|
| 164 |
+
and auth_service.get("protocol") == "http"
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
def _job_generator_interval_ms(self) -> int:
|
| 168 |
+
config_file = self.mesh_root / "worker" / "job_generator_config.json"
|
| 169 |
+
try:
|
| 170 |
+
payload = json.loads(config_file.read_text(encoding="utf-8"))
|
| 171 |
+
except Exception:
|
| 172 |
+
return 0
|
| 173 |
+
|
| 174 |
+
try:
|
| 175 |
+
return int(payload.get("interval_ms", 0))
|
| 176 |
+
except (TypeError, ValueError):
|
| 177 |
+
return 0
|
| 178 |
+
|
| 179 |
+
def _is_job_generator_rate_resolved(self) -> bool:
    """Report whether the configured enqueue interval is at least the default interval."""
    configured_interval = self._job_generator_interval_ms()
    default_interval = int(DEFAULT_CONFIGS["job_generator"]["interval_ms"])
    return configured_interval >= default_interval
|
| 183 |
+
|
| 184 |
+
def _build_grader_context(self) -> dict[str, Any]:
    """Assemble the dictionary handed to the graders.

    It starts from a copy of the stored baselines and adds the result of
    each live probe under a fixed key.
    """
    context: dict[str, Any] = dict(self._baselines)
    context["route_blocked"] = self._is_route_blocked()
    context["lock_exists"] = self._is_lock_present()
    context["cascading_timeout_resolved"] = self._is_cascading_timeout_resolved()
    context["registry_auth_matches_default"] = self._is_registry_auth_default()
    context["job_generator_interval_ms"] = self._job_generator_interval_ms()
    context["job_generator_rate_resolved"] = self._is_job_generator_rate_resolved()
    return context
|
| 194 |
+
|
| 195 |
+
def _blocked_command(self, command: str) -> bool:
    """Return ``True`` when *command* contains a deny-listed pattern.

    Matching is a case-insensitive substring test. Before matching, runs of
    whitespace are collapsed to one space and a variant with the space
    after ``>`` removed is also checked, so ``rm   -rf /`` and
    ``>/tmp/gateway.log`` cannot slip past patterns written with single
    spaces. Everything the single-space patterns matched before still
    matches.

    NOTE(review): this is a best-effort guard, not a sandbox; equivalent
    commands spelled differently (quoting, variables) are not detected.
    """
    dangerous_patterns = (
        "rm -rf /",
        "kill -9 1",
        "pkill -f uvicorn",
        "> /tmp/gateway.log",
        "> /tmp/auth.log",
        "> /tmp/worker.log",
    )
    normalized = " ".join(command.lower().split())
    # The shell accepts a redirect target directly after ">", so compare
    # both the command and the patterns with that optional space removed.
    compact = normalized.replace("> ", ">")
    return any(
        pattern in normalized or pattern.replace("> ", ">") in compact
        for pattern in dangerous_patterns
    )
|
| 206 |
+
|
| 207 |
+
def _run_command(self, command: str) -> tuple[str, str | None]:
    """Run one shell command and capture its combined output.

    Sets ``self.last_exit_code`` on every path: 2 for the "no command"
    sentinel, 1 for a deny-listed command or an execution error, 124 on
    timeout, otherwise the command's return code.

    Args:
        command: Shell snippet to execute via ``shell=True`` with ``/`` as
            the working directory and a fixed ``PATH``.

    Returns:
        ``(output, error)`` where ``output`` is stdout followed by stderr
        (or ``"(no output)"``) and ``error`` is ``None`` on a completed
        run, or an error tag/message otherwise.
    """
    timeout_s = 10

    if command.strip() == NO_COMMAND_PROVIDED_SENTINEL:
        self.last_exit_code = 2
        return (
            "No command provided by model. Expected JSON with a command field.",
            "no_command_provided",
        )

    if self._blocked_command(command):
        self.last_exit_code = 1
        return (
            "BLOCKED: This command would damage the environment infrastructure.",
            "blocked_command",
        )

    try:
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            # Output may not be valid UTF-8 (e.g. dumping a binary file);
            # substitute bad bytes instead of failing the whole step.
            errors="replace",
            timeout=timeout_s,
            cwd="/",
            env={
                **os.environ,
                "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
            },
            check=False,
        )
        self.last_exit_code = result.returncode
        output = (result.stdout + result.stderr).strip() or "(no output)"
        return output, None
    except subprocess.TimeoutExpired:
        self.last_exit_code = 124
        return f"Command timed out after {timeout_s} seconds.", "timeout"
    except Exception as exc:
        # Boundary handler: any launch failure is reported to the caller
        # as command output rather than propagated.
        self.last_exit_code = 1
        return f"Command execution error: {exc}", str(exc)
|
| 245 |
+
|
| 246 |
+
def _command_signature(self, command: str) -> str:
    """Normalise *command* for duplicate detection.

    Lower-cases the text and collapses every whitespace run to one space,
    so commands that differ only in case or spacing share a signature.
    """
    words = command.lower().split()
    return " ".join(words)
|
| 248 |
+
|
| 249 |
+
def _is_diagnostic_command(self, command: str) -> bool:
    """Return ``True`` when *command* uses a known inspection tool or verb.

    The command is lower-cased and split into word tokens (letters,
    digits, ``_`` and ``-``); it is diagnostic when any token equals one
    of the keywords. Whole-token matching avoids accidental hits inside
    longer words, e.g. ``ls`` in ``false`` or ``cat`` in ``truncate``.
    """
    import re  # local import: the file's import block is outside this block

    diagnostic_keywords = frozenset(
        {
            "cat",
            "curl",
            "redis-cli",
            "ps",
            "ls",
            "grep",
            "tail",
            "jq",
            "lrange",
            "llen",
            "keys",
            "ttl",
            "get",
        }
    )
    tokens = re.findall(r"[a-z0-9_-]+", command.lower())
    return not diagnostic_keywords.isdisjoint(tokens)
|
| 267 |
+
|
| 268 |
+
def _is_state_change_command(self, command: str) -> bool:
    """Return ``True`` when *command* looks like it mutates pipeline state.

    A case-insensitive substring test against a fixed list of markers:
    SIGHUP delivery, mutating ``redis-cli`` verbs, and writes that target
    ``/mesh/``.
    """
    lowered = command.lower()
    for marker in (
        "kill -hup",
        "redis-cli del",
        "redis-cli lrem",
        "redis-cli set",
        "redis-cli flushdb",
        "echo '{",
        "> /mesh/",
        "tee /mesh/",
    ):
        if marker in lowered:
            return True
    return False
|
| 281 |
+
|
| 282 |
+
def _compute_reward(
    self,
    command: str,
    current: Observation,
    previous: Observation,
    grader_score: float,
    previous_grader_score: float,
    command_error: str | None,
) -> float:
    """Compute the shaped per-step reward, clamped to ``[0.0, 1.0]``.

    Short-circuits to ``0.0`` when no command was provided and to ``1.0``
    once the grader score reaches 0.95. Otherwise the reward starts at 75%
    of the grader score and is adjusted by bonuses (first-seen diagnostic
    command, state-changing command, grader progress, improved metrics)
    and penalties (no progress, repeated command, no-op command, non-zero
    exit, blocked command).

    Side effects: increments the per-signature counter in
    ``self._command_counts`` and may add the signature to
    ``self._seen_diagnostic_signatures``. Neither happens on the two
    short-circuit paths.
    """
    if command_error == "no_command_provided":
        return 0.0
    if grader_score >= 0.95:
        return 1.0

    reward = grader_score * 0.75

    # Count this invocation before scoring so the repeat penalty sees it.
    signature = self._command_signature(command)
    signature_count = self._command_counts.get(signature, 0) + 1
    self._command_counts[signature] = signature_count

    first_time_diagnostic = (
        self._is_diagnostic_command(command)
        and signature not in self._seen_diagnostic_signatures
    )
    if first_time_diagnostic:
        reward += 0.02
        self._seen_diagnostic_signatures.add(signature)

    if self._is_state_change_command(command):
        reward += 0.03

    made_progress = grader_score > previous_grader_score + 1e-4
    reward += 0.15 if made_progress else -0.05

    now = current.metrics
    before = previous.metrics
    if now.gateway_success_rate > before.gateway_success_rate + 1e-3:
        reward += 0.05

    # Counters for which a strictly lower value earns a bonus.
    for field, bonus in (
        ("queue_depth", 0.05),
        ("worker_restart_count", 0.03),
        ("consumer_stall_count", 0.03),
    ):
        if getattr(now, field) < getattr(before, field):
            reward += bonus

    repeats = signature_count - 1
    if repeats > 0:
        reward -= min(0.12, 0.04 * repeats)

    if command.strip().lower() in {"echo", "pwd", "whoami", "date", "true", "false"}:
        reward -= 0.08

    command_failed = self.last_exit_code != 0
    if command_failed and command_error not in {"blocked_command", "no_command_provided"}:
        reward -= 0.08

    if command_error == "blocked_command":
        reward -= 0.25

    return max(0.0, min(1.0, reward))
|
| 355 |
+
|
| 356 |
+
def _status_block(self, metrics: Any) -> str:
    """Render the fixed post-reset status banner.

    The three service lines are constant text; only ``queue_depth`` and
    ``gateway_success_rate`` (two decimals) come from *metrics*.
    """
    segments = [
        "=== pipeline status after reset ===\n",
        "gateway: running\n",
        "auth: running\n",
        "worker: running\n",
        f"queue_depth: {metrics.queue_depth}\n",
        f"gateway_success_rate: {metrics.gateway_success_rate:.2f}",
    ]
    return "".join(segments)
|
| 365 |
+
|
| 366 |
+
def reset(self, task_name: TaskName | str) -> Observation:
    """Start a fresh episode for *task_name* and return the first observation.

    In order: clears per-episode bookkeeping, restores logs, defaults,
    Redis and runtime counters, records the task in ``/tmp/current_task``,
    restarts all services and waits for health, injects the task's fault,
    then polls metrics to capture baselines and the initial grader score.

    Args:
        task_name: A ``TaskName`` member, or a string parsed with
            ``TaskName.parse``.

    Raises:
        RuntimeError: If services are not healthy within 30 seconds of the
            restart.
    """
    if isinstance(task_name, str):
        task = TaskName.parse(task_name)
    else:
        task = task_name

    # Per-episode bookkeeping.
    self.current_task = task
    self.max_steps = TASK_MAX_STEPS[task]
    self.step_count = 0
    self._seen_diagnostic_signatures = set()
    self._command_counts = {}
    self._last_grader_score = 0.0

    # Bring the environment back to a clean slate before restarting.
    self._truncate_logs()
    self._restore_defaults()
    self._redis_flush()
    self._reset_runtime_counters()

    task_marker = Path("/tmp/current_task")
    task_marker.write_text(task.value, encoding="utf-8")

    manager = self._process_manager
    manager.restart_all()
    services_healthy = manager.wait_healthy(timeout_s=30)
    if not services_healthy:
        raise RuntimeError("Services failed health checks after reset")

    inject_fault(task, manager)
    # NOTE(review): presumably gives the injected fault time to show up in
    # the metrics before the baseline poll — confirm.
    time.sleep(1.0)

    poller = self._metrics_poller
    poller.poll_once()
    metrics = poller.get_current_metrics()

    # Graders measure restart/stall growth relative to these values.
    self._baselines = {
        "baseline_worker_restart_count": metrics.worker_restart_count,
        "baseline_consumer_stall_count": metrics.consumer_stall_count,
    }
    grader_context = self._build_grader_context()
    self._last_grader_score = grade_task(task, metrics, grader_context)

    first_observation = Observation(
        command_output=self._status_block(metrics),
        metrics=metrics,
        process_status=manager.get_status(),
    )
    self.prev_observation = first_observation
    return first_observation
|
| 408 |
+
|
| 409 |
+
def step(self, action: Action) -> StepResult:
    """Execute one agent action and return the resulting transition.

    Runs ``action.command``, polls metrics, grades the task, computes the
    shaped reward against the previous observation, and stores the new
    observation and grader score for the next step.

    The episode is done when the step budget is exhausted, or when the
    grader score reaches 0.95 on a step that actually supplied a command.

    Raises:
        RuntimeError: If ``reset`` has not selected a task yet.
    """
    if not self.current_task:
        raise RuntimeError(
            "Environment not initialized. Call reset(task_name) first."
        )

    self.step_count += 1
    command_output, command_error = self._run_command(action.command)

    poller = self._metrics_poller
    poller.poll_once()
    metrics = poller.get_current_metrics()

    observation = Observation(
        command_output=command_output,
        metrics=metrics,
        process_status=self._process_manager.get_status(),
    )

    # With no earlier observation, compare the new one against itself.
    previous = self.prev_observation or observation
    previous_grader_score = self._last_grader_score
    grader_context = self._build_grader_context()
    grader_score = grade_task(self.current_task, metrics, grader_context)
    reward = self._compute_reward(
        action.command,
        observation,
        previous,
        grader_score,
        previous_grader_score,
        command_error,
    )

    out_of_steps = self.step_count >= self.max_steps
    # A step without a command never ends the episode through the score.
    solved = command_error != "no_command_provided" and grader_score >= 0.95
    done = solved or out_of_steps

    self._last_grader_score = grader_score
    self.prev_observation = observation

    info: dict[str, Any] = {
        "grader_score": round(grader_score, 4),
        "error": command_error,
        "exit_code": self.last_exit_code,
        "task": self.current_task.value if self.current_task else None,
    }

    return StepResult(observation=observation, reward=reward, done=done, info=info)
|
| 456 |
+
|
| 457 |
+
def state(self) -> dict[str, Any]:
    """Return a snapshot of the environment after a fresh metrics poll.

    Returns:
        A dict with the task value (``None`` before ``reset``), step
        counters, dumped metrics, process status and a copy of the
        baselines.
    """
    poller = self._metrics_poller
    poller.poll_once()
    metrics = poller.get_current_metrics()

    task = self.current_task
    snapshot: dict[str, Any] = {}
    snapshot["task"] = task.value if task else None
    snapshot["step_count"] = self.step_count
    snapshot["max_steps"] = self.max_steps
    snapshot["metrics"] = metrics.model_dump()
    snapshot["process_status"] = self._process_manager.get_status()
    snapshot["baselines"] = dict(self._baselines)
    return snapshot
|
server/fault_injector.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import subprocess
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from .constants import TaskName
|
| 6 |
+
from .process_manager import ProcessManager
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _write_json(path: Path, payload: dict) -> None:
    """Write *payload* to *path* as 2-space-indented JSON plus a newline.

    Missing parent directories are created first; an existing file is
    overwritten.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
    document = json.dumps(payload, indent=2)
    path.write_text(document + "\n", encoding="utf-8")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def inject_cascading_timeout(pm: ProcessManager) -> None:
    """Make auth slower (1500 ms delay) than the gateway's 500 ms auth timeout.

    Both config files are written first; then auth and gateway are sent
    SIGHUP, in that order.
    """
    faulty_configs = {
        "auth": {"delay_ms": 1500},
        "gateway": {"auth_timeout_ms": 500},
    }
    for service, config in faulty_configs.items():
        _write_json(pm.mesh_root / service / "config.json", config)
    for service in faulty_configs:
        pm.sighup(service)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def inject_byzantine_queue_fault(pm: ProcessManager) -> None:
    """Push one malformed (non-JSON) job onto the ``job_queue`` Redis list.

    *pm* is accepted for signature parity with the other injectors and is
    not used. Raises ``subprocess.CalledProcessError`` if ``redis-cli``
    exits non-zero.
    """
    poison_job = '{"id":"poison-001","payload":{{BROKEN'
    argv = ["redis-cli", "LPUSH", "job_queue", poison_job]
    subprocess.run(argv, check=True)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def inject_distributed_lock_starvation(pm: ProcessManager) -> None:
    """Set the ``LOCK:job_processor`` Redis key to a fixed stale-owner value.

    The key is written without an expiry. *pm* is accepted for signature
    parity with the other injectors and is not used. Raises
    ``subprocess.CalledProcessError`` if ``redis-cli`` exits non-zero.
    """
    lock_key = "LOCK:job_processor"
    stale_owner = "dead-worker-pid-9999"
    subprocess.run(["redis-cli", "SET", lock_key, stale_owner], check=True)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def inject_backpressure_cascade(pm: ProcessManager) -> None:
    """Write a worker config with pool size 1 and an 800 ms write delay.

    The worker is then sent SIGHUP.
    """
    worker_config_path = pm.mesh_root / "worker" / "config.json"
    throttled_config = {"db_pool_size": 1, "db_write_delay_ms": 800}
    _write_json(worker_config_path, throttled_config)
    pm.sighup("worker")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def inject_route_partition(pm: ProcessManager) -> None:
    """List the ``gateway->redis`` route as blocked and SIGHUP the gateway."""
    blocked_routes_path = pm.mesh_root / "gateway" / "blocked_routes.json"
    blocked_routes = {"blocked": ["gateway->redis"]}
    _write_json(blocked_routes_path, blocked_routes)
    pm.sighup("gateway")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def inject_registry_corruption(pm: ProcessManager) -> None:
    """Rewrite ``registry.json`` with a bogus auth host and SIGHUP the gateway.

    Only the ``auth`` host (``invalid-auth-host``) is the corruption; the
    ``redis`` and ``worker`` entries are written alongside it.
    """

    def endpoint(host, port, protocol):
        return {"host": host, "port": port, "protocol": protocol}

    services = {
        "auth": endpoint("invalid-auth-host", 3001, "http"),
        "redis": endpoint("localhost", 6379, "tcp"),
        "worker": endpoint("localhost", None, "internal"),
    }
    _write_json(pm.mesh_root / "registry.json", {"services": services})
    pm.sighup("gateway")
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def inject_job_generator_runaway(pm: ProcessManager) -> None:
    """Set the job generator's ``interval_ms`` to 10 and SIGHUP it."""
    generator_config_path = pm.mesh_root / "worker" / "job_generator_config.json"
    runaway_config = {"interval_ms": 10}
    _write_json(generator_config_path, runaway_config)
    pm.sighup("job_generator")
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def inject_fault(task_name: TaskName | str, pm: ProcessManager) -> None:
    """Run the fault injector that belongs to *task_name*.

    Args:
        task_name: A ``TaskName`` member, or a string parsed with
            ``TaskName.parse``.
        pm: Process manager passed through to the injector.

    Raises:
        ValueError: If the task matches none of the known members.
    """
    if isinstance(task_name, str):
        task = TaskName.parse(task_name)
    else:
        task = task_name

    injectors = (
        (TaskName.CASCADING_TIMEOUT, inject_cascading_timeout),
        (TaskName.BYZANTINE_QUEUE_FAULT, inject_byzantine_queue_fault),
        (TaskName.DISTRIBUTED_LOCK_STARVATION, inject_distributed_lock_starvation),
        (TaskName.BACKPRESSURE_CASCADE, inject_backpressure_cascade),
        (TaskName.ROUTE_PARTITION, inject_route_partition),
        (TaskName.REGISTRY_CORRUPTION, inject_registry_corruption),
        (TaskName.JOB_GENERATOR_RUNAWAY, inject_job_generator_runaway),
    )
    # Members are compared by identity, first match wins.
    for member, injector in injectors:
        if task is member:
            injector(pm)
            return

    raise ValueError(f"Unknown task: {task_name}")
|
server/graders.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from .constants import TaskName
|
| 4 |
+
from .models import SystemMetrics
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _clamp(score: float) -> float:
    """Limit *score* to the closed interval ``[0.0, 1.0]``."""
    capped = min(1.0, score)
    return max(0.0, capped)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def grade_cascading_timeout(metrics: SystemMetrics, context: dict[str, Any]) -> float:
    """Score the cascading-timeout task.

    While ``cascading_timeout_resolved`` is falsy the score is a quarter of
    the gateway success rate, so a high rate cannot pass on its own. Once
    resolved: ``1.0`` at a success rate of 0.99 or more, otherwise
    ``0.4 + 0.4 * rate``.
    """
    success_rate = metrics.gateway_success_rate
    if not bool(context.get("cascading_timeout_resolved", False)):
        # Fault still active: cap the score well below a pass.
        return _clamp(success_rate * 0.25)
    if success_rate >= 0.99:
        return 1.0
    return _clamp(0.4 + success_rate * 0.4)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def grade_byzantine_queue_fault(
    metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Score the poison-message task from queue depth and worker restarts.

    With an empty queue the score is ``1.0`` when the worker restarted at
    most once since the baseline, else ``0.6``. With a non-empty queue it
    is ``1 - depth / 50`` (floored at 0) minus 0.05 per restart (capped at
    0.4), clamped to ``[0, 1]``.
    """
    baseline_restart = int(context.get("baseline_worker_restart_count", 0))
    restarts_since_baseline = max(0, metrics.worker_restart_count - baseline_restart)

    if metrics.queue_depth == 0:
        return 1.0 if restarts_since_baseline <= 1 else 0.6

    drained_fraction = max(0.0, 1.0 - metrics.queue_depth / 50.0)
    restart_penalty = min(0.4, restarts_since_baseline * 0.05)
    return _clamp(drained_fraction - restart_penalty)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def grade_distributed_lock_starvation(
    metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Score the stale-lock task.

    Lock gone: ``1.0`` when the queue depth is 3 or less, else ``0.6``.
    Lock still present (the default when the context has no
    ``lock_exists``): ``0.2`` if at most one stall was added since the
    baseline, else ``0.0``.
    """
    lock_exists = bool(context.get("lock_exists", True))
    baseline_stall = int(context.get("baseline_consumer_stall_count", 0))
    new_stalls = max(0, metrics.consumer_stall_count - baseline_stall)

    if lock_exists:
        # Partial credit only while stalls are not piling up.
        return 0.2 if new_stalls <= 1 else 0.0

    return 1.0 if metrics.queue_depth <= 3 else 0.6
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def grade_backpressure_cascade(metrics: SystemMetrics, _: dict[str, Any]) -> float:
    """Score the backpressure task as ``1 - queue_depth / 200``, clamped to ``[0, 1]``.

    The context argument is ignored.
    """
    backlog_ratio = metrics.queue_depth / 200.0
    return _clamp(1.0 - backlog_ratio)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def grade_route_partition(metrics: SystemMetrics, context: dict[str, Any]) -> float:
    """Score the route-partition task.

    ``0.0`` while the route is blocked (the default when the context has
    no ``route_blocked``). Once unblocked: ``1.0`` at a gateway success
    rate of 0.95 or more, otherwise the clamped success rate.
    """
    if bool(context.get("route_blocked", True)):
        return 0.0

    success_rate = metrics.gateway_success_rate
    if success_rate >= 0.95:
        return 1.0
    return _clamp(success_rate)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def grade_registry_corruption(metrics: SystemMetrics, context: dict[str, Any]) -> float:
    """Score the registry-corruption task.

    While ``registry_auth_matches_default`` is falsy the score is 30% of
    the gateway success rate. Once it holds: ``1.0`` at a success rate of
    0.99 or more, otherwise ``0.5 + 0.5 * rate``.
    """
    registry_restored = bool(context.get("registry_auth_matches_default", False))
    success_rate = metrics.gateway_success_rate

    if not registry_restored:
        return _clamp(success_rate * 0.3)
    if success_rate >= 0.99:
        return 1.0
    return _clamp(0.5 + success_rate * 0.5)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def grade_job_generator_runaway(
    metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Score the runaway-generator task from the rate flag and queue depth.

    Rate not resolved: ``0.2`` when the queue depth is 30 or less, else
    ``0.0``. Rate resolved: ``1.0`` up to depth 5, ``0.7`` up to depth 30,
    then 0.01 less per extra job, clamped to ``[0, 1]``.
    """
    depth = metrics.queue_depth

    if not bool(context.get("job_generator_rate_resolved", False)):
        return 0.2 if depth <= 30 else 0.0

    if depth <= 5:
        return 1.0
    if depth <= 30:
        return 0.7
    return _clamp(0.7 - (depth - 30) / 100.0)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def grade_task(
    task_name: TaskName | str, metrics: SystemMetrics, context: dict[str, Any]
) -> float:
    """Dispatch to the grader for *task_name* and return its score.

    Args:
        task_name: A ``TaskName`` member, or a string parsed with
            ``TaskName.parse``.
        metrics: Current system metrics.
        context: Extra grading inputs such as baselines and probe results.

    Returns:
        The grader's score, or ``0.0`` when the task matches no known
        member.
    """
    if isinstance(task_name, str):
        task = TaskName.parse(task_name)
    else:
        task = task_name

    graders = (
        (TaskName.CASCADING_TIMEOUT, grade_cascading_timeout),
        (TaskName.BYZANTINE_QUEUE_FAULT, grade_byzantine_queue_fault),
        (TaskName.DISTRIBUTED_LOCK_STARVATION, grade_distributed_lock_starvation),
        (TaskName.BACKPRESSURE_CASCADE, grade_backpressure_cascade),
        (TaskName.ROUTE_PARTITION, grade_route_partition),
        (TaskName.REGISTRY_CORRUPTION, grade_registry_corruption),
        (TaskName.JOB_GENERATOR_RUNAWAY, grade_job_generator_runaway),
    )
    # Members are compared by identity, first match wins.
    for member, grader in graders:
        if task is member:
            return grader(metrics, context)
    return 0.0
|
server/metrics_poller.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
import threading
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
|
| 7 |
+
from .models import SystemMetrics
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MetricsPoller(threading.Thread):
    """Background metrics poller with last-known-good caching.

    Each poll merges the gateway ``/health`` payload, the Redis
    ``job_queue`` length and two counter files under ``/tmp`` into one
    snapshot. A source that cannot be read keeps its previous value
    (gateway, queue depth) or reads as ``0`` (counter files); polling
    itself does not raise for unavailable sources.
    """

    def __init__(self, poll_interval_s: float = 2.0) -> None:
        """Create a daemon poller thread that sleeps *poll_interval_s* between polls."""
        super().__init__(daemon=True)
        self.poll_interval_s = poll_interval_s
        self._stop_event = threading.Event()
        self._lock = threading.Lock()
        self._latest: dict[str, float | int] = {
            "gateway_success_rate": 0.0,
            "gateway_p99_latency_ms": 0.0,
            "queue_depth": 0,
            "worker_restart_count": 0,
            "consumer_stall_count": 0,
        }

    def stop(self) -> None:
        """Ask the background loop to exit after its current iteration."""
        self._stop_event.set()

    def _read_counter(self, path: str) -> int:
        """Read a non-negative integer counter from *path*.

        Returns ``0`` when the file is missing, unreadable, empty or not an
        integer. Negative values are floored at ``0`` because the metrics
        model rejects them.
        """
        try:
            raw = Path(path).read_text().strip()
            return max(0, int(raw or "0"))
        except (OSError, ValueError):
            # Reading directly (no exists() pre-check) also covers a file
            # that disappears between the check and the read.
            return 0

    def _poll_gateway(self) -> dict[str, float]:
        """Fetch success rate and p99 latency from the gateway health endpoint.

        Accepts either the short keys (``success_rate``, ``p99_latency_ms``)
        or the ``gateway_``-prefixed ones. Raises on any transport, status
        or payload problem; the caller decides how to handle it.
        """
        with httpx.Client(timeout=1.0) as client:
            response = client.get("http://localhost:3000/health")
            response.raise_for_status()
            payload = response.json()

        success_rate = float(
            payload.get("success_rate", payload.get("gateway_success_rate", 0.0))
        )
        p99 = float(
            payload.get("p99_latency_ms", payload.get("gateway_p99_latency_ms", 0.0))
        )
        return {
            "gateway_success_rate": max(0.0, min(1.0, success_rate)),
            "gateway_p99_latency_ms": max(0.0, p99),
        }

    def _last_queue_depth(self) -> int:
        """Return the queue depth from the most recent stored snapshot."""
        with self._lock:
            return int(self._latest["queue_depth"])

    def _poll_queue_depth(self) -> int:
        """Return ``LLEN job_queue`` via ``redis-cli``.

        Falls back to the last known depth when ``redis-cli`` is missing,
        times out, exits non-zero or prints something that is not an
        integer.
        """
        try:
            result = subprocess.run(
                ["redis-cli", "LLEN", "job_queue"],
                capture_output=True,
                text=True,
                timeout=2,
                check=False,
            )
        except (subprocess.TimeoutExpired, OSError):
            # An escaping exception here would kill the poller thread or
            # fail the caller's step.
            return self._last_queue_depth()

        if result.returncode != 0:
            return self._last_queue_depth()
        try:
            return max(0, int(result.stdout.strip() or "0"))
        except ValueError:
            return self._last_queue_depth()

    def poll_once(self) -> None:
        """Collect all metrics once and publish them as the latest snapshot."""
        with self._lock:
            snapshot = dict(self._latest)

        try:
            snapshot.update(self._poll_gateway())
        except Exception:
            # Deliberate best-effort: keep the previous gateway values
            # while the gateway is unreachable or returns a bad payload.
            pass

        snapshot["queue_depth"] = self._poll_queue_depth()
        snapshot["worker_restart_count"] = self._read_counter(
            "/tmp/worker_restart_count"
        )
        snapshot["consumer_stall_count"] = self._read_counter(
            "/tmp/consumer_stall_count"
        )

        with self._lock:
            self._latest = snapshot

    def run(self) -> None:
        """Poll repeatedly until ``stop`` is called."""
        while not self._stop_event.is_set():
            self.poll_once()
            # Event.wait doubles as an interruptible sleep.
            self._stop_event.wait(self.poll_interval_s)

    def get_current_metrics(self) -> SystemMetrics:
        """Return the latest snapshot validated as a ``SystemMetrics`` model."""
        with self._lock:
            snapshot = dict(self._latest)
        return SystemMetrics.model_validate(snapshot)
|
server/models.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field, field_validator
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SystemMetrics(BaseModel):
    """Point-in-time pipeline metrics; every field is required and non-negative."""

    # Gateway success rate, constrained to the range 0.0-1.0.
    gateway_success_rate: float = Field(..., ge=0.0, le=1.0)
    # Gateway p99 latency (milliseconds, per the field name).
    gateway_p99_latency_ms: float = Field(..., ge=0.0)
    # Number of entries currently in the job queue.
    queue_depth: int = Field(..., ge=0)
    # Worker restart counter.
    worker_restart_count: int = Field(..., ge=0)
    # Consumer stall counter.
    consumer_stall_count: int = Field(..., ge=0)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class Observation(BaseModel):
    """What the agent sees after a reset or a step."""

    # Required: combined output text of the last executed command.
    command_output: str = Field(
        ..., description="stdout+stderr from the last executed command"
    )
    # Required: metrics snapshot that accompanies the output.
    metrics: SystemMetrics
    # Service name -> status string; defaults to an empty dict.
    process_status: dict[str, str] = Field(default_factory=dict)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class Action(BaseModel):
    """A single agent action: one bash command to run."""

    command: str = Field(..., description="Single bash command to execute")

    @field_validator("command")
    @classmethod
    def command_must_not_be_empty(cls, value: str) -> str:
        """Reject empty or whitespace-only commands; return the value unchanged."""
        if value.strip():
            return value
        raise ValueError("command must not be empty")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class Reward(BaseModel):
    """Wrapper for a reward value."""

    # Required; validated to lie within 0.0-1.0 inclusive.
    value: float = Field(..., ge=0.0, le=1.0)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class StepResult(BaseModel):
    """Result of one environment step: observation, reward, done flag and extras."""

    # Observation produced by the step.
    observation: Observation
    # Reward for the step; validated to lie within 0.0-1.0 inclusive.
    reward: float = Field(..., ge=0.0, le=1.0)
    # Whether the episode has ended.
    done: bool
    # Free-form diagnostic details; defaults to an empty dict.
    info: dict[str, Any] = Field(default_factory=dict)
|
server/process_manager.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import signal
|
| 3 |
+
import subprocess
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import httpx
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ProcessManager:
|
| 11 |
+
"""Tracks Bun service processes and lifecycle state."""
|
| 12 |
+
|
| 13 |
+
def __init__(
|
| 14 |
+
self, project_root: Path | None = None, mesh_root: Path | None = None
|
| 15 |
+
) -> None:
|
| 16 |
+
self.project_root = (project_root or Path(__file__).resolve().parent.parent).resolve()
|
| 17 |
+
self.mesh_root = (
|
| 18 |
+
mesh_root or Path(os.getenv("MESH_ROOT", self.project_root / "mesh"))
|
| 19 |
+
).resolve()
|
| 20 |
+
|
| 21 |
+
self._service_scripts = {
|
| 22 |
+
"gateway": self.project_root / "mesh" / "gateway" / "index.ts",
|
| 23 |
+
"auth": self.project_root / "mesh" / "auth" / "index.ts",
|
| 24 |
+
"worker": self.project_root / "mesh" / "worker" / "index.ts",
|
| 25 |
+
}
|
| 26 |
+
self._job_generator_script = (
|
| 27 |
+
self.project_root / "mesh" / "worker" / "job_generator.ts"
|
| 28 |
+
)
|
| 29 |
+
self._health_urls = {
|
| 30 |
+
"gateway": "http://localhost:3000/health",
|
| 31 |
+
"auth": "http://localhost:3001/health",
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
self._processes: dict[str, subprocess.Popen[str]] = {}
|
| 35 |
+
self._log_handles: dict[str, object] = {}
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def _pid_path(service: str) -> Path:
|
| 39 |
+
return Path(f"/tmp/{service}.pid")
|
| 40 |
+
|
| 41 |
+
@staticmethod
|
| 42 |
+
def _is_pid_alive(pid: int) -> bool:
|
| 43 |
+
try:
|
| 44 |
+
os.kill(pid, 0)
|
| 45 |
+
return True
|
| 46 |
+
except OSError:
|
| 47 |
+
return False
|
| 48 |
+
|
| 49 |
+
def _read_pid(self, service: str) -> int | None:
|
| 50 |
+
path = self._pid_path(service)
|
| 51 |
+
if not path.exists():
|
| 52 |
+
return None
|
| 53 |
+
try:
|
| 54 |
+
pid = int(path.read_text().strip())
|
| 55 |
+
except (TypeError, ValueError):
|
| 56 |
+
return None
|
| 57 |
+
return pid if self._is_pid_alive(pid) else None
|
| 58 |
+
|
| 59 |
+
def _write_pid(self, service: str, pid: int) -> None:
|
| 60 |
+
self._pid_path(service).write_text(str(pid))
|
| 61 |
+
|
| 62 |
+
def _spawn_service(self, service: str, script: Path, log_path: Path) -> None:
|
| 63 |
+
log_handle = open(log_path, "a", encoding="utf-8")
|
| 64 |
+
env = {
|
| 65 |
+
**os.environ,
|
| 66 |
+
"MESH_ROOT": str(self.mesh_root),
|
| 67 |
+
}
|
| 68 |
+
process = subprocess.Popen(
|
| 69 |
+
["bun", "run", str(script)],
|
| 70 |
+
cwd=str(self.project_root),
|
| 71 |
+
stdout=log_handle,
|
| 72 |
+
stderr=subprocess.STDOUT,
|
| 73 |
+
text=True,
|
| 74 |
+
env=env,
|
| 75 |
+
)
|
| 76 |
+
self._processes[service] = process
|
| 77 |
+
self._log_handles[service] = log_handle
|
| 78 |
+
self._write_pid(service, process.pid)
|
| 79 |
+
|
| 80 |
+
def start_all(self) -> None:
|
| 81 |
+
for service, script in self._service_scripts.items():
|
| 82 |
+
existing_pid = self._read_pid(service)
|
| 83 |
+
if existing_pid:
|
| 84 |
+
continue
|
| 85 |
+
self._spawn_service(service, script, Path(f"/tmp/{service}.log"))
|
| 86 |
+
|
| 87 |
+
if not self._read_pid("job_generator"):
|
| 88 |
+
self._spawn_service(
|
| 89 |
+
"job_generator", self._job_generator_script, Path("/tmp/job_gen.log")
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
def _terminate_pid(self, pid: int, timeout_s: float = 0.5) -> None:
|
| 93 |
+
try:
|
| 94 |
+
os.kill(pid, signal.SIGTERM)
|
| 95 |
+
except ProcessLookupError:
|
| 96 |
+
return
|
| 97 |
+
|
| 98 |
+
deadline = time.time() + timeout_s
|
| 99 |
+
while time.time() < deadline:
|
| 100 |
+
if not self._is_pid_alive(pid):
|
| 101 |
+
return
|
| 102 |
+
time.sleep(0.05)
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
os.kill(pid, signal.SIGKILL)
|
| 106 |
+
except ProcessLookupError:
|
| 107 |
+
return
|
| 108 |
+
|
| 109 |
+
def stop_all(self) -> None:
|
| 110 |
+
for service in ["gateway", "auth", "worker", "job_generator"]:
|
| 111 |
+
pid = self._read_pid(service)
|
| 112 |
+
if pid:
|
| 113 |
+
self._terminate_pid(pid)
|
| 114 |
+
pid_path = self._pid_path(service)
|
| 115 |
+
if pid_path.exists():
|
| 116 |
+
pid_path.unlink(missing_ok=True)
|
| 117 |
+
|
| 118 |
+
for handle in self._log_handles.values():
|
| 119 |
+
try:
|
| 120 |
+
handle.close()
|
| 121 |
+
except Exception:
|
| 122 |
+
pass
|
| 123 |
+
|
| 124 |
+
self._processes.clear()
|
| 125 |
+
self._log_handles.clear()
|
| 126 |
+
|
| 127 |
+
def restart_all(self) -> None:
|
| 128 |
+
self.stop_all()
|
| 129 |
+
self.start_all()
|
| 130 |
+
|
| 131 |
+
def sighup(self, service: str) -> None:
|
| 132 |
+
pid = self._read_pid(service)
|
| 133 |
+
if not pid:
|
| 134 |
+
raise RuntimeError(f"Service not running: {service}")
|
| 135 |
+
os.kill(pid, signal.SIGHUP)
|
| 136 |
+
|
| 137 |
+
def wait_healthy(self, timeout_s: int = 30) -> bool:
|
| 138 |
+
deadline = time.time() + timeout_s
|
| 139 |
+
with httpx.Client(timeout=1.0) as client:
|
| 140 |
+
while time.time() < deadline:
|
| 141 |
+
try:
|
| 142 |
+
gateway_ok = (
|
| 143 |
+
client.get(self._health_urls["gateway"]).status_code == 200
|
| 144 |
+
)
|
| 145 |
+
auth_ok = client.get(self._health_urls["auth"]).status_code == 200
|
| 146 |
+
if gateway_ok and auth_ok:
|
| 147 |
+
return True
|
| 148 |
+
except Exception:
|
| 149 |
+
pass
|
| 150 |
+
time.sleep(1)
|
| 151 |
+
return False
|
| 152 |
+
|
| 153 |
+
def get_status(self) -> dict[str, str]:
|
| 154 |
+
status: dict[str, str] = {}
|
| 155 |
+
for service in ["gateway", "auth", "worker", "job_generator"]:
|
| 156 |
+
pid = self._read_pid(service)
|
| 157 |
+
status[service] = f"running pid={pid}" if pid else "stopped"
|
| 158 |
+
return status
|
| 159 |
+
|
| 160 |
+
def get_pid(self, service: str) -> int | None:
|
| 161 |
+
return self._read_pid(service)
|
| 162 |
+
|
| 163 |
+
def close(self) -> None:
    """Release the manager by delegating to ``stop_all``."""
    self.stop_all()
|
setup-dev.sh
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 5 |
+
VENV_DIR="${ROOT_DIR}/.venv"
|
| 6 |
+
REQUIRED_PYTHON="3.12"
|
| 7 |
+
PYTHON_BIN="python3.12"
|
| 8 |
+
|
| 9 |
+
if ! command -v "$PYTHON_BIN" >/dev/null && command -v python3 >/dev/null; then
|
| 10 |
+
if [ "$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" = "$REQUIRED_PYTHON" ]; then
|
| 11 |
+
PYTHON_BIN="python3"
|
| 12 |
+
fi
|
| 13 |
+
fi
|
| 14 |
+
|
| 15 |
+
# Abort the setup when a mandatory tool is not on PATH.
#   $1 - command name to look up
#   $2 - install hint printed alongside the error
require_cmd() {
  local name="$1"
  local install_hint="$2"
  if ! command -v "$name" >/dev/null; then
    # Diagnostics go to stderr so they are not lost when stdout is piped.
    echo "[ERROR] Missing required command: $name" >&2
    echo " Install hint: $install_hint" >&2
    exit 1
  fi
}
|
| 24 |
+
|
| 25 |
+
# Warn (without failing) when an optional tool is not on PATH.
#   $1 - command name to look up
#   $2 - note explaining what the tool is used for
optional_cmd_note() {
  local name="$1"
  local note="$2"
  if ! command -v "$name" >/dev/null; then
    # Warnings are diagnostics too: send them to stderr, like require_cmd errors.
    echo "[WARN] Optional command not found: $name" >&2
    echo " $note" >&2
  fi
}
|
| 33 |
+
|
| 34 |
+
echo "[INFO] Preflight checks"
|
| 35 |
+
require_cmd "$PYTHON_BIN" "Install Python 3.12 (binary: python3.12 or python3==3.12)"
|
| 36 |
+
require_cmd bun "Install Bun: https://bun.sh"
|
| 37 |
+
require_cmd redis-server "Install Redis server"
|
| 38 |
+
require_cmd redis-cli "Install Redis CLI"
|
| 39 |
+
require_cmd curl "Install curl"
|
| 40 |
+
require_cmd jq "Install jq"
|
| 41 |
+
|
| 42 |
+
optional_cmd_note docker "Needed for local container validation and HF parity checks."
|
| 43 |
+
optional_cmd_note uv "Used for regenerating uv.lock when dependencies change."
|
| 44 |
+
|
| 45 |
+
echo "[INFO] Creating virtual environment"
# If a venv already exists, make sure it was built with the required Python
# version; otherwise throw it away so it is recreated below.
if [ -x "$VENV_DIR/bin/python" ]; then
  # The interpreter path is quoted so a checkout directory containing spaces
  # does not get word-split inside the command substitution.
  EXISTING_PY_VERSION="$("$VENV_DIR/bin/python" -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
  if [ "$EXISTING_PY_VERSION" != "$REQUIRED_PYTHON" ]; then
    echo "[INFO] Recreating .venv with Python $REQUIRED_PYTHON (found $EXISTING_PY_VERSION)"
    rm -rf "$VENV_DIR"
  fi
fi

if [ ! -d "$VENV_DIR" ]; then
  "$PYTHON_BIN" -m venv "$VENV_DIR"
fi
|
| 57 |
+
|
| 58 |
+
# shellcheck disable=SC1091
|
| 59 |
+
source "$VENV_DIR/bin/activate"
|
| 60 |
+
|
| 61 |
+
python -m pip install --upgrade pip
|
| 62 |
+
pip install -r "$ROOT_DIR/requirements.txt"
|
| 63 |
+
|
| 64 |
+
echo "[INFO] Installing Bun dependencies"
|
| 65 |
+
( cd "$ROOT_DIR/mesh/gateway" && bun install )
|
| 66 |
+
( cd "$ROOT_DIR/mesh/auth" && bun install )
|
| 67 |
+
( cd "$ROOT_DIR/mesh/worker" && bun install )
|
| 68 |
+
|
| 69 |
+
chmod +x "$ROOT_DIR/start.sh"
|
| 70 |
+
chmod +x "$ROOT_DIR/inference.py" || true
|
| 71 |
+
|
| 72 |
+
if command -v uv >/dev/null && [ ! -f "$ROOT_DIR/uv.lock" ]; then
|
| 73 |
+
echo "[INFO] Generating uv.lock"
|
| 74 |
+
( cd "$ROOT_DIR" && uv lock )
|
| 75 |
+
fi
|
| 76 |
+
|
| 77 |
+
echo "[INFO] Running OpenEnv validation"
|
| 78 |
+
openenv validate "$ROOT_DIR"
|
| 79 |
+
|
| 80 |
+
echo "[INFO] Setup complete"
|
| 81 |
+
echo "[NEXT] Export required inference vars:"
|
| 82 |
+
echo " API_BASE_URL=<endpoint>"
|
| 83 |
+
echo " MODEL_NAME=<model>"
|
| 84 |
+
echo " HF_TOKEN=<api_key>"
|
| 85 |
+
echo "[NEXT] Start services: APP_ROOT=$ROOT_DIR MESH_ROOT=$ROOT_DIR/mesh ./start.sh"
|
| 86 |
+
echo "[NEXT] Run baseline: HF_TOKEN=... API_BASE_URL=... MODEL_NAME=... python inference.py"
|
start.sh
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Entrypoint: start Redis and the mesh services, wait for the gateway and auth
# health endpoints, then replace this shell with the uvicorn API server.
set -euo pipefail

export APP_ROOT="${APP_ROOT:-/home/user/app}"
export MESH_ROOT="${MESH_ROOT:-/mesh}"

mkdir -p /tmp

# Clear leftovers from a previous run BEFORE launching Redis. The *.log glob
# also matches /tmp/redis.log; removing it after launch would unlink the file
# Redis is writing to and its log output would be lost.
rm -f /tmp/*.pid /tmp/*.log /tmp/worker_restart_count /tmp/consumer_stall_count /tmp/current_task
printf '0' > /tmp/worker_restart_count
printf '0' > /tmp/consumer_stall_count

redis-server --daemonize yes --logfile /tmp/redis.log --port 6379

# Wait for Redis to answer PING, but give up after ~30s instead of looping forever.
redis_ready=0
for _ in $(seq 1 150); do
  if redis-cli ping >/dev/null 2>&1; then
    redis_ready=1
    break
  fi
  sleep 0.2
done
if [ "$redis_ready" -ne 1 ]; then
  echo "[ERROR] redis-server did not answer PING; see /tmp/redis.log" >&2
  exit 1
fi

# Reset the service registry to its known-good contents.
cat > "${MESH_ROOT}/registry.json" <<'EOF'
{
  "services": {
    "auth": {"host": "localhost", "port": 3001, "protocol": "http"},
    "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
    "worker": {"host": "localhost", "port": null, "protocol": "internal"}
  }
}
EOF

# Start each service with an empty log file.
: > /tmp/gateway.log
: > /tmp/auth.log
: > /tmp/worker.log
: > /tmp/job_gen.log

# Launch each mesh service in the background and record its PID.
bun run "${APP_ROOT}/mesh/gateway/index.ts" >> /tmp/gateway.log &
echo $! > /tmp/gateway.pid

bun run "${APP_ROOT}/mesh/auth/index.ts" >> /tmp/auth.log &
echo $! > /tmp/auth.pid

bun run "${APP_ROOT}/mesh/worker/index.ts" >> /tmp/worker.log &
echo $! > /tmp/worker.pid

bun run "${APP_ROOT}/mesh/worker/job_generator.ts" >> /tmp/job_gen.log &
echo $! > /tmp/job_generator.pid

# Give gateway (3000) and auth (3001) up to 45s to report healthy.
mesh_healthy=0
for _ in $(seq 1 45); do
  if curl -sf http://localhost:3000/health >/dev/null && curl -sf http://localhost:3001/health >/dev/null; then
    mesh_healthy=1
    break
  fi
  sleep 1
done
if [ "$mesh_healthy" -ne 1 ]; then
  # Same as before: the API still starts, but the timeout is no longer silent.
  echo "[WARN] gateway/auth health checks did not pass within 45s; starting API anyway" >&2
fi

exec uvicorn server.api:app --host 0.0.0.0 --port 8000
|
tests/test_api_reset.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi.testclient import TestClient
|
| 2 |
+
|
| 3 |
+
from server import api
|
| 4 |
+
from server.constants import TaskName
|
| 5 |
+
from server.models import Observation, SystemMetrics
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class _FakeEnv:
    """Stand-in environment that records ``reset`` calls and returns a canned observation."""

    def __init__(self) -> None:
        # Every task name passed to reset(), in call order.
        self.reset_calls: list[TaskName] = []

    def start(self) -> None:
        """No-op."""
        return None

    def close(self) -> None:
        """No-op."""
        return None

    def reset(self, task_name: TaskName) -> Observation:
        """Record *task_name* and return a fixed all-zero observation."""
        self.reset_calls.append(task_name)
        zeroed = SystemMetrics(
            gateway_success_rate=0.0,
            gateway_p99_latency_ms=0.0,
            queue_depth=0,
            worker_restart_count=0,
            consumer_stall_count=0,
        )
        return Observation(
            command_output="ready",
            metrics=zeroed,
            process_status={"gateway": "running"},
        )
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_reset_defaults_to_cascading_timeout_when_task_missing(monkeypatch) -> None:
    """POST /reset with an empty body resets the env with CASCADING_TIMEOUT."""
    created: list[_FakeEnv] = []

    def fake_env_factory() -> _FakeEnv:
        fake = _FakeEnv()
        created.append(fake)
        return fake

    monkeypatch.setattr(api, "DistributedDebugEnv", fake_env_factory)

    with TestClient(api.app) as client:
        response = client.post("/reset", json={})

    assert response.status_code == 200
    # The most recently constructed env is the one the app is using.
    assert created[-1].reset_calls == [TaskName.CASCADING_TIMEOUT]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_reset_rejects_unknown_explicit_task(monkeypatch) -> None:
    """An unknown ``task_name`` query parameter yields HTTP 400 and no reset."""
    created: list[_FakeEnv] = []

    def fake_env_factory() -> _FakeEnv:
        fake = _FakeEnv()
        created.append(fake)
        return fake

    monkeypatch.setattr(api, "DistributedDebugEnv", fake_env_factory)

    with TestClient(api.app) as client:
        response = client.post(
            "/reset",
            params={"task_name": "not-a-task"},
            json={},
        )

    assert response.status_code == 400
    assert response.json()["detail"] == "Unknown task: not-a-task"
    assert created[-1].reset_calls == []
|
tests/test_env_task_context.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from server.constants import DEFAULT_CONFIGS
|
| 5 |
+
from server.env import DistributedDebugEnv
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _write_json(path: Path, payload: dict) -> None:
|
| 9 |
+
path.parent.mkdir(parents=True, exist_ok=True)
|
| 10 |
+
path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_restore_defaults_adds_job_generator_config(tmp_path: Path) -> None:
    """``_restore_defaults`` writes the default job-generator config file."""
    mesh_root = tmp_path / "mesh"
    env = DistributedDebugEnv(project_root=tmp_path, mesh_root=mesh_root)

    env._restore_defaults()

    config_file = mesh_root / "worker" / "job_generator_config.json"
    written = json.loads(config_file.read_text())
    assert written == DEFAULT_CONFIGS["job_generator"]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_registry_auth_matches_default_detects_corruption(tmp_path: Path) -> None:
    """The registry check passes after defaults and fails once the auth host is changed."""
    mesh_root = tmp_path / "mesh"
    env = DistributedDebugEnv(project_root=tmp_path, mesh_root=mesh_root)
    env._restore_defaults()

    assert env._is_registry_auth_default() is True

    # Same registry shape, but the auth entry points at a bogus host.
    corrupted = {
        "services": {
            "auth": {"host": "invalid-host", "port": 3001, "protocol": "http"},
            "redis": {"host": "localhost", "port": 6379, "protocol": "tcp"},
            "worker": {"host": "localhost", "port": None, "protocol": "internal"},
        }
    }
    _write_json(mesh_root / "registry.json", corrupted)

    assert env._is_registry_auth_default() is False
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_job_generator_rate_resolved_uses_config(tmp_path: Path) -> None:
    """Interval and the "rate resolved" flag follow the job-generator config on disk."""
    mesh_root = tmp_path / "mesh"
    env = DistributedDebugEnv(project_root=tmp_path, mesh_root=mesh_root)
    env._restore_defaults()

    assert env._job_generator_interval_ms() == 333
    assert env._is_job_generator_rate_resolved() is True

    config_file = mesh_root / "worker" / "job_generator_config.json"
    _write_json(config_file, {"interval_ms": 10})

    assert env._job_generator_interval_ms() == 10
    assert env._is_job_generator_rate_resolved() is False
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.graders import (
|
| 2 |
+
grade_backpressure_cascade,
|
| 3 |
+
grade_byzantine_queue_fault,
|
| 4 |
+
grade_cascading_timeout,
|
| 5 |
+
grade_job_generator_runaway,
|
| 6 |
+
grade_registry_corruption,
|
| 7 |
+
grade_distributed_lock_starvation,
|
| 8 |
+
grade_route_partition,
|
| 9 |
+
)
|
| 10 |
+
from server.models import SystemMetrics
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _metrics(
    *,
    success_rate: float = 0.0,
    p99: float = 1000.0,
    depth: int = 10,
    restarts: int = 0,
    stalls: int = 0,
) -> SystemMetrics:
    """Build a ``SystemMetrics`` from short keyword names, with test-friendly defaults."""
    fields = {
        "gateway_success_rate": success_rate,
        "gateway_p99_latency_ms": p99,
        "queue_depth": depth,
        "worker_restart_count": restarts,
        "consumer_stall_count": stalls,
    }
    return SystemMetrics(**fields)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_grade_cascading_timeout_boundaries() -> None:
    """Check cascading-timeout scores for resolved and unresolved contexts."""
    # (success_rate, resolved flag, expected score)
    cases = (
        (1.0, True, 1.0),
        (1.0, False, 0.25),
        (0.5, False, 0.125),
    )
    for rate, resolved, expected in cases:
        score = grade_cascading_timeout(
            _metrics(success_rate=rate), {"cascading_timeout_resolved": resolved}
        )
        assert score == expected
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_grade_byzantine_queue_fault_cases() -> None:
    """Check byzantine-queue scores against a baseline restart count of 3."""
    ctx = {"baseline_worker_restart_count": 3}
    # (queue depth, restart count, expected score)
    for depth, restarts, expected in ((0, 3, 1.0), (0, 8, 0.6), (40, 10, 0.0)):
        score = grade_byzantine_queue_fault(
            _metrics(depth=depth, restarts=restarts), ctx
        )
        assert score == expected
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_grade_distributed_lock_starvation_cases() -> None:
    """Check lock-starvation scores with the lock present and absent."""
    ctx_locked = {"baseline_consumer_stall_count": 0, "lock_exists": True}
    ctx_unlocked = {"baseline_consumer_stall_count": 0, "lock_exists": False}

    # (queue depth, stall count, context, expected score)
    cases = (
        (2, 0, ctx_unlocked, 1.0),
        (10, 0, ctx_unlocked, 0.6),
        (10, 3, ctx_locked, 0.0),
    )
    for depth, stalls, ctx, expected in cases:
        score = grade_distributed_lock_starvation(
            _metrics(depth=depth, stalls=stalls), ctx
        )
        assert score == expected
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_grade_backpressure_cascade_continuous() -> None:
    """Check backpressure scores at queue depths 0, 100 and 200."""
    for depth, expected in ((0, 1.0), (100, 0.5), (200, 0.0)):
        assert grade_backpressure_cascade(_metrics(depth=depth), {}) == expected
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def test_grade_route_partition_threshold() -> None:
    """Check route-partition scores for an unblocked and a blocked route."""
    unblocked_score = grade_route_partition(
        _metrics(success_rate=0.96), {"route_blocked": False}
    )
    assert unblocked_score == 1.0

    blocked_score = grade_route_partition(
        _metrics(success_rate=0.8), {"route_blocked": True}
    )
    assert blocked_score == 0.0
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def test_grade_registry_corruption_thresholds() -> None:
    """Check registry-corruption scores across success rates and registry state."""
    # (success_rate, registry matches default, expected score)
    cases = (
        (0.99, True, 1.0),
        (0.8, True, 0.9),
        (1.0, False, 0.3),
    )
    for rate, matches, expected in cases:
        score = grade_registry_corruption(
            _metrics(success_rate=rate), {"registry_auth_matches_default": matches}
        )
        assert score == expected
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_grade_job_generator_runaway_thresholds() -> None:
    """Check job-generator-runaway scores across queue depths and rate state."""
    # (queue depth, rate resolved, expected score)
    cases = (
        (4, True, 1.0),
        (20, True, 0.7),
        (20, False, 0.2),
    )
    for depth, resolved, expected in cases:
        score = grade_job_generator_runaway(
            _metrics(depth=depth), {"job_generator_rate_resolved": resolved}
        )
        assert score == expected
|
tests/test_inference_format.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
from server.constants import TaskName
|
| 4 |
+
from inference import (
|
| 5 |
+
_attempt_history_block,
|
| 6 |
+
_episode_score,
|
| 7 |
+
_format_end_line,
|
| 8 |
+
_parse_tasks,
|
| 9 |
+
_single_line,
|
| 10 |
+
_task_symptom_block,
|
| 11 |
+
build_prompt,
|
| 12 |
+
extract_command,
|
| 13 |
+
extract_reasoning,
|
| 14 |
+
)
|
| 15 |
+
from server.models import Observation, SystemMetrics
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_extract_command_rejects_non_json_code_fence() -> None:
    """A fenced shell snippet (no JSON object) yields no command."""
    fenced_shell = "```bash\nredis-cli LLEN job_queue\n```"
    result = extract_command(fenced_shell)
    assert result is None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_extract_command_returns_none_when_empty() -> None:
    """Whitespace-only model output yields no command."""
    blank = " "
    assert extract_command(blank) is None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_extract_command_reads_json_payload() -> None:
    """A bare JSON object with a ``command`` key is parsed."""
    payload = '{"command":"redis-cli LLEN job_queue"}'
    parsed = extract_command(payload)
    assert parsed == "redis-cli LLEN job_queue"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_extract_command_reads_fenced_json_payload() -> None:
    """A JSON object wrapped in a ```json fence is parsed."""
    fenced = '```json\n{"command":"ps -ef"}\n```'
    parsed = extract_command(fenced)
    assert parsed == "ps -ef"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_extract_command_reads_json_embedded_in_text() -> None:
    """A JSON object surrounded by prose is still found."""
    chatter = 'Use this command: {"command":"redis-cli LLEN job_queue"} thanks.'
    parsed = extract_command(chatter)
    assert parsed == "redis-cli LLEN job_queue"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_extract_command_reads_json_after_reasoning_preamble() -> None:
    """Command and reasoning are both read from JSON that follows a prose line."""
    preamble = "I'll start by checking process state.\n"
    payload = '{"command":"ps aux","reasoning":"list processes"}'
    raw = preamble + payload
    assert extract_command(raw) == "ps aux"
    assert extract_reasoning(raw) == "list processes"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_extract_command_prefers_first_json_object_with_command() -> None:
    """A leading JSON object without ``command`` is skipped in favour of a later one."""
    two_objects = '{"meta":"skip"} then {"command":"ls -la","reasoning":"explore"}'
    parsed = extract_command(two_objects)
    assert parsed == "ls -la"
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def test_extract_reasoning_when_present() -> None:
    """Both fields are extracted when the payload carries command and reasoning."""
    payload = '{"command":"redis-cli LLEN job_queue","reasoning":"check queue depth first"}'
    command = extract_command(payload)
    assert command == "redis-cli LLEN job_queue"
    reasoning = extract_reasoning(payload)
    assert reasoning == "check queue depth first"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def test_extract_command_requires_command_even_with_reasoning() -> None:
    """A payload with only ``reasoning`` yields neither a command nor reasoning."""
    reasoning_only = '{"reasoning":"i should inspect logs"}'
    assert extract_command(reasoning_only) is None
    assert extract_reasoning(reasoning_only) is None
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def test_single_line_removes_newlines() -> None:
    """A newline between two commands is rendered as a single space."""
    flattened = _single_line("echo a\necho b")
    assert flattened == "echo a echo b"
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def test_task_symptom_block_is_non_empty() -> None:
    """The route-partition symptom text describes the issue without the task id."""
    symptoms = _task_symptom_block(TaskName.ROUTE_PARTITION)
    assert "connectivity path issue" in symptoms
    # The prompt must not contain the literal task identifier.
    assert "route-partition" not in symptoms
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_task_symptom_block_includes_new_tasks() -> None:
    """Registry-corruption and job-generator-runaway both have symptom text."""
    registry_text = _task_symptom_block(TaskName.REGISTRY_CORRUPTION)
    runaway_text = _task_symptom_block(TaskName.JOB_GENERATOR_RUNAWAY)

    assert "registry" in registry_text.lower()
    assert "queue" in runaway_text.lower()
    # The literal task identifier must not appear in the symptom text.
    assert "job-generator-runaway" not in runaway_text
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def test_attempt_history_block_renders_all_attempts() -> None:
    """Every attempt's command, reasoning and error is rendered; rewards are not."""
    first = {
        "step": 1,
        "command": "redis-cli LLEN job_queue",
        "reasoning": "check backlog",
        "reward": 0.12,
        "error": None,
    }
    second = {
        "step": 2,
        "command": "curl -s localhost:3000/health",
        "reasoning": None,
        "reward": 0.08,
        "error": "timeout",
    }
    rendered = _attempt_history_block([first, second])

    expected_fragments = (
        "step 1: command=redis-cli LLEN job_queue",
        "step 2: command=curl -s localhost:3000/health",
        "reasoning=check backlog",
        "error=timeout",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
    assert "reward=" not in rendered
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_build_prompt_contains_symptoms_and_history() -> None:
    """The prompt carries symptom, history and output sections but no reward values."""
    degraded = SystemMetrics(
        gateway_success_rate=0.32,
        gateway_p99_latency_ms=1500.0,
        queue_depth=412,
        worker_restart_count=3,
        consumer_stall_count=2,
    )
    obs = Observation(
        command_output="service checks show partial failures",
        metrics=degraded,
        process_status={"gateway": "running", "worker": "running"},
    )
    earlier_attempt = {
        "step": 1,
        "command": "redis-cli LLEN job_queue",
        "reasoning": "measure backlog",
        "reward": 0.10,
        "error": None,
    }

    prompt = build_prompt(
        obs=obs,
        step_num=3,
        task_name=TaskName.BACKPRESSURE_CASCADE,
        attempt_history=[earlier_attempt],
    )

    required_sections = (
        "TASK SYMPTOMS:",
        "PREVIOUS ATTEMPTS:",
        "step 1: command=redis-cli LLEN job_queue",
        "LATEST COMMAND OUTPUT:",
    )
    for section in required_sections:
        assert section in prompt
    assert "reward=" not in prompt
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def test_parse_tasks_default_and_override() -> None:
    """``_parse_tasks`` uses its default list when TASKS_CSV is unset, else the CSV."""
    saved = os.getenv("TASKS_CSV")
    # (TASKS_CSV value, expected parsed tasks)
    overrides = (
        (
            "route-partition,backpressure-cascade",
            [TaskName.ROUTE_PARTITION, TaskName.BACKPRESSURE_CASCADE],
        ),
        (
            "registry-corruption,job-generator-runaway",
            [TaskName.REGISTRY_CORRUPTION, TaskName.JOB_GENERATOR_RUNAWAY],
        ),
    )
    try:
        os.environ.pop("TASKS_CSV", None)
        assert _parse_tasks() == [
            TaskName.CASCADING_TIMEOUT,
            TaskName.BYZANTINE_QUEUE_FAULT,
            TaskName.DISTRIBUTED_LOCK_STARVATION,
        ]

        for csv_value, expected in overrides:
            os.environ["TASKS_CSV"] = csv_value
            assert _parse_tasks() == expected
    finally:
        # Put the caller's environment back exactly as it was.
        if saved is not None:
            os.environ["TASKS_CSV"] = saved
        else:
            os.environ.pop("TASKS_CSV", None)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def test_episode_score_clamps_terminal_reward_to_unit_interval() -> None:
    """Check the episode score for empty, in-range and out-of-range reward lists."""
    # (reward list, expected score)
    cases = (
        ([], 0.0),
        ([0.2, 0.8], 0.8),
        ([1.2], 1.0),
        ([-0.1], 0.0),
    )
    for rewards, expected in cases:
        assert _episode_score(rewards) == expected
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def test_end_log_line_includes_score_and_reward_list() -> None:
    """The [END] line renders success, steps, a 2-decimal score and the reward list."""
    expected = "[END] success=true steps=3 score=0.99 rewards=0.00,0.12,1.00"
    rendered = _format_end_line(
        success=True,
        steps=3,
        score=0.987,
        rewards=[0.0, 0.125, 1.0],
    )
    assert rendered == expected
|
tests/test_models.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import ValidationError
|
| 2 |
+
|
| 3 |
+
from server.models import Action, Observation, SystemMetrics
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_system_metrics_rejects_success_rate_above_one() -> None:
    """Constructing ``SystemMetrics`` with a success rate of 1.2 must fail validation."""
    out_of_range = {
        "gateway_success_rate": 1.2,
        "gateway_p99_latency_ms": 20,
        "queue_depth": 0,
        "worker_restart_count": 0,
        "consumer_stall_count": 0,
    }
    try:
        SystemMetrics(**out_of_range)
    except ValidationError:
        pass
    else:
        raise AssertionError("Expected ValidationError for success rate > 1.0")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_observation_roundtrip() -> None:
    """An ``Observation`` survives a JSON dump followed by a JSON validate."""
    sample_metrics = SystemMetrics(
        gateway_success_rate=0.7,
        gateway_p99_latency_ms=123,
        queue_depth=3,
        worker_restart_count=1,
        consumer_stall_count=2,
    )
    original = Observation(
        command_output="ok",
        metrics=sample_metrics,
        process_status={"gateway": "running pid=42"},
    )

    encoded = original.model_dump_json()
    restored = Observation.model_validate_json(encoded)
    assert restored == original
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_action_rejects_empty_command() -> None:
    """An ``Action`` whose command is only whitespace must fail validation."""
    try:
        Action(command=" ")
    except ValidationError:
        pass
    else:
        raise AssertionError("Expected ValidationError for empty command")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|