Upload folder using huggingface_hub
Browse files
- Dockerfile +1 -0
- README.md +11 -0
- __init__.py +13 -0
- client.py +56 -0
- models.py +9 -0
Dockerfile
CHANGED
|
@@ -37,4 +37,5 @@ RUN chmod +x ./start.sh
|
|
| 37 |
RUN ln -sfn /home/user/app/mesh /mesh
|
| 38 |
|
| 39 |
EXPOSE 8000
|
|
|
|
| 40 |
CMD ["./start.sh"]
|
|
|
|
| 37 |
RUN ln -sfn /home/user/app/mesh /mesh
|
| 38 |
|
| 39 |
EXPOSE 8000
|
| 40 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 41 |
CMD ["./start.sh"]
|
README.md
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Distributed Systems Debug Environment
|
| 2 |
|
| 3 |
## Overview
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: distributed-systems-debug-env
|
| 3 |
+
sdk: docker
|
| 4 |
+
app_port: 8000
|
| 5 |
+
colorFrom: blue
|
| 6 |
+
colorTo: indigo
|
| 7 |
+
short_description: OpenEnv RL env for debugging distributed systems failures.
|
| 8 |
+
base_path: /web
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
|
| 12 |
# Distributed Systems Debug Environment
|
| 13 |
|
| 14 |
## Overview
|
__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Distributed systems debug OpenEnv package."""
|
| 2 |
+
|
| 3 |
+
from .client import DistributedSystemsDebugEnv
|
| 4 |
+
from .models import Action, Observation, Reward, StepResult, SystemMetrics
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"DistributedSystemsDebugEnv",
|
| 8 |
+
"Action",
|
| 9 |
+
"Observation",
|
| 10 |
+
"Reward",
|
| 11 |
+
"StepResult",
|
| 12 |
+
"SystemMetrics",
|
| 13 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
+
|
| 3 |
+
from openenv.core import EnvClient
|
| 4 |
+
from openenv.core.client_types import StepResult as ClientStepResult
|
| 5 |
+
from openenv.core.env_server.types import State
|
| 6 |
+
|
| 7 |
+
from .models import Action, Observation, SystemMetrics
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DistributedSystemsDebugEnv(EnvClient[Action, Observation]):
    """Client wrapper around the environment HTTP API.

    Defines the payload conversion hooks for ``EnvClient``: serializing an
    ``Action`` for a step request, and parsing response payloads into a step
    result and a ``State``.
    """

    def _step_payload(self, action: Action) -> dict[str, Any]:
        """Return the request payload for ``action`` (its ``model_dump()``)."""
        return action.model_dump()

    def _parse_result(self, payload: dict[str, Any]) -> ClientStepResult[Observation]:
        """Build a step result from a decoded response payload.

        Missing *or null* fields fall back to defaults: an empty command
        output, zeroed metrics, an empty process-status mapping and
        ``done=False``. ``reward`` stays ``None`` when the payload carries
        no reward.

        Args:
            payload: Mapping with optional ``observation``, ``reward`` and
                ``done`` entries.

        Returns:
            A ``ClientStepResult`` wrapping the parsed ``Observation``.
        """
        observation_payload = payload.get("observation") or {}
        metrics_payload = observation_payload.get("metrics") or {}

        def metric(name: str, default: Any) -> Any:
            # ``dict.get(name, default)`` only covers a missing key; a key
            # that is present with a null value must fall back as well,
            # otherwise ``float(None)`` / ``int(None)`` raises TypeError.
            value = metrics_payload.get(name)
            return default if value is None else value

        observation = Observation(
            command_output=str(observation_payload.get("command_output") or ""),
            metrics=SystemMetrics(
                gateway_success_rate=float(metric("gateway_success_rate", 0.0)),
                gateway_p99_latency_ms=float(metric("gateway_p99_latency_ms", 0.0)),
                queue_depth=int(metric("queue_depth", 0)),
                worker_restart_count=int(metric("worker_restart_count", 0)),
                consumer_stall_count=int(metric("consumer_stall_count", 0)),
            ),
            # Keys and values are coerced to ``str`` regardless of how they
            # arrived in the payload.
            process_status={
                str(key): str(value)
                for key, value in dict(
                    observation_payload.get("process_status") or {}
                ).items()
            },
        )

        reward = payload.get("reward")
        return ClientStepResult(
            observation=observation,
            reward=float(reward) if reward is not None else None,
            done=bool(payload.get("done", False)),
        )

    def _parse_state(self, payload: dict[str, Any]) -> State:
        """Build a ``State`` from a decoded state payload.

        Args:
            payload: Mapping with optional ``task`` and ``step_count`` entries.

        Returns:
            A ``State`` whose ``step_count`` defaults to 0 when the field is
            missing or null.
        """
        step_count = payload.get("step_count")
        return State(
            # NOTE(review): the payload's "task" value is used as the episode
            # id -- confirm this matches the server's state schema.
            episode_id=payload.get("task"),
            step_count=int(0 if step_count is None else step_count),
        )
|
models.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from server.models import Action, Observation, Reward, StepResult, SystemMetrics
|
| 2 |
+
|
| 3 |
+
__all__ = [
|
| 4 |
+
"Action",
|
| 5 |
+
"Observation",
|
| 6 |
+
"Reward",
|
| 7 |
+
"StepResult",
|
| 8 |
+
"SystemMetrics",
|
| 9 |
+
]
|