Upload folder using huggingface_hub
Browse files
- __init__.py +1 -13
- server/graders.py +15 -14
- tests/test_graders.py +11 -11
__init__.py
CHANGED
|
@@ -1,13 +1 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
from .client import DistributedSystemsDebugEnv
|
| 4 |
-
from .models import Action, Observation, Reward, StepResult, SystemMetrics
|
| 5 |
-
|
| 6 |
-
__all__ = [
|
| 7 |
-
"DistributedSystemsDebugEnv",
|
| 8 |
-
"Action",
|
| 9 |
-
"Observation",
|
| 10 |
-
"Reward",
|
| 11 |
-
"StepResult",
|
| 12 |
-
"SystemMetrics",
|
| 13 |
-
]
|
|
|
|
| 1 |
+
"""Package marker for OpenEnv structure validation."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server/graders.py
CHANGED
|
@@ -5,13 +5,14 @@ from .models import SystemMetrics
|
|
| 5 |
|
| 6 |
|
| 7 |
def _clamp(score: float) -> float:
|
| 8 |
-
return max(0.0, min(1.0, score))
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def grade_cascading_timeout(metrics: SystemMetrics, context: dict[str, Any]) -> float:
|
| 12 |
timeout_resolved = bool(context.get("cascading_timeout_resolved", False))
|
| 13 |
if timeout_resolved and metrics.gateway_success_rate >= 0.99:
|
| 14 |
-
return 1.0
|
| 15 |
if not timeout_resolved:
|
| 16 |
# Prevent instant pass while the injected timeout fault is still active.
|
| 17 |
return _clamp(metrics.gateway_success_rate * 0.25)
|
|
@@ -25,9 +26,9 @@ def grade_byzantine_queue_fault(
|
|
| 25 |
restart_delta = max(0, metrics.worker_restart_count - baseline_restart)
|
| 26 |
|
| 27 |
if metrics.queue_depth == 0 and restart_delta <= 1:
|
| 28 |
-
return 1.0
|
| 29 |
if metrics.queue_depth == 0:
|
| 30 |
-
return 0.6
|
| 31 |
|
| 32 |
queue_component = max(0.0, 1.0 - metrics.queue_depth / 50.0)
|
| 33 |
stability_penalty = min(0.4, restart_delta * 0.05)
|
|
@@ -42,12 +43,12 @@ def grade_distributed_lock_starvation(
|
|
| 42 |
stall_delta = max(0, metrics.consumer_stall_count - baseline_stall)
|
| 43 |
|
| 44 |
if not lock_exists and metrics.queue_depth <= 3:
|
| 45 |
-
return 1.0
|
| 46 |
if not lock_exists:
|
| 47 |
-
return 0.6
|
| 48 |
|
| 49 |
# If lock still exists, reward slight progress only when stalls don't explode.
|
| 50 |
-
return 0.2 if stall_delta <= 1 else 0.0
|
| 51 |
|
| 52 |
|
| 53 |
def grade_backpressure_cascade(metrics: SystemMetrics, _: dict[str, Any]) -> float:
|
|
@@ -57,10 +58,10 @@ def grade_backpressure_cascade(metrics: SystemMetrics, _: dict[str, Any]) -> flo
|
|
| 57 |
def grade_route_partition(metrics: SystemMetrics, context: dict[str, Any]) -> float:
|
| 58 |
route_blocked = bool(context.get("route_blocked", True))
|
| 59 |
if not route_blocked and metrics.gateway_success_rate >= 0.95:
|
| 60 |
-
return 1.0
|
| 61 |
if not route_blocked:
|
| 62 |
return _clamp(metrics.gateway_success_rate)
|
| 63 |
-
return 0.0
|
| 64 |
|
| 65 |
|
| 66 |
def grade_registry_corruption(metrics: SystemMetrics, context: dict[str, Any]) -> float:
|
|
@@ -68,7 +69,7 @@ def grade_registry_corruption(metrics: SystemMetrics, context: dict[str, Any]) -
|
|
| 68 |
context.get("registry_auth_matches_default", False)
|
| 69 |
)
|
| 70 |
if registry_auth_matches_default and metrics.gateway_success_rate >= 0.99:
|
| 71 |
-
return 1.0
|
| 72 |
if registry_auth_matches_default:
|
| 73 |
return _clamp(0.5 + metrics.gateway_success_rate * 0.5)
|
| 74 |
return _clamp(metrics.gateway_success_rate * 0.3)
|
|
@@ -79,12 +80,12 @@ def grade_job_generator_runaway(
|
|
| 79 |
) -> float:
|
| 80 |
rate_resolved = bool(context.get("job_generator_rate_resolved", False))
|
| 81 |
if rate_resolved and metrics.queue_depth <= 5:
|
| 82 |
-
return 1.0
|
| 83 |
if rate_resolved and metrics.queue_depth <= 30:
|
| 84 |
-
return 0.7
|
| 85 |
if rate_resolved:
|
| 86 |
return _clamp(0.7 - (metrics.queue_depth - 30) / 100.0)
|
| 87 |
-
return 0.2 if metrics.queue_depth <= 30 else 0.0
|
| 88 |
|
| 89 |
|
| 90 |
def grade_task(
|
|
@@ -106,4 +107,4 @@ def grade_task(
|
|
| 106 |
return grade_registry_corruption(metrics, context)
|
| 107 |
if task is TaskName.JOB_GENERATOR_RUNAWAY:
|
| 108 |
return grade_job_generator_runaway(metrics, context)
|
| 109 |
-
return 0.0
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
def _clamp(score: float) -> float:
|
| 8 |
+
"""Clamp score to the open interval (0, 1), as required by the evaluation pipeline."""
|
| 9 |
+
return max(0.01, min(0.99, score))
|
| 10 |
|
| 11 |
|
| 12 |
def grade_cascading_timeout(metrics: SystemMetrics, context: dict[str, Any]) -> float:
|
| 13 |
timeout_resolved = bool(context.get("cascading_timeout_resolved", False))
|
| 14 |
if timeout_resolved and metrics.gateway_success_rate >= 0.99:
|
| 15 |
+
return _clamp(1.0)
|
| 16 |
if not timeout_resolved:
|
| 17 |
# Prevent instant pass while the injected timeout fault is still active.
|
| 18 |
return _clamp(metrics.gateway_success_rate * 0.25)
|
|
|
|
| 26 |
restart_delta = max(0, metrics.worker_restart_count - baseline_restart)
|
| 27 |
|
| 28 |
if metrics.queue_depth == 0 and restart_delta <= 1:
|
| 29 |
+
return _clamp(1.0)
|
| 30 |
if metrics.queue_depth == 0:
|
| 31 |
+
return _clamp(0.6)
|
| 32 |
|
| 33 |
queue_component = max(0.0, 1.0 - metrics.queue_depth / 50.0)
|
| 34 |
stability_penalty = min(0.4, restart_delta * 0.05)
|
|
|
|
| 43 |
stall_delta = max(0, metrics.consumer_stall_count - baseline_stall)
|
| 44 |
|
| 45 |
if not lock_exists and metrics.queue_depth <= 3:
|
| 46 |
+
return _clamp(1.0)
|
| 47 |
if not lock_exists:
|
| 48 |
+
return _clamp(0.6)
|
| 49 |
|
| 50 |
# If lock still exists, reward slight progress only when stalls don't explode.
|
| 51 |
+
return _clamp(0.2) if stall_delta <= 1 else _clamp(0.0)
|
| 52 |
|
| 53 |
|
| 54 |
def grade_backpressure_cascade(metrics: SystemMetrics, _: dict[str, Any]) -> float:
|
|
|
|
| 58 |
def grade_route_partition(metrics: SystemMetrics, context: dict[str, Any]) -> float:
|
| 59 |
route_blocked = bool(context.get("route_blocked", True))
|
| 60 |
if not route_blocked and metrics.gateway_success_rate >= 0.95:
|
| 61 |
+
return _clamp(1.0)
|
| 62 |
if not route_blocked:
|
| 63 |
return _clamp(metrics.gateway_success_rate)
|
| 64 |
+
return _clamp(0.0)
|
| 65 |
|
| 66 |
|
| 67 |
def grade_registry_corruption(metrics: SystemMetrics, context: dict[str, Any]) -> float:
|
|
|
|
| 69 |
context.get("registry_auth_matches_default", False)
|
| 70 |
)
|
| 71 |
if registry_auth_matches_default and metrics.gateway_success_rate >= 0.99:
|
| 72 |
+
return _clamp(1.0)
|
| 73 |
if registry_auth_matches_default:
|
| 74 |
return _clamp(0.5 + metrics.gateway_success_rate * 0.5)
|
| 75 |
return _clamp(metrics.gateway_success_rate * 0.3)
|
|
|
|
| 80 |
) -> float:
|
| 81 |
rate_resolved = bool(context.get("job_generator_rate_resolved", False))
|
| 82 |
if rate_resolved and metrics.queue_depth <= 5:
|
| 83 |
+
return _clamp(1.0)
|
| 84 |
if rate_resolved and metrics.queue_depth <= 30:
|
| 85 |
+
return _clamp(0.7)
|
| 86 |
if rate_resolved:
|
| 87 |
return _clamp(0.7 - (metrics.queue_depth - 30) / 100.0)
|
| 88 |
+
return _clamp(0.2) if metrics.queue_depth <= 30 else _clamp(0.0)
|
| 89 |
|
| 90 |
|
| 91 |
def grade_task(
|
|
|
|
| 107 |
return grade_registry_corruption(metrics, context)
|
| 108 |
if task is TaskName.JOB_GENERATOR_RUNAWAY:
|
| 109 |
return grade_job_generator_runaway(metrics, context)
|
| 110 |
+
return _clamp(0.0)
|
tests/test_graders.py
CHANGED
|
@@ -32,7 +32,7 @@ def test_grade_cascading_timeout_boundaries() -> None:
|
|
| 32 |
grade_cascading_timeout(
|
| 33 |
_metrics(success_rate=1.0), {"cascading_timeout_resolved": True}
|
| 34 |
)
|
| 35 |
-
== 1.0
|
| 36 |
)
|
| 37 |
assert (
|
| 38 |
grade_cascading_timeout(
|
|
@@ -50,9 +50,9 @@ def test_grade_cascading_timeout_boundaries() -> None:
|
|
| 50 |
|
| 51 |
def test_grade_byzantine_queue_fault_cases() -> None:
|
| 52 |
ctx = {"baseline_worker_restart_count": 3}
|
| 53 |
-
assert grade_byzantine_queue_fault(_metrics(depth=0, restarts=3), ctx) == 1.0
|
| 54 |
assert grade_byzantine_queue_fault(_metrics(depth=0, restarts=8), ctx) == 0.6
|
| 55 |
-
assert grade_byzantine_queue_fault(_metrics(depth=40, restarts=10), ctx) == 0.0
|
| 56 |
|
| 57 |
|
| 58 |
def test_grade_distributed_lock_starvation_cases() -> None:
|
|
@@ -61,7 +61,7 @@ def test_grade_distributed_lock_starvation_cases() -> None:
|
|
| 61 |
|
| 62 |
assert (
|
| 63 |
grade_distributed_lock_starvation(_metrics(depth=2, stalls=0), ctx_unlocked)
|
| 64 |
-
== 1.0
|
| 65 |
)
|
| 66 |
assert (
|
| 67 |
grade_distributed_lock_starvation(_metrics(depth=10, stalls=0), ctx_unlocked)
|
|
@@ -69,24 +69,24 @@ def test_grade_distributed_lock_starvation_cases() -> None:
|
|
| 69 |
)
|
| 70 |
assert (
|
| 71 |
grade_distributed_lock_starvation(_metrics(depth=10, stalls=3), ctx_locked)
|
| 72 |
-
== 0.0
|
| 73 |
)
|
| 74 |
|
| 75 |
|
| 76 |
def test_grade_backpressure_cascade_continuous() -> None:
|
| 77 |
-
assert grade_backpressure_cascade(_metrics(depth=0), {}) == 1.0
|
| 78 |
assert grade_backpressure_cascade(_metrics(depth=100), {}) == 0.5
|
| 79 |
-
assert grade_backpressure_cascade(_metrics(depth=200), {}) == 0.0
|
| 80 |
|
| 81 |
|
| 82 |
def test_grade_route_partition_threshold() -> None:
|
| 83 |
assert (
|
| 84 |
grade_route_partition(_metrics(success_rate=0.96), {"route_blocked": False})
|
| 85 |
-
== 1.0
|
| 86 |
)
|
| 87 |
assert (
|
| 88 |
grade_route_partition(_metrics(success_rate=0.8), {"route_blocked": True})
|
| 89 |
-
== 0.0
|
| 90 |
)
|
| 91 |
|
| 92 |
|
|
@@ -95,7 +95,7 @@ def test_grade_registry_corruption_thresholds() -> None:
|
|
| 95 |
grade_registry_corruption(
|
| 96 |
_metrics(success_rate=0.99), {"registry_auth_matches_default": True}
|
| 97 |
)
|
| 98 |
-
== 1.0
|
| 99 |
)
|
| 100 |
assert (
|
| 101 |
grade_registry_corruption(
|
|
@@ -116,7 +116,7 @@ def test_grade_job_generator_runaway_thresholds() -> None:
|
|
| 116 |
grade_job_generator_runaway(
|
| 117 |
_metrics(depth=4), {"job_generator_rate_resolved": True}
|
| 118 |
)
|
| 119 |
-
== 1.0
|
| 120 |
)
|
| 121 |
assert (
|
| 122 |
grade_job_generator_runaway(
|
|
|
|
| 32 |
grade_cascading_timeout(
|
| 33 |
_metrics(success_rate=1.0), {"cascading_timeout_resolved": True}
|
| 34 |
)
|
| 35 |
+
== 0.99
|
| 36 |
)
|
| 37 |
assert (
|
| 38 |
grade_cascading_timeout(
|
|
|
|
| 50 |
|
| 51 |
def test_grade_byzantine_queue_fault_cases() -> None:
|
| 52 |
ctx = {"baseline_worker_restart_count": 3}
|
| 53 |
+
assert grade_byzantine_queue_fault(_metrics(depth=0, restarts=3), ctx) == 0.99
|
| 54 |
assert grade_byzantine_queue_fault(_metrics(depth=0, restarts=8), ctx) == 0.6
|
| 55 |
+
assert grade_byzantine_queue_fault(_metrics(depth=40, restarts=10), ctx) == 0.01
|
| 56 |
|
| 57 |
|
| 58 |
def test_grade_distributed_lock_starvation_cases() -> None:
|
|
|
|
| 61 |
|
| 62 |
assert (
|
| 63 |
grade_distributed_lock_starvation(_metrics(depth=2, stalls=0), ctx_unlocked)
|
| 64 |
+
== 0.99
|
| 65 |
)
|
| 66 |
assert (
|
| 67 |
grade_distributed_lock_starvation(_metrics(depth=10, stalls=0), ctx_unlocked)
|
|
|
|
| 69 |
)
|
| 70 |
assert (
|
| 71 |
grade_distributed_lock_starvation(_metrics(depth=10, stalls=3), ctx_locked)
|
| 72 |
+
== 0.01
|
| 73 |
)
|
| 74 |
|
| 75 |
|
| 76 |
def test_grade_backpressure_cascade_continuous() -> None:
|
| 77 |
+
assert grade_backpressure_cascade(_metrics(depth=0), {}) == 0.99
|
| 78 |
assert grade_backpressure_cascade(_metrics(depth=100), {}) == 0.5
|
| 79 |
+
assert grade_backpressure_cascade(_metrics(depth=200), {}) == 0.01
|
| 80 |
|
| 81 |
|
| 82 |
def test_grade_route_partition_threshold() -> None:
|
| 83 |
assert (
|
| 84 |
grade_route_partition(_metrics(success_rate=0.96), {"route_blocked": False})
|
| 85 |
+
== 0.99
|
| 86 |
)
|
| 87 |
assert (
|
| 88 |
grade_route_partition(_metrics(success_rate=0.8), {"route_blocked": True})
|
| 89 |
+
== 0.01
|
| 90 |
)
|
| 91 |
|
| 92 |
|
|
|
|
| 95 |
grade_registry_corruption(
|
| 96 |
_metrics(success_rate=0.99), {"registry_auth_matches_default": True}
|
| 97 |
)
|
| 98 |
+
== 0.99
|
| 99 |
)
|
| 100 |
assert (
|
| 101 |
grade_registry_corruption(
|
|
|
|
| 116 |
grade_job_generator_runaway(
|
| 117 |
_metrics(depth=4), {"job_generator_rate_resolved": True}
|
| 118 |
)
|
| 119 |
+
== 0.99
|
| 120 |
)
|
| 121 |
assert (
|
| 122 |
grade_job_generator_runaway(
|