Spaces:

Naseer-010
/

DIME

Configuration error

App Files Files Community

Naseer-010 committed 23 days ago

Commit

54da37b

1 Parent(s): c1ba9ce

added canonical evaluation harness, unified DIME index, deterministic replay guarantees

Browse files

Files changed (17) hide show

agents/__init__.py +8 -0
agents/base_agent.py +18 -0
agents/heuristic_agent.py +79 -0
agents/random_agent.py +50 -0
agents/threshold_agent.py +28 -0
benchmark/__init__.py +5 -0
benchmark/benchmark_config.py +104 -0
benchmark/benchmark_registry.py +105 -0
benchmark/deterministic.py +173 -0
benchmark/dime_index.py +182 -0
benchmark/evaluation_harness.py +399 -0
benchmark/statistical_report.py +95 -0
benchmark/utils.py +126 -0
server/environment.py +49 -7
server/tasks.py +5 -4
server/trace_loader.py +3 -3
train_grpo_unsloth.py +1 -10

agents/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""Baseline agents for the DIME benchmark."""
+from agents.base_agent import BaseAgent
+from agents.heuristic_agent import HeuristicAgent
+from agents.random_agent import RandomAgent
+from agents.threshold_agent import ThresholdAgent
+__all__ = ["BaseAgent", "RandomAgent", "HeuristicAgent", "ThresholdAgent"]

agents/base_agent.py ADDED Viewed

	@@ -0,0 +1,18 @@

+"""Canonical agent interface for DIME evaluation."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import Any
class BaseAgent(ABC):
    """Abstract inference-only contract every benchmark agent must satisfy.

    Subclasses must implement :meth:`act`. :meth:`reset` is an optional hook
    whose default implementation does nothing.
    """

    @abstractmethod
    def act(self, observation: Any) -> Any:
        """Choose and return the action for ``observation``."""
        raise NotImplementedError

    def reset(self, seed: int | None = None, task_id: str | None = None) -> None:
        """Clear per-episode agent state; the base implementation is a no-op."""
        return None

agents/heuristic_agent.py ADDED Viewed

	@@ -0,0 +1,79 @@

+"""Canonical symbolic DIME baseline using the existing triage tree."""
+from __future__ import annotations
+from typing import Any
+from agents.base_agent import BaseAgent
+from benchmark.utils import observation_to_dict
+from server.command_parser import CommandParseError, parse_command
+from server.models import InfraAction
def expected_triage_command(observation: Any) -> str:
    """Return the kubectl command mandated by the DIME triage tree.

    The rules are evaluated in priority order and the first match wins:

    1. any node with memory utilization > 0.92 -> delete that pod
    2. node 0 is failed -> delete pod node-0
    3. ``io_wait`` > 0.80 -> throttle ingress to 0.5
    4. a worker above 0.90 CPU while the worker average is below 0.60 ->
       shift its traffic to the least-loaded healthy worker
    5. ``p99_latency`` > 100 and ``request_rate`` > 150 -> throttle to 0.4
    6. a worker below 0.10 CPU while ``p99_latency`` > 100 -> shift its
       traffic to the first healthy worker above 0.2 CPU
    7. two or more failed nodes -> throttle to 0.3
    8. node 0 CPU > 0.80 -> throttle to 0.7
    9. worker average CPU > 0.75 with ``error_budget`` > 20 -> scale up
    10. otherwise ``"no_op"``

    Scalar metrics fall back to their defaults only when missing or ``None``.
    A legitimate reading of ``0.0`` (notably an exhausted ``error_budget``)
    is honoured and not replaced by the default.

    Args:
        observation: Any observation accepted by ``observation_to_dict``.

    Returns:
        A kubectl-style command string, or the literal ``"no_op"``.
    """
    obs = observation_to_dict(observation)

    def _number(key: str, default: float) -> float:
        # Only a missing/None value falls back; 0.0 is a real measurement.
        value = obs.get(key)
        return default if value is None else float(value)

    cpu = obs.get("cpu_loads")
    if cpu is None:
        cpu = [0.3] * 8
    mem = obs.get("mem_utilizations")
    if mem is None:
        mem = [0.2] * 8
    failed = set(obs.get("failed_nodes") or [])
    io_wait = _number("io_wait", 0.0)
    p99 = _number("p99_latency", 0.0)
    request_rate = _number("request_rate", 100.0)
    error_budget = _number("error_budget", 100.0)

    for idx, memory in enumerate(mem):
        if float(memory) > 0.92:
            return f"kubectl delete pod node-{idx}"
    if 0 in failed:
        return "kubectl delete pod node-0"
    if io_wait > 0.80:
        return "kubectl throttle ingress --rate=0.5"

    # Workers are every node except node 0; negative loads are skipped
    # (presumably a sentinel for unavailable nodes -- TODO confirm).
    workers = [(idx, float(load)) for idx, load in enumerate(cpu[1:], 1) if float(load) >= 0.0]
    avg_worker_cpu = sum(load for _, load in workers) / len(workers) if workers else 0.0

    if workers and avg_worker_cpu < 0.60:
        for idx, load in workers:
            if load > 0.90:
                candidates = [node for node, _ in workers if node != idx and node not in failed]
                if candidates:
                    dst = min(candidates, key=lambda node_idx: float(cpu[node_idx]))
                    return f"kubectl exec -it istio-proxy -- traffic shift --from={idx} --to={dst}"

    if p99 > 100.0 and request_rate > 150.0:
        return "kubectl throttle ingress --rate=0.4"

    if p99 > 100.0:
        for idx, load in workers:
            if load < 0.10:
                dst = next(
                    (
                        node
                        for node, node_load in workers
                        if node_load > 0.2 and node not in failed and node != idx
                    ),
                    None,
                )
                if dst is not None:
                    return f"kubectl exec -it istio-proxy -- traffic shift --from={idx} --to={dst}"

    if len(failed) >= 2:
        return "kubectl throttle ingress --rate=0.3"

    db_cpu = float(cpu[0]) if cpu and float(cpu[0]) >= 0.0 else 0.0
    if db_cpu > 0.80:
        return "kubectl throttle ingress --rate=0.7"

    if workers and avg_worker_cpu > 0.75 and error_budget > 20.0:
        return "kubectl scale deployment frontend --replicas=10"
    return "no_op"
class HeuristicAgent(BaseAgent):
    """Symbolic SRE baseline that follows the triage tree rule by rule."""

    def act(self, observation: Any) -> InfraAction:
        """Translate the triage-tree command for ``observation`` into an action.

        Falls back to a ``no_op`` action when the tree returns ``"no_op"`` or
        when the command cannot be parsed.
        """
        command = expected_triage_command(observation)
        if command != "no_op":
            try:
                return parse_command(command)
            except CommandParseError:
                # Unparseable command: fall through to the no-op below.
                pass
        return InfraAction(action_type="no_op")

agents/random_agent.py ADDED Viewed

	@@ -0,0 +1,50 @@

+"""Seeded random DIME baseline."""
+from __future__ import annotations
+import random
+from typing import Any
+from agents.base_agent import BaseAgent
+from benchmark.utils import observation_to_dict
+from server.models import InfraAction
class RandomAgent(BaseAgent):
    """Baseline that samples management actions uniformly from a seeded RNG."""

    def __init__(self, seed: int = 0) -> None:
        """Remember ``seed`` as the fallback seed and create the RNG."""
        self._base_seed = seed
        self._rng = random.Random(seed)

    def reset(self, seed: int | None = None, task_id: str | None = None) -> None:
        """Re-seed the RNG with ``seed``, or with the constructor seed if ``None``."""
        if seed is None:
            seed = self._base_seed
        self._rng = random.Random(seed)

    def act(self, observation: Any) -> InfraAction:
        """Draw one action type at random and fill in its parameters.

        A ``reroute_traffic`` draw on a single-node observation degrades to
        ``no_op``.
        """
        obs = observation_to_dict(observation)
        loads = obs.get("cpu_loads", []) or [0]
        node_count = max(1, len(loads))
        rng = self._rng

        kind = rng.choice(
            ["restart_node", "reroute_traffic", "throttle", "scale_up", "no_op"]
        )

        if kind == "restart_node":
            failed = list(obs.get("failed_nodes", []) or [])
            # Prefer restarting a node that is actually failed.
            if failed:
                target = int(rng.choice(failed))
            else:
                target = rng.randrange(node_count)
            return InfraAction(action_type="restart_node", target=target)

        if kind == "reroute_traffic" and node_count > 1:
            source = rng.randrange(node_count)
            destinations = [node for node in range(node_count) if node != source]
            destination = rng.choice(destinations)
            return InfraAction(
                action_type="reroute_traffic",
                from_node=source,
                to_node=destination,
            )

        if kind == "throttle":
            rate = rng.choice([0.3, 0.5, 0.7, 0.9])
            return InfraAction(action_type="throttle", rate=rate)

        if kind == "scale_up":
            return InfraAction(action_type="scale_up")

        return InfraAction(action_type="no_op")

agents/threshold_agent.py ADDED Viewed

	@@ -0,0 +1,28 @@

+"""Classical threshold automation baseline for DIME."""
+from __future__ import annotations
+from typing import Any
+from agents.base_agent import BaseAgent
+from benchmark.utils import observation_to_dict
+from server.models import InfraAction
class ThresholdAgent(BaseAgent):
    """Reactive autoscaler-style baseline driven by fixed thresholds."""

    def act(self, observation: Any) -> InfraAction:
        """Return the first action whose threshold is crossed.

        Priority order:

        1. mean of the non-negative CPU loads > 0.80 -> ``scale_up``
        2. ``latency_ms`` > 100 -> ``throttle`` at rate 0.7
        3. any failed node -> ``restart_node`` on the first one listed
        4. otherwise ``no_op``

        A ``cpu_loads`` value of ``None`` is treated like an empty list, the
        same way ``failed_nodes`` and ``latency_ms`` already tolerate ``None``.
        """
        obs = observation_to_dict(observation)

        # `or []` covers both a missing key and an explicit None.
        raw_loads = obs.get("cpu_loads") or []
        cpu_loads = [load for load in map(float, raw_loads) if load >= 0.0]
        avg_cpu = sum(cpu_loads) / len(cpu_loads) if cpu_loads else 0.0
        latency = float(obs.get("latency_ms", 0.0) or 0.0)
        failed_nodes = list(obs.get("failed_nodes", []) or [])

        if avg_cpu > 0.80:
            return InfraAction(action_type="scale_up")
        if latency > 100.0:
            return InfraAction(action_type="throttle", rate=0.7)
        if failed_nodes:
            return InfraAction(action_type="restart_node", target=int(failed_nodes[0]))
        return InfraAction(action_type="no_op")

benchmark/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Canonical DIME benchmark package."""
+from benchmark.benchmark_config import DIME_V1_CONFIG
+__all__ = ["DIME_V1_CONFIG"]

benchmark/benchmark_config.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""Immutable DIME-v1.0 benchmark definition."""
+from __future__ import annotations
+from dataclasses import dataclass
+from types import MappingProxyType
+from typing import Mapping
+from benchmark.benchmark_registry import task_registry_snapshot
@dataclass(frozen=True)
class EvaluationProtocol:
    """Frozen record of how a benchmark version is to be evaluated."""

    # Number of episodes per task.
    episodes_per_task: int
    # Seeds covered by the protocol.
    seeds: tuple[int, ...]
    # Flag: evaluation is inference-only.
    inference_only: bool
    # Flag: online learning is disabled during evaluation.
    disable_online_learning: bool
@dataclass(frozen=True)
class DeterministicPolicy:
    """Frozen record of the seeding and replay requirements."""

    # Names of the inputs that make up an episode seed.
    seed_components: tuple[str, ...]
    # Flag: torch deterministic mode is required.
    torch_deterministic: bool
    # Textual formula describing how trace indices wrap around.
    trace_wraparound: str
    # Flag: replay validation is required.
    replay_validation_required: bool
@dataclass(frozen=True)
class TopologyConstraints:
    """Frozen record of the topology templates DIME-v1.0 allows."""

    # Total number of nodes in the topology.
    node_count: int
    # Index of the database node.
    database_node: int
    # Names of the permitted topology templates.
    templates: tuple[str, ...]
    # Indices of the application nodes.
    app_nodes: tuple[int, ...]
@dataclass(frozen=True)
class BenchmarkConfig:
    """Frozen bundle of every benchmark-critical setting for one version."""

    # Identity of the benchmark.
    benchmark_name: str
    benchmark_version: str
    # Split name -> registry ids of the tasks in that split.
    task_registry: Mapping[str, tuple[str, ...]]
    evaluation_protocol: EvaluationProtocol
    # Metric name -> weight used when combining normalized metrics.
    metric_weights: Mapping[str, float]
    # Normalization parameters, keyed by setting name.
    normalization_method: Mapping[str, object]
    deterministic_policy: DeterministicPolicy
    topology_constraints: TopologyConstraints
+_METRIC_WEIGHTS = MappingProxyType(
+    {
+        "uptime": 0.35,
+        "latency_score": 0.25,
+        "throughput": 0.20,
+        "recovery_speed": 0.10,
+        "cost_efficiency": 0.10,
+    }
+)
+_NORMALIZATION_METHOD = MappingProxyType(
+    {
+        "latency": "auto",
+        "latency_candidates": ("inverse_minmax", "smooth_exponential"),
+        "target_latency_ms": 50.0,
+        "max_latency_ms": 500.0,
+        "latency_scale_ms": 100.0,
+        "max_allowed_recovery_time": 10.0,
+        "max_budget": "episode_initial_cloud_budget",
+        "selection_persistence": "run_config_snapshot",
+    }
+)
# The one canonical configuration object for DIME-v1.0.
DIME_V1_CONFIG = BenchmarkConfig(
    benchmark_name="DIME",
    benchmark_version="DIME-v1.0",
    # Hidden tasks are included; the snapshot is wrapped read-only.
    task_registry=MappingProxyType(dict(task_registry_snapshot(include_hidden=True))),
    metric_weights=_METRIC_WEIGHTS,
    normalization_method=_NORMALIZATION_METHOD,
    evaluation_protocol=EvaluationProtocol(
        episodes_per_task=100,
        seeds=tuple(range(100)),  # seeds 0..99
        inference_only=True,
        disable_online_learning=True,
    ),
    deterministic_policy=DeterministicPolicy(
        seed_components=("seed", "task", "topology_template", "trace_offset"),
        torch_deterministic=True,
        trace_wraparound="(step + trace_offset) % trace_length",
        replay_validation_required=True,
    ),
    topology_constraints=TopologyConstraints(
        node_count=8,
        database_node=0,
        templates=("default", "app_ring", "dense_mesh", "sampled_mesh"),
        app_nodes=tuple(range(1, 8)),  # nodes 1..7
    ),
)

benchmark/benchmark_registry.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""Frozen task split registry for DIME-v1.0."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Iterable, Mapping
class Split(str, Enum):
    """String-valued enum naming the three benchmark splits."""

    TRAIN = "train"
    VALIDATION = "validation"
    HIDDEN_EVAL = "hidden_eval"
@dataclass(frozen=True)
class TaskSpec:
    """Frozen description of a single registered benchmark task."""

    # Unique id of this registry entry.
    registry_id: str
    # Task id passed to the environment as ``task``.
    task_id: str
    split: Split
    curriculum_level: int
    topology_template: str = "default"
    trace_offset: int = 0
    tags: tuple[str, ...] = field(default_factory=tuple)

    @property
    def reset_kwargs(self) -> dict[str, object]:
        """Return a fresh dict of reset keyword arguments for this task."""
        return dict(
            task=self.task_id,
            curriculum_level=self.curriculum_level,
            topology_template=self.topology_template,
            trace_offset=self.trace_offset,
        )
# Tasks that may be used for RL training.
TRAIN_TASKS: tuple[TaskSpec, ...] = (
    TaskSpec(registry_id="train.level_1_read_logs", task_id="level_1_read_logs", split=Split.TRAIN, curriculum_level=1),
    TaskSpec(registry_id="train.traffic_spike", task_id="traffic_spike", split=Split.TRAIN, curriculum_level=2),
    TaskSpec(registry_id="train.node_failure", task_id="node_failure", split=Split.TRAIN, curriculum_level=2),
    TaskSpec(registry_id="train.cascading_failure", task_id="cascading_failure", split=Split.TRAIN, curriculum_level=3),
)

# Public validation tasks; all but the first use a non-zero trace offset.
VALIDATION_TASKS: tuple[TaskSpec, ...] = (
    TaskSpec(registry_id="validation.flash_crowd", task_id="flash_crowd", split=Split.VALIDATION, curriculum_level=4),
    TaskSpec(
        registry_id="validation.thundering_herd",
        task_id="thundering_herd",
        split=Split.VALIDATION,
        curriculum_level=5,
        trace_offset=17,
    ),
    TaskSpec(
        registry_id="validation.zombie_node",
        task_id="zombie_node",
        split=Split.VALIDATION,
        curriculum_level=5,
        trace_offset=41,
    ),
    TaskSpec(
        registry_id="validation.hot_shard_skew",
        task_id="hot_shard_skew",
        split=Split.VALIDATION,
        curriculum_level=5,
        trace_offset=73,
    ),
)

# Hidden evaluation tasks: four on the default topology, then four on
# alternative topology templates, each with its own trace offset.
_HIDDEN_EVAL_TASKS: tuple[TaskSpec, ...] = (
    TaskSpec(
        registry_id="hidden.retry_storm.default.011",
        task_id="retry_storm",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="default",
        trace_offset=11,
        tags=("trace",),
    ),
    TaskSpec(
        registry_id="hidden.black_swan.default.029",
        task_id="black_swan_az_failure",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="default",
        trace_offset=29,
        tags=("trace",),
    ),
    TaskSpec(
        registry_id="hidden.connection_pool.default.053",
        task_id="connection_pool_deadlock",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="default",
        trace_offset=53,
        tags=("trace",),
    ),
    TaskSpec(
        registry_id="hidden.autoscaler.default.089",
        task_id="autoscaler_flapping_trap",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="default",
        trace_offset=89,
        tags=("trace",),
    ),
    TaskSpec(
        registry_id="hidden.retry_storm.ring.137",
        task_id="retry_storm",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="app_ring",
        trace_offset=137,
        tags=("topology_variant", "trace"),
    ),
    TaskSpec(
        registry_id="hidden.black_swan.dense.211",
        task_id="black_swan_az_failure",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="dense_mesh",
        trace_offset=211,
        tags=("topology_variant", "trace"),
    ),
    TaskSpec(
        registry_id="hidden.connection_pool.ring.307",
        task_id="connection_pool_deadlock",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="app_ring",
        trace_offset=307,
        tags=("topology_variant", "trace"),
    ),
    TaskSpec(
        registry_id="hidden.autoscaler.sampled.401",
        task_id="autoscaler_flapping_trap",
        split=Split.HIDDEN_EVAL,
        curriculum_level=5,
        topology_template="sampled_mesh",
        trace_offset=401,
        tags=("topology_variant", "trace"),
    ),
)
def get_training_task_ids() -> tuple[str, ...]:
    """Return the task ids of the training split, in registry order."""
    training_ids = [spec.task_id for spec in TRAIN_TASKS]
    return tuple(training_ids)
def get_public_task_specs(split: Split | str) -> tuple[TaskSpec, ...]:
    """Return the task specs of a public (train or validation) split.

    Raises:
        ValueError: if ``split`` is not a valid ``Split`` value.
        PermissionError: if the hidden evaluation split is requested.
    """
    requested = Split(split)
    public_splits = {
        Split.TRAIN: TRAIN_TASKS,
        Split.VALIDATION: VALIDATION_TASKS,
    }
    if requested not in public_splits:
        raise PermissionError("hidden_eval tasks require the official benchmark harness")
    return public_splits[requested]
def get_benchmark_task_specs(split: Split | str) -> tuple[TaskSpec, ...]:
    """Return the task specs of any split, including the hidden evaluation one.

    Intended for the official evaluation harness; public splits are delegated
    to ``get_public_task_specs``.
    """
    requested = Split(split)
    if requested is not Split.HIDDEN_EVAL:
        return get_public_task_specs(requested)
    return _HIDDEN_EVAL_TASKS
def task_registry_snapshot(include_hidden: bool = True) -> Mapping[str, tuple[str, ...]]:
    """Map each split name to the tuple of registry ids in that split.

    The hidden evaluation split is included only when ``include_hidden`` is
    true. The result is a plain dict; callers wrap it if they need it read-only.
    """
    groups = [(Split.TRAIN, TRAIN_TASKS), (Split.VALIDATION, VALIDATION_TASKS)]
    if include_hidden:
        groups.append((Split.HIDDEN_EVAL, _HIDDEN_EVAL_TASKS))

    snapshot: dict[str, tuple[str, ...]] = {}
    for split, specs in groups:
        snapshot[split.value] = tuple(spec.registry_id for spec in specs)
    return snapshot
def iter_all_specs(include_hidden: bool = True) -> Iterable[TaskSpec]:
    """Yield train specs, then validation specs, then (optionally) hidden specs."""
    groups = [TRAIN_TASKS, VALIDATION_TASKS]
    if include_hidden:
        groups.append(_HIDDEN_EVAL_TASKS)
    for group in groups:
        for spec in group:
            yield spec

benchmark/deterministic.py ADDED Viewed

	@@ -0,0 +1,173 @@

+"""Deterministic replay controls and validation for DIME."""
+from __future__ import annotations
+import argparse
+import random
+from dataclasses import dataclass
+from typing import Any
+import numpy as np
+from agents.base_agent import BaseAgent
+from agents.heuristic_agent import HeuristicAgent
+from benchmark.utils import action_to_dict, observation_to_dict
+from server.environment import DistributedInfraEnvironment
+from server.models import InfraAction
def set_global_seed(seed: int) -> None:
    """Seed Python's ``random``, NumPy and, when importable, torch.

    With torch available, deterministic algorithms are also requested and
    cuDNN is switched to deterministic, non-benchmark mode.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    try:
        import torch

        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

        # The keyword form is tried first; a TypeError means this torch
        # build does not accept `warn_only`.
        try:
            torch.use_deterministic_algorithms(True, warn_only=True)
        except TypeError:
            torch.use_deterministic_algorithms(True)

        if hasattr(torch.backends, "cudnn"):
            cudnn = torch.backends.cudnn
            cudnn.deterministic = True
            cudnn.benchmark = False
    except ImportError:
        # torch is optional; Python and NumPy are already seeded.
        pass
@dataclass(frozen=True)
class ReplayValidationResult:
    """Frozen summary of one deterministic-replay validation run."""

    # Whether the two replays matched.
    passed: bool
    # Episode parameters that were replayed.
    task_id: str
    seed: int
    topology_template: str
    trace_offset: int
    # Length of the recorded trajectory.
    steps: int
+def _reset_agent(agent: Any, seed: int, task_id: str) -> None:
+    reset = getattr(agent, "reset", None)
+    if reset is None:
+        return
+    try:
+        reset(seed=seed, task_id=task_id)
+    except TypeError:
+        reset()
def _coerce_action(action: Any) -> InfraAction:
    """Convert whatever an agent returned into an ``InfraAction``.

    * an ``InfraAction`` is returned unchanged;
    * any mapping (not only ``dict``) is validated into an ``InfraAction``,
      matching the coercion used by the evaluation harness;
    * anything else, or a mapping that fails validation, becomes ``no_op``.
    """
    # Imported locally so this module's import block does not need to change.
    from collections.abc import Mapping

    if isinstance(action, InfraAction):
        return action
    if isinstance(action, Mapping):
        try:
            return InfraAction.model_validate(dict(action))
        except Exception:
            # Deliberate best effort: a malformed action degrades to a
            # no-op rather than aborting the replay.
            return InfraAction(action_type="no_op")
    return InfraAction(action_type="no_op")
def _run_replay(
    agent: BaseAgent,
    *,
    task_id: str,
    seed: int,
    topology_template: str,
    trace_offset: int,
) -> dict[str, Any]:
    """Run one seeded episode and return everything needed to compare replays.

    Returns a dict with the per-step rewards, the simulator's latency
    history, the per-step failed-node lists and the full trajectory.
    """
    set_global_seed(seed)
    _reset_agent(agent, seed, task_id)

    env = DistributedInfraEnvironment()
    observation = env.reset(
        seed=seed,
        task=task_id,
        topology_template=topology_template,
        trace_offset=trace_offset,
    )

    steps: list[dict[str, Any]] = []
    reward_log: list[float] = []
    finished = False
    while not finished:
        action = _coerce_action(agent.act(observation))
        observation = env.step(action)
        snapshot = observation_to_dict(observation)

        reward = float(snapshot.get("reward", 0.0) or 0.0)
        reward_log.append(reward)
        steps.append(
            dict(
                action=action_to_dict(action),
                reward=reward,
                latency_ms=snapshot.get("latency_ms"),
                failed_nodes=snapshot.get("failed_nodes", []),
                step=snapshot.get("step"),
            )
        )

        # Stop on the environment's done flag or at the simulator's step cap.
        finished = bool(snapshot.get("done", False)) or env.sim.step_count >= env.sim.max_steps

    failure_history = [entry["failed_nodes"] for entry in steps]
    return {
        "rewards": reward_log,
        "latency_history": list(env.sim.latency_history),
        "failure_history": failure_history,
        "trajectory": steps,
    }
def validate_replay(
    agent: BaseAgent | None = None,
    task_id: str = "traffic_spike",
    seed: int = 42,
    topology_template: str = "default",
    trace_offset: int = 0,
) -> ReplayValidationResult:
    """Replay the same episode twice and fail if the two runs differ.

    Args:
        agent: Agent to drive both runs; a ``HeuristicAgent`` is used only
            when this is ``None``.
        task_id: Task passed to the environment.
        seed: Seed used for both runs.
        topology_template: Topology template passed to the environment.
        trace_offset: Trace offset passed to the environment.

    Returns:
        A ``ReplayValidationResult`` with ``passed=True`` and the trajectory
        length.

    Raises:
        AssertionError: if the two replays are not identical.
    """
    # `is None`, not truthiness: a caller-supplied agent that happens to be
    # falsy (e.g. one defining __len__ or __bool__) must not be swapped out.
    active_agent = HeuristicAgent() if agent is None else agent

    episode = {
        "task_id": task_id,
        "seed": seed,
        "topology_template": topology_template,
        "trace_offset": trace_offset,
    }
    first = _run_replay(active_agent, **episode)
    second = _run_replay(active_agent, **episode)

    if first != second:
        raise AssertionError(
            "Deterministic replay diverged for "
            f"seed={seed}, task={task_id}, topology={topology_template}, trace_offset={trace_offset}"
        )
    return ReplayValidationResult(passed=True, steps=len(first["trajectory"]), **episode)
def main() -> None:
    """Command-line entry point: parse arguments, validate replay, print result."""
    parser = argparse.ArgumentParser(description="Validate deterministic DIME replay.")
    cli_options = (
        ("--task", {"default": "traffic_spike"}),
        ("--seed", {"type": int, "default": 42}),
        ("--topology-template", {"default": "default"}),
        ("--trace-offset", {"type": int, "default": 0}),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    outcome = validate_replay(
        task_id=args.task,
        seed=args.seed,
        topology_template=args.topology_template,
        trace_offset=args.trace_offset,
    )
    print(outcome)


if __name__ == "__main__":
    main()

benchmark/dime_index.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""Official DIME Index calculation."""
+from __future__ import annotations
+import math
+from collections import defaultdict
+from statistics import mean, variance
+from typing import Any, Iterable, Mapping
+from benchmark.benchmark_config import DIME_V1_CONFIG
+from benchmark.utils import clamp
# Identifiers of the two candidate latency normalization methods.
LATENCY_INVERSE_MINMAX = "inverse_minmax"  # method A
LATENCY_SMOOTH_EXPONENTIAL = "smooth_exponential"  # method B
def latency_inverse_minmax(
    latency_ms: float,
    *,
    target_latency: float = 50.0,
    max_latency: float = 500.0,
) -> float:
    """Method A: score latency by its clamped position between target and max.

    The latency's offset from ``target_latency`` is divided by the
    target-to-max span (floored at 1e-9), passed through ``clamp`` and
    subtracted from 1.0, so lower latency scores higher.
    """
    span = max(max_latency - target_latency, 1e-9)
    excess = (float(latency_ms) - target_latency) / span
    return 1.0 - clamp(excess)
def latency_smooth_exponential(latency_ms: float, *, latency_scale: float = 100.0) -> float:
    """Method B: score latency as a clamped ``exp(-latency / scale)``.

    ``latency_scale`` is floored at 1e-9 to avoid division by zero.
    """
    scale = max(latency_scale, 1e-9)
    decayed = math.exp(-float(latency_ms) / scale)
    return clamp(decayed)
def normalize_latency(
    latency_ms: float,
    method: str,
    config: Mapping[str, Any] | None = None,
) -> float:
    """Score ``latency_ms`` with the named normalization method.

    Parameters are read from ``config``; when it is missing or empty, the
    DIME-v1.0 normalization settings are used instead.

    Raises:
        ValueError: if ``method`` is not one of the two known methods.
    """
    settings = config or DIME_V1_CONFIG.normalization_method

    if method == LATENCY_SMOOTH_EXPONENTIAL:
        scale = float(settings.get("latency_scale_ms", 100.0))
        return latency_smooth_exponential(latency_ms, latency_scale=scale)

    if method == LATENCY_INVERSE_MINMAX:
        target = float(settings.get("target_latency_ms", 50.0))
        ceiling = float(settings.get("max_latency_ms", 500.0))
        return latency_inverse_minmax(latency_ms, target_latency=target, max_latency=ceiling)

    raise ValueError(f"Unknown latency normalization method: {method}")
def normalize_metrics(
    metrics: Mapping[str, Any],
    *,
    latency_method: str,
    config: Mapping[str, Any] | None = None,
) -> dict[str, float]:
    """Turn raw episode metrics into the five normalized DIME components.

    Each metric is read under its primary key with one alternate key as a
    fallback (for example ``mttr`` then ``MTTR``). Falsy values are replaced
    by the metric's default.
    """
    settings = config or DIME_V1_CONFIG.normalization_method

    def _read(primary: str, alternate: str, default: float) -> float:
        # Primary key wins when present; a falsy value falls back to default.
        return float(metrics.get(primary, metrics.get(alternate, default)) or default)

    mttr = _read("mttr", "MTTR", 0.0)
    recovery_window = float(settings.get("max_allowed_recovery_time", 10.0))
    resource_cost = float(metrics.get("resource_cost", 0.0) or 0.0)
    max_budget = _read("max_budget", "initial_cloud_budget", 1.0)

    normalized: dict[str, float] = {}
    normalized["uptime"] = clamp(_read("uptime", "uptime_ratio", 0.0))
    normalized["latency_score"] = normalize_latency(
        _read("p99_latency", "latency_ms", 0.0),
        latency_method,
        settings,
    )
    normalized["throughput"] = clamp(_read("throughput", "throughput_ratio", 0.0))
    # Recovery and cost are "lower is better", so both are inverted.
    normalized["recovery_speed"] = 1.0 - clamp(mttr / max(recovery_window, 1e-9))
    normalized["cost_efficiency"] = 1.0 - clamp(resource_cost / max(max_budget, 1e-9))
    return normalized
def compute_dime_index(
    metrics: Mapping[str, Any],
    config_snapshot: Mapping[str, Any] | None = None,
    *,
    latency_method: str | None = None,
) -> dict[str, float]:
    """Compute the official DIME Index and its normalized metric breakdown.

    Args:
        metrics: Raw episode metrics, as accepted by ``normalize_metrics``.
        config_snapshot: Optional run-config snapshot. The keys
            ``selected_latency_method``, ``normalization_method`` and
            ``metric_weights`` are read from it. Anything that is not a
            mapping is treated as an empty snapshot.
        latency_method: Explicit latency normalization method; overrides the
            snapshot's choice. Defaults to smooth-exponential when neither is
            given.

    Returns:
        A dict with ``dime_index`` (weighted sum, clamped and rounded to six
        decimals) followed by each normalized metric rounded to six decimals.

    Raises:
        KeyError: if the weights lack an entry for a normalized metric.
    """
    # Resolve the snapshot once so every lookup below is safe. Previously
    # `.get` was called before the Mapping check was applied.
    snapshot: Mapping[str, Any] = config_snapshot if isinstance(config_snapshot, Mapping) else {}

    method = latency_method or str(snapshot.get("selected_latency_method") or LATENCY_SMOOTH_EXPONENTIAL)
    normalized = normalize_metrics(
        metrics,
        latency_method=method,
        config=snapshot.get("normalization_method"),
    )

    # A missing or None `metric_weights` entry falls back to the v1.0 weights.
    weights = snapshot.get("metric_weights")
    if weights is None:
        weights = DIME_V1_CONFIG.metric_weights

    score = sum(float(weights[key]) * value for key, value in normalized.items())
    return {"dime_index": round(clamp(score), 6), **{k: round(v, 6) for k, v in normalized.items()}}
+def _rank(values: list[float]) -> list[int]:
+    order = sorted(range(len(values)), key=lambda idx: values[idx])
+    ranks = [0] * len(values)
+    for rank, idx in enumerate(order):
+        ranks[idx] = rank
+    return ranks
+def _pearson(a: list[float], b: list[float]) -> float:
+    if len(a) < 2 or len(b) < 2:
+        return 0.0
+    mean_a = mean(a)
+    mean_b = mean(b)
+    numerator = sum((x - mean_a) * (y - mean_b) for x, y in zip(a, b))
+    denom_a = math.sqrt(sum((x - mean_a) ** 2 for x in a))
+    denom_b = math.sqrt(sum((y - mean_b) ** 2 for y in b))
+    if denom_a == 0.0 or denom_b == 0.0:
+        return 0.0
+    return numerator / (denom_a * denom_b)
def _method_quality(records: list[Mapping[str, Any]], method: str) -> dict[str, float]:
    """Score one latency normalization method on four quality criteria.

    The criteria are rank agreement between scores and task outcomes, low
    score variance, small jumps between latency-adjacent scores, and how well
    per-task means separate. ``aggregate`` is their plain mean, and every
    value is rounded to six decimals.
    """
    latencies = [float(rec.get("p99_latency", rec.get("latency_ms", 0.0)) or 0.0) for rec in records]
    scores = [normalize_latency(latency, method) for latency in latencies]
    outcomes = []
    for rec in records:
        success = float(rec.get("task_success", 0.0) or 0.0)
        task_score = float(rec.get("task_score", 0.0) or 0.0)
        outcomes.append(success + task_score)

    # 1. Rank agreement between latency score and outcome (sign ignored).
    score_ranks = [float(rank) for rank in _rank(scores)]
    outcome_ranks = [float(rank) for rank in _rank(outcomes)]
    rank_consistency = abs(_pearson(score_ranks, outcome_ranks))

    # 2. Lower score variance -> higher stability.
    score_variance = variance(scores) if len(scores) > 1 else 0.0
    variance_stability = 1.0 / (1.0 + score_variance)

    # 3. Largest jump between scores that are neighbours by latency.
    by_latency = sorted(zip(latencies, scores), key=lambda pair: pair[0])
    ordered_scores = [score for _, score in by_latency]
    jumps = [abs(later - earlier) for earlier, later in zip(ordered_scores, ordered_scores[1:])]
    largest_jump = max(jumps) if jumps else 0.0
    smoothness = 1.0 / (1.0 + largest_jump)

    # 4. Between-task variance relative to within-task variance.
    by_task: dict[str, list[float]] = defaultdict(list)
    for rec, score in zip(records, scores):
        task_key = str(rec.get("task_id", rec.get("task", "unknown")))
        by_task[task_key].append(score)
    task_means = [mean(task_scores) for task_scores in by_task.values() if task_scores]
    between = variance(task_means) if len(task_means) > 1 else 0.0
    within_values = [variance(task_scores) for task_scores in by_task.values() if len(task_scores) > 1]
    within = mean(within_values) if within_values else 0.0
    separability = clamp(between / (between + within + 1e-9))

    aggregate = mean([rank_consistency, variance_stability, smoothness, separability])
    return {
        "ranking_consistency": round(rank_consistency, 6),
        "variance_stability": round(variance_stability, 6),
        "score_smoothness": round(smoothness, 6),
        "task_separability": round(separability, 6),
        "aggregate": round(aggregate, 6),
    }
def select_latency_normalization(records: Iterable[Mapping[str, Any]]) -> dict[str, Any]:
    """Score both latency normalization candidates and pick the better one.

    The winner has the highest ``aggregate``, with ``score_smoothness`` as the
    tie-breaker. With no records, smooth-exponential is selected and both
    aggregates are reported as 0.0.
    """
    data = list(records)
    candidates = (LATENCY_INVERSE_MINMAX, LATENCY_SMOOTH_EXPONENTIAL)

    if not data:
        placeholder_scores = {name: {"aggregate": 0.0} for name in candidates}
        return {
            "selected_method": LATENCY_SMOOTH_EXPONENTIAL,
            "method_scores": placeholder_scores,
        }

    method_scores = {name: _method_quality(data, name) for name in candidates}

    def _preference(name: str) -> tuple[float, float]:
        quality = method_scores[name]
        return (quality["aggregate"], quality.get("score_smoothness", 0.0))

    selected = max(method_scores, key=_preference)
    return {"selected_method": selected, "method_scores": method_scores}

benchmark/evaluation_harness.py ADDED Viewed

	@@ -0,0 +1,399 @@

+"""Official DIME-v1.0 benchmark evaluation harness."""
+from __future__ import annotations
+import argparse
+import contextlib
+import time
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping
+import requests
+from agents.base_agent import BaseAgent
+from agents.heuristic_agent import HeuristicAgent
+from agents.random_agent import RandomAgent
+from agents.threshold_agent import ThresholdAgent
+from benchmark.benchmark_config import BenchmarkConfig, DIME_V1_CONFIG
+from benchmark.benchmark_registry import Split, TaskSpec, get_benchmark_task_specs
+from benchmark.deterministic import set_global_seed
+from benchmark.dime_index import compute_dime_index, select_latency_normalization
+from benchmark.statistical_report import build_statistical_report, persist_statistical_report
+from benchmark.utils import (
+    BENCHMARK_RUNS_DIR,
+    SEED_LOGS_DIR,
+    STATISTICAL_REPORTS_DIR,
+    action_to_dict,
+    append_jsonl,
+    atomic_write_json,
+    ensure_result_dirs,
+    observation_to_dict,
+    to_plain_data,
+    utc_run_id,
+    write_csv,
+)
+from server.environment import DistributedInfraEnvironment
+from server.models import InfraAction
class CallableAgent(BaseAgent):
    """Wrap a plain ``observation -> action`` callable as a ``BaseAgent``."""

    def __init__(self, fn: Callable[[Any], Any]) -> None:
        """Store the callable that will produce actions."""
        self._fn = fn

    def act(self, observation: Any) -> Any:
        """Delegate to the wrapped callable and return its result unchanged."""
        policy = self._fn
        return policy(observation)
class APIAgent(BaseAgent):
    """Agent that obtains each action by POSTing the observation to an HTTP endpoint."""

    def __init__(self, endpoint: str, timeout_s: float = 30.0) -> None:
        """Remember the endpoint URL and the per-request timeout in seconds."""
        self.endpoint = endpoint
        self.timeout_s = timeout_s

    def act(self, observation: Any) -> Any:
        """POST ``{"observation": ...}`` as JSON and return the reply's action.

        Non-2xx responses raise via ``raise_for_status``. The ``action`` field
        of the JSON reply is returned if present, otherwise the whole reply.
        """
        body = {"observation": observation_to_dict(observation)}
        response = requests.post(self.endpoint, json=body, timeout=self.timeout_s)
        response.raise_for_status()
        payload = response.json()
        return payload.get("action", payload)
class ReplayAgent(BaseAgent):
    """Agent that plays back a fixed list of recorded actions in order."""

    def __init__(self, actions: Iterable[Mapping[str, Any]]) -> None:
        """Copy each recorded action into a private list and start at index 0."""
        self._actions = [dict(recorded) for recorded in actions]
        self._idx = 0

    def reset(self, seed: int | None = None, task_id: str | None = None) -> None:
        """Rewind playback to the first recorded action."""
        self._idx = 0

    def act(self, observation: Any) -> Any:
        """Return the next recorded action, or a ``no_op`` once they run out."""
        position = self._idx
        if position < len(self._actions):
            self._idx = position + 1
            return self._actions[position]
        return InfraAction(action_type="no_op")
def _resolve_agent(agent: str | BaseAgent | Callable[[Any], Any]) -> BaseAgent:
    """Turn an agent specifier into a ``BaseAgent`` instance.

    Accepted specifiers: a ``BaseAgent`` (returned as is), a callable
    (wrapped in ``CallableAgent``), one of the names ``"random"``,
    ``"heuristic"`` or ``"threshold"``, or a string starting with ``"http"``
    (wrapped in ``APIAgent``).

    Raises:
        ValueError: for any other specifier.
    """
    if isinstance(agent, BaseAgent):
        return agent

    if not isinstance(agent, str):
        if callable(agent):
            return CallableAgent(agent)
    else:
        builtin_agents = {
            "random": RandomAgent,
            "heuristic": HeuristicAgent,
            "threshold": ThresholdAgent,
        }
        factory = builtin_agents.get(agent)
        if factory is not None:
            return factory()
        if agent.startswith("http"):
            return APIAgent(agent)

    raise ValueError(f"Unknown agent specifier: {agent!r}")
def _coerce_action(action: Any) -> InfraAction:
    """Convert an agent's return value into an ``InfraAction``.

    ``InfraAction`` instances pass through, mappings are validated, and
    anything else (or a mapping that fails validation) becomes ``no_op``.
    """
    if isinstance(action, InfraAction):
        return action
    if not isinstance(action, Mapping):
        return InfraAction(action_type="no_op")
    try:
        return InfraAction.model_validate(dict(action))
    except Exception:
        # Best effort: an invalid action degrades to a no-op.
        return InfraAction(action_type="no_op")
+def _reset_agent(agent: BaseAgent, seed: int, task_id: str) -> None:
+    try:
+        agent.reset(seed=seed, task_id=task_id)
+    except TypeError:
+        agent.reset()
+@contextlib.contextmanager
+def _inference_only(agent: BaseAgent):
+    """Block common online-learning mutation entrypoints during evaluation."""
+    patched: list[tuple[Any, str, Any]] = []
+    def disabled(*args: Any, **kwargs: Any) -> None:
+        raise RuntimeError("DIME benchmark evaluation is inference-only")
+    names = (
+        "backward",
+        "learn",
+        "optimize",
+        "optimizer_step",
+        "policy_update",
+        "rollout",
+        "train_step",
+        "update",
+        "update_policy",
+    )
+    for name in names:
+        if hasattr(agent, name):
+            patched.append((agent, name, getattr(agent, name)))
+            setattr(agent, name, disabled)
+    for owner_name in ("optimizer", "optim", "replay_buffer"):
+        owner = getattr(agent, owner_name, None)
+        if owner is None:
+            continue
+        for name in ("step", "add", "append", "extend", "push", "update"):
+            if hasattr(owner, name):
+                patched.append((owner, name, getattr(owner, name)))
+                setattr(owner, name, disabled)
+    if hasattr(agent, "eval"):
+        try:
+            agent.eval()
+        except TypeError:
+            pass
+    if hasattr(agent, "train"):
+        try:
+            agent.train(False)
+        except TypeError:
+            pass
+    try:
+        try:
+            import torch
+            with torch.inference_mode():
+                yield
+        except ImportError:
+            yield
+    finally:
+        for owner, name, original in reversed(patched):
+            setattr(owner, name, original)
+def _percentile(values: list[float], pct: float) -> float:
+    if not values:
+        return 0.0
+    ordered = sorted(float(v) for v in values)
+    idx = min(len(ordered) - 1, max(0, int(round((pct / 100.0) * (len(ordered) - 1)))))
+    return ordered[idx]
+def _mttr(uptime_history: list[float]) -> float:
+    durations: list[int] = []
+    current = 0
+    for uptime in uptime_history:
+        if uptime < 1.0:
+            current += 1
+        elif current:
+            durations.append(current)
+            current = 0
+    if current:
+        durations.append(current)
+    return sum(durations) / len(durations) if durations else 0.0
def _episode_metrics(
    env: DistributedInfraEnvironment,
    rewards: list[float],
    task_score: float,
    initial_cloud_budget: int,
) -> dict[str, float]:
    """Collect the per-episode metrics read from the finished simulation.

    Args:
        env: Environment whose ``sim`` state is read after the episode.
        rewards: Step rewards gathered during the episode.
        task_score: Last task score reported by the environment.
        initial_cloud_budget: ``cloud_budget`` captured right after reset.

    Returns:
        A flat mapping of metric name to value. ``throughput`` and
        ``throughput_ratio`` carry the same served/received ratio, and
        ``task_success`` is ``1.0`` once ``task_score`` reaches ``0.8``.
    """
    sim = env.sim
    uptime_samples = sim.uptime_history
    mean_uptime = sum(uptime_samples) / len(uptime_samples) if uptime_samples else 0.0
    # max(1, ...) guards the ratios against an empty request count / node list.
    served_ratio = sim.total_requests_served / max(1, sim.total_requests_received)
    alive_nodes = sum(1 for node in sim.nodes if not node.is_failed)
    node_total = max(1, len(sim.nodes))
    budget_spent = max(0, initial_cloud_budget - sim.cloud_budget)
    succeeded = task_score >= 0.8

    metrics: dict[str, float] = {}
    metrics["uptime"] = mean_uptime
    metrics["p99_latency"] = _percentile(sim.latency_history, 99.0)
    metrics["throughput"] = served_ratio
    metrics["throughput_ratio"] = served_ratio
    metrics["mttr"] = _mttr(sim.uptime_history)
    metrics["resource_cost"] = float(budget_spent)
    metrics["max_budget"] = float(max(1, initial_cloud_budget))
    metrics["initial_cloud_budget"] = float(initial_cloud_budget)
    metrics["survival_rate"] = alive_nodes / node_total
    metrics["cumulative_reward"] = sum(rewards)
    metrics["task_success"] = 1.0 if succeeded else 0.0
    metrics["task_score"] = task_score
    return metrics
def _config_snapshot(config: BenchmarkConfig, selected_latency_method: str | None = None) -> dict[str, Any]:
    """Return ``config`` as plain data, optionally tagged with the latency method.

    When ``selected_latency_method`` is given it is stored in the snapshot
    under the ``"selected_latency_method"`` key.
    """
    snapshot = to_plain_data(config)
    if selected_latency_method is None:
        return snapshot
    snapshot["selected_latency_method"] = selected_latency_method
    return snapshot
def _run_episode(
    agent: BaseAgent,
    spec: TaskSpec,
    seed: int,
    *,
    run_dir: Path,
) -> dict[str, Any]:
    """Run one seeded episode of ``spec`` and persist its artifacts.

    The global seed is set and the agent reset before a fresh environment is
    created and reset with the task's ``reset_kwargs``. The agent is then
    stepped until the observation reports ``done`` or the simulation reaches
    ``max_steps``.

    Artifacts written:
        * the full trajectory as JSONL under
          ``run_dir/trajectories/<registry_id>/seed_NNN.jsonl``;
        * the returned metrics row as JSON in ``SEED_LOGS_DIR``.

    Returns:
        The metrics row: identifying fields, step count, elapsed wall time,
        the trajectory path, and everything from ``_episode_metrics``.
    """
    set_global_seed(seed)
    _reset_agent(agent, seed=seed, task_id=spec.task_id)
    env = DistributedInfraEnvironment()
    obs = env.reset(seed=seed, episode_id=f"{spec.registry_id}:{seed}", **spec.reset_kwargs)
    # Captured immediately after reset so the spend can be derived afterwards.
    initial_cloud_budget = env.sim.cloud_budget

    reset_record: dict[str, Any] = {
        "event": "reset",
        "seed": seed,
        "task_id": spec.task_id,
        "registry_id": spec.registry_id,
        "observation": observation_to_dict(obs),
    }
    trajectory: list[dict[str, Any]] = [reset_record]
    rewards: list[float] = []
    task_score = float(getattr(obs, "task_score", 0.0) or 0.0)

    started_at = time.perf_counter()
    finished = False
    while not finished:
        action = _coerce_action(agent.act(obs))
        obs = env.step(action)
        obs_dict = observation_to_dict(obs)
        reward = float(obs_dict.get("reward", 0.0) or 0.0)
        rewards.append(reward)
        # A missing or falsy score keeps the previously seen value.
        task_score = float(obs_dict.get("task_score", task_score) or task_score)
        done = bool(obs_dict.get("done", False))
        trajectory.append(
            {
                "event": "step",
                "step": obs_dict.get("step"),
                "action": action_to_dict(action),
                "reward": reward,
                "done": done,
                "task_score": task_score,
                "observation": obs_dict,
            }
        )
        finished = done or env.sim.step_count >= env.sim.max_steps
    elapsed_s = time.perf_counter() - started_at

    trajectory_path = run_dir / "trajectories" / spec.registry_id / f"seed_{seed:03d}.jsonl"
    append_jsonl(trajectory_path, trajectory)
    seed_log_path = SEED_LOGS_DIR / f"{run_dir.name}_{spec.registry_id}_seed_{seed:03d}.json"

    metrics = _episode_metrics(env, rewards, task_score, initial_cloud_budget)
    row: dict[str, Any] = {
        "benchmark_version": DIME_V1_CONFIG.benchmark_version,
        "registry_id": spec.registry_id,
        "task_id": spec.task_id,
        "split": spec.split.value,
        "seed": seed,
        "topology_template": spec.topology_template,
        "trace_offset": spec.trace_offset,
        "steps": env.sim.step_count,
        "elapsed_s": round(elapsed_s, 6),
        "trajectory_path": str(trajectory_path),
    }
    row.update(metrics)
    atomic_write_json(seed_log_path, row)
    return row
def run_benchmark(
    agent: str | BaseAgent | Callable[[Any], Any],
    benchmark_version: str = "DIME-v1.0",
    split: str = "hidden_eval",
) -> dict[str, Any]:
    """Run the official DIME benchmark and persist all artifacts.

    Every task spec of ``split`` is evaluated once per configured seed under
    the inference-only guard. After all episodes finish, the latency
    normalization method is selected, each row is scored with the DIME
    index, and the summary, config snapshots, per-episode metrics (JSON and
    CSV) and statistical report are written to disk.

    Args:
        agent: Agent instance, callable, built-in agent name, or HTTP endpoint.
        benchmark_version: Must equal the version of ``DIME_V1_CONFIG``.
        split: Value of a ``Split`` member.

    Returns:
        ``{"summary": ..., "report": ..., "run_dir": ...}``.

    Raises:
        ValueError: if ``benchmark_version`` is not supported.
        RuntimeError: if the configured seed count differs from
            ``episodes_per_task``.
    """
    if benchmark_version != DIME_V1_CONFIG.benchmark_version:
        raise ValueError(f"Unsupported benchmark version: {benchmark_version}")
    ensure_result_dirs()
    active_agent = _resolve_agent(agent)
    split_value = Split(split)
    specs = get_benchmark_task_specs(split_value)
    config = DIME_V1_CONFIG
    protocol = config.evaluation_protocol
    seeds = protocol.seeds
    if len(seeds) != protocol.episodes_per_task:
        raise RuntimeError("DIME-v1.0 requires exactly 100 seeds for 100 episodes per task")

    run_id = utc_run_id(f"{benchmark_version}_{split_value.value}")
    run_dir = BENCHMARK_RUNS_DIR / run_id
    run_dir.mkdir(parents=True, exist_ok=True)
    # Snapshot the config before any episode runs.
    atomic_write_json(run_dir / "benchmark_config.initial.json", _config_snapshot(config))

    with _inference_only(active_agent):
        episode_rows: list[dict[str, Any]] = [
            _run_episode(active_agent, spec, seed, run_dir=run_dir)
            for spec in specs
            for seed in seeds
        ]

    # The latency normalization is chosen from the collected rows, then scored.
    latency_selection = select_latency_normalization(episode_rows)
    selected_method = latency_selection["selected_method"]
    final_config_snapshot = _config_snapshot(config, selected_method)
    final_config_snapshot["latency_method_selection"] = latency_selection
    scored_rows: list[dict[str, Any]] = [
        {**row, **compute_dime_index(row, final_config_snapshot)} for row in episode_rows
    ]
    report = build_statistical_report(scored_rows)

    summary = {
        "run_id": run_id,
        "benchmark_version": benchmark_version,
        "split": split_value.value,
        "episodes_per_task": config.evaluation_protocol.episodes_per_task,
        "num_tasks": len(specs),
        "num_episodes": len(scored_rows),
        "selected_latency_method": selected_method,
        "latency_method_selection": latency_selection,
        "mean_dime_index": report["episodes"]["dime_index"]["mean"],
        "artifact_dir": str(run_dir),
    }
    csv_columns = [
        "benchmark_version",
        "registry_id",
        "task_id",
        "split",
        "seed",
        "topology_template",
        "trace_offset",
        "steps",
        "dime_index",
        "uptime",
        "latency_score",
        "throughput",
        "recovery_speed",
        "cost_efficiency",
        "p99_latency",
        "mttr",
        "resource_cost",
        "cumulative_reward",
        "task_success",
        "survival_rate",
        "task_score",
    ]

    atomic_write_json(run_dir / "benchmark_config.snapshot.json", final_config_snapshot)
    atomic_write_json(run_dir / "benchmark_summary.json", summary)
    atomic_write_json(run_dir / "episode_metrics.json", scored_rows)
    write_csv(run_dir / "episode_metrics.csv", scored_rows, csv_columns)
    persist_statistical_report(
        report,
        STATISTICAL_REPORTS_DIR / f"{run_id}.json",
        STATISTICAL_REPORTS_DIR / f"{run_id}.csv",
    )
    atomic_write_json(run_dir / "statistical_report.json", report)
    return {"summary": summary, "report": report, "run_dir": str(run_dir)}
def main() -> None:
    """Command-line entry point: parse options, run the benchmark, print the summary."""
    parser = argparse.ArgumentParser(description="Run the official DIME-v1.0 benchmark.")
    split_choices = [member.value for member in Split]
    parser.add_argument("--agent", default="heuristic", help="random, heuristic, threshold, or an HTTP endpoint")
    parser.add_argument("--split", default="hidden_eval", choices=split_choices)
    parser.add_argument("--benchmark-version", default="DIME-v1.0")
    cli = parser.parse_args()
    outcome = run_benchmark(cli.agent, benchmark_version=cli.benchmark_version, split=cli.split)
    print(outcome["summary"])


if __name__ == "__main__":
    main()

benchmark/statistical_report.py ADDED Viewed

	@@ -0,0 +1,95 @@

+"""Statistical reporting for DIME benchmark runs."""
+from __future__ import annotations
+import math
+from collections import defaultdict
+from pathlib import Path
+from statistics import mean, variance
+from typing import Any, Iterable, Mapping
+from benchmark.utils import atomic_write_json, write_csv
# Metric columns summarized by default in every statistical report; rows that
# lack one of these keys are simply left out of that metric's summary.
DEFAULT_REPORT_METRICS = (
    "dime_index",
    "uptime",
    "latency_score",
    "throughput",
    "recovery_speed",
    "cost_efficiency",
    "p99_latency",
    "mttr",
    "cumulative_reward",
    "task_success",
    "survival_rate",
)
def summarize_values(values: Iterable[float]) -> dict[str, float]:
    """Summarize ``values`` as count, mean, spread, range, and a 95% interval.

    The variance is the sample variance from ``statistics.variance``. The
    interval is the normal approximation ``mean ± 1.96 * std / sqrt(n)``.
    With fewer than two values the spread and interval width are zero, and
    an empty input produces an all-zero summary. Every statistic except
    ``n`` is rounded to six decimals.
    """
    data = [float(v) for v in values]
    count = len(data)
    if count == 0:
        zero_stats = ("mean", "std", "variance", "min", "max", "ci95_low", "ci95_high")
        return {"n": 0, **dict.fromkeys(zero_stats, 0.0)}

    mu = mean(data)
    if count > 1:
        var = variance(data)
        std = math.sqrt(var)
        half_width = 1.96 * std / math.sqrt(count)
    else:
        var = 0.0
        std = math.sqrt(var)
        half_width = 0.0

    summary: dict[str, float] = {"n": count}
    statistics_in_order = (
        ("mean", mu),
        ("std", std),
        ("variance", var),
        ("min", min(data)),
        ("max", max(data)),
        ("ci95_low", mu - half_width),
        ("ci95_high", mu + half_width),
    )
    for label, number in statistics_in_order:
        summary[label] = round(number, 6)
    return summary
def _summarize_records(records: list[Mapping[str, Any]], metrics: tuple[str, ...]) -> dict[str, dict[str, float]]:
    """Summarize each metric over ``records``.

    Rows where a metric is absent or ``None`` are left out of that metric's
    summary only; they still contribute to the other metrics.
    """
    return {
        metric: summarize_values(
            float(row[metric])
            for row in records
            if metric in row and row[metric] is not None
        )
        for metric in metrics
    }
def build_statistical_report(
    records: Iterable[Mapping[str, Any]],
    *,
    metrics: tuple[str, ...] = DEFAULT_REPORT_METRICS,
) -> dict[str, Any]:
    """Summarize ``records`` overall, per task, and per seed.

    Rows are grouped by ``task_id`` (falling back to ``task``, then
    ``"unknown"``) and by ``seed`` (falling back to ``"unknown"``). Group
    keys are converted to strings and emitted in sorted string order.

    Returns:
        A mapping with ``"episodes"`` (all rows), ``"tasks"`` and ``"seeds"``.
    """
    data = list(records)
    by_task: dict[str, list[Mapping[str, Any]]] = defaultdict(list)
    by_seed: dict[str, list[Mapping[str, Any]]] = defaultdict(list)
    for row in data:
        task_key = str(row.get("task_id", row.get("task", "unknown")))
        seed_key = str(row.get("seed", "unknown"))
        by_task[task_key].append(row)
        by_seed[seed_key].append(row)

    def summarize_groups(groups: Mapping[str, list[Mapping[str, Any]]]) -> dict[str, Any]:
        return {key: _summarize_records(groups[key], metrics) for key in sorted(groups)}

    report: dict[str, Any] = {"episodes": _summarize_records(data, metrics)}
    report["tasks"] = summarize_groups(by_task)
    report["seeds"] = summarize_groups(by_seed)
    return report
def persist_statistical_report(report: Mapping[str, Any], json_path: Path, csv_path: Path) -> None:
    """Write ``report`` as JSON and as a long-form CSV.

    The CSV holds one row per (group, key, metric). The flat ``"episodes"``
    group is written with the key ``"all"``; every other group contributes
    one key per entry it contains.
    """
    atomic_write_json(json_path, report)
    rows: list[dict[str, Any]] = []
    for group, group_payload in report.items():
        # Wrap the flat overall group so both shapes share the same loop.
        keyed_payload = {"all": group_payload} if group == "episodes" else group_payload
        for key, metric_payload in keyed_payload.items():
            for metric, stats in metric_payload.items():
                rows.append({"group": group, "key": key, "metric": metric, **stats})
    columns = ["group", "key", "metric", "n", "mean", "std", "variance", "min", "max", "ci95_low", "ci95_high"]
    write_csv(csv_path, rows, columns)

benchmark/utils.py ADDED Viewed

	@@ -0,0 +1,126 @@

+"""Utility helpers for canonical DIME benchmark runs."""
+from __future__ import annotations
+import json
+import os
+import tempfile
+from dataclasses import fields, is_dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Iterable, Mapping
# Artifact locations, all derived from the directory one level above the
# folder that contains this module.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
RESULTS_ROOT = PROJECT_ROOT.joinpath("results")
BENCHMARK_RUNS_DIR = RESULTS_ROOT.joinpath("benchmark_runs")
SEED_LOGS_DIR = RESULTS_ROOT.joinpath("seed_logs")
STATISTICAL_REPORTS_DIR = RESULTS_ROOT.joinpath("statistical_reports")
def clamp(value: float, lower: float = 0.0, upper: float = 1.0) -> float:
    """Clip ``value`` (converted with ``float``) into ``[lower, upper]``."""
    capped = min(upper, float(value))
    return max(lower, capped)
def ensure_result_dirs() -> None:
    """Make sure the three benchmark artifact directories exist."""
    artifact_dirs = (BENCHMARK_RUNS_DIR, SEED_LOGS_DIR, STATISTICAL_REPORTS_DIR)
    for directory in artifact_dirs:
        directory.mkdir(parents=True, exist_ok=True)
def utc_run_id(prefix: str = "dime") -> str:
    """Build a run identifier of the form ``<prefix>_YYYYMMDDTHHMMSSZ`` in UTC.

    The timestamp has one-second resolution, so two calls with the same
    prefix inside the same second return the same identifier.
    """
    now_utc = datetime.now(tz=timezone.utc)
    return f"{prefix}_{now_utc:%Y%m%dT%H%M%SZ}"
def to_plain_data(value: Any) -> Any:
    """Convert dataclasses, Pydantic models, paths, and tuples to JSON data.

    Conversion rules, applied recursively:

    * dataclass *instances* become ``{field name: converted value}``;
    * objects exposing ``model_dump`` are dumped, then converted;
    * mappings become dicts with string keys;
    * lists and tuples become lists;
    * ``Path`` objects become strings;
    * everything else is returned unchanged.

    Classes are never expanded. ``is_dataclass`` and the ``model_dump``
    check are also true for the types themselves, and treating a type like
    an instance fails on fields without defaults or on the unbound
    ``model_dump`` call.
    """
    is_instance = not isinstance(value, type)
    if is_instance and is_dataclass(value):
        return {field.name: to_plain_data(getattr(value, field.name)) for field in fields(value)}
    if is_instance and hasattr(value, "model_dump"):
        return to_plain_data(value.model_dump())
    if isinstance(value, Mapping):
        return {str(k): to_plain_data(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [to_plain_data(v) for v in value]
    if isinstance(value, Path):
        return str(value)
    return value
def atomic_write_json(path: Path, payload: Any) -> None:
    """Atomically write JSON so interrupted runs do not corrupt artifacts.

    The payload is converted with ``to_plain_data``, serialized (indented,
    keys sorted, trailing newline) into a temporary file created next to
    ``path``, and then moved over ``path`` with ``os.replace``.

    On any failure, including ``KeyboardInterrupt``, the temporary file is
    removed on a best-effort basis and the original exception is re-raised.
    A cleanup error never replaces the error that caused the failure.

    Args:
        path: Destination file; parent directories are created as needed.
        payload: Data to serialize.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_name = tempfile.mkstemp(prefix=path.name, dir=str(path.parent))
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            json.dump(to_plain_data(payload), fh, indent=2, sort_keys=True)
            fh.write("\n")
        os.replace(tmp_name, path)
    except BaseException:
        # BaseException so an interrupt does not leave a stray temp file.
        try:
            os.unlink(tmp_name)
        except OSError:
            # The temp file may already be gone; keep the original error.
            pass
        raise
def append_jsonl(path: Path, records: Iterable[Mapping[str, Any]]) -> None:
    """Append each record to ``path`` as one sorted-key JSON line.

    Parent directories are created when missing; an existing file is
    extended, not overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as fh:
        fh.writelines(
            json.dumps(to_plain_data(record), sort_keys=True) + "\n"
            for record in records
        )
def write_csv(path: Path, rows: Iterable[Mapping[str, Any]], fieldnames: list[str]) -> None:
    """Write ``rows`` to ``path`` as CSV using only the standard library.

    The file is overwritten. Only the columns in ``fieldnames`` are emitted
    and extra keys in a row are ignored. Each cell is passed through
    ``to_plain_data`` before being written.
    """
    import csv

    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(
            {column: to_plain_data(cell) for column, cell in row.items()}
            for row in rows
        )
def observation_to_dict(observation: Any) -> dict[str, Any]:
    """Return ``observation`` as a plain dict.

    Mappings are shallow-copied and objects exposing ``model_dump`` are
    dumped through it. For any other object, the known observation
    attributes that are present are copied one by one.
    """
    if isinstance(observation, Mapping):
        return dict(observation)
    if hasattr(observation, "model_dump"):
        return observation.model_dump()

    known_fields = (
        "cpu_loads",
        "mem_utilizations",
        "queue_lengths",
        "failed_nodes",
        "latency_ms",
        "request_rate",
        "io_wait",
        "p99_latency",
        "error_budget",
        "step",
        "task_hint",
        "task_score",
        "done",
        "reward",
        "cloud_budget",
        "action_errors",
    )
    extracted: dict[str, Any] = {}
    for field_name in known_fields:
        if hasattr(observation, field_name):
            extracted[field_name] = getattr(observation, field_name)
    return extracted
def action_to_dict(action: Any) -> dict[str, Any]:
    """Return ``action`` as a plain dict.

    Mappings are shallow-copied and objects exposing ``model_dump`` are
    dumped with ``exclude_none=True``. For any other object, the known
    action attributes that are present and not ``None`` are copied.
    """
    if isinstance(action, Mapping):
        return dict(action)
    if hasattr(action, "model_dump"):
        return action.model_dump(exclude_none=True)

    known_fields = ("action_type", "target", "from_node", "to_node", "rate", "raw_command")
    extracted: dict[str, Any] = {}
    for field_name in known_fields:
        if not hasattr(action, field_name):
            continue
        if getattr(action, field_name) is None:
            continue
        extracted[field_name] = getattr(action, field_name)
    return extracted

server/environment.py CHANGED Viewed

@@ -102,10 +102,13 @@ class SimulationState:
     # --- Trace replay ---
     trace_replay: Any = None  # Optional[TraceReplay]
     last_trace_p99_latency: float = 0.0
     last_trace_node_0_io: float = 0.0
     scenario: str = ""  # task-specific chaos scenario overlay
     _black_swan_applied: bool = False
     # --- Throughput tracking (anti-exploit) ---
     total_requests_received: int = 0
@@ -122,15 +125,20 @@ class SimulationState:
 # ---------------------------------------------------------------------------
-def _build_default_graph(n: int = 8) -> Tuple[List[Node], Dict[int, List[int]]]:
-    """Create a default graph with node roles: node 0 = Database, rest = App Servers."""
     nodes = []
     for i in range(n):
         if i == 0:
             # Database node: higher capacity, single point of failure
             nodes.append(
                 Node(
-                    cpu_util=0.20 + random.uniform(-0.03, 0.03),
                     capacity=25,
                     role="database",
                 )
@@ -138,7 +146,7 @@ def _build_default_graph(n: int = 8) -> Tuple[List[Node], Dict[int, List[int]]]:
         else:
             nodes.append(
                 Node(
-                    cpu_util=0.25 + random.uniform(-0.05, 0.05),
                     capacity=15,
                     role="app_server",
                 )
@@ -150,17 +158,41 @@ def _build_default_graph(n: int = 8) -> Tuple[List[Node], Dict[int, List[int]]]:
     for i in range(1, n):
         adjacency[0].append(i)
         adjacency[i].append(0)
-    # App servers: ring + skip connections among themselves
     for i in range(1, n):
         right = 1 + (i % (n - 1))  # wrap within app server range
         if right not in adjacency[i]:
             adjacency[i].append(right)
             adjacency[right].append(i)
         skip = 1 + ((i + 1) % (n - 1))
         if skip not in adjacency[i] and skip != right:
             adjacency[i].append(skip)
             adjacency[skip].append(i)
     return nodes, adjacency
@@ -218,6 +250,9 @@ class DistributedInfraEnvironment(Environment):
         task_id = kwargs.get("task", kwargs.get("task_id", "traffic_spike"))
         curriculum_level = int(kwargs.get("curriculum_level", 0))
         # Auto-detect curriculum level from task_id if not explicitly given
         if curriculum_level == 0:
@@ -233,7 +268,11 @@ class DistributedInfraEnvironment(Environment):
             }
             curriculum_level = _level_map.get(task_id, 2)
-        nodes, adjacency = _build_default_graph(8)
         self._sim = SimulationState(
             nodes=nodes,
             adjacency=adjacency,
@@ -250,6 +289,9 @@ class DistributedInfraEnvironment(Environment):
             curriculum_level=curriculum_level,
             cloud_budget=max(5, 15 - curriculum_level * 2),  # harder = tighter budget
             error_budget=100.0,
         )
         # Apply task-specific setup
@@ -488,7 +530,7 @@ class DistributedInfraEnvironment(Environment):
         # --- Trace replay: override request rate from real data ---
         if sim.trace_replay is not None:
-            trace_step = sim.trace_replay.get_step(sim.step_count)
             sim.current_request_rate = trace_step.request_rate
             sim.last_trace_p99_latency = float(
                 getattr(trace_step, "p99_latency", 0.0) or 0.0

     # --- Trace replay ---
     trace_replay: Any = None  # Optional[TraceReplay]
+    trace_offset: int = 0
+    trace_offset_locked: bool = False
     last_trace_p99_latency: float = 0.0
     last_trace_node_0_io: float = 0.0
     scenario: str = ""  # task-specific chaos scenario overlay
     _black_swan_applied: bool = False
+    topology_template: str = "default"
     # --- Throughput tracking (anti-exploit) ---
     total_requests_received: int = 0
 # ---------------------------------------------------------------------------
+def _build_default_graph(
+    n: int = 8,
+    rng: Optional[random.Random] = None,
+    topology_template: str = "default",
+) -> Tuple[List[Node], Dict[int, List[int]]]:
+    """Create a constrained graph with node 0 = Database and rest = App Servers."""
+    rng = rng or random
     nodes = []
     for i in range(n):
         if i == 0:
             # Database node: higher capacity, single point of failure
             nodes.append(
                 Node(
+                    cpu_util=0.20 + rng.uniform(-0.03, 0.03),
                     capacity=25,
                     role="database",
                 )
         else:
             nodes.append(
                 Node(
+                    cpu_util=0.25 + rng.uniform(-0.05, 0.05),
                     capacity=15,
                     role="app_server",
                 )
     for i in range(1, n):
         adjacency[0].append(i)
         adjacency[i].append(0)
+    # App servers: deterministic constrained templates built from the same
+    # DB-star/ring physics. No arbitrary graph generation is introduced.
     for i in range(1, n):
         right = 1 + (i % (n - 1))  # wrap within app server range
         if right not in adjacency[i]:
             adjacency[i].append(right)
             adjacency[right].append(i)
+    if topology_template == "app_ring":
+        return nodes, adjacency
+    if topology_template == "sampled_mesh":
+        for i in range(1, n):
+            candidates = [j for j in range(1, n) if j != i and j not in adjacency[i]]
+            if candidates:
+                peer = rng.choice(candidates)
+                adjacency[i].append(peer)
+                adjacency[peer].append(i)
+        return nodes, adjacency
+    # Default and dense_mesh retain the historical skip-link shape.
+    for i in range(1, n):
+        right = 1 + (i % (n - 1))
         skip = 1 + ((i + 1) % (n - 1))
         if skip not in adjacency[i] and skip != right:
             adjacency[i].append(skip)
             adjacency[skip].append(i)
+    if topology_template == "dense_mesh":
+        for i in range(1, n):
+            extra = 1 + ((i + 2) % (n - 1))
+            if extra != i and extra not in adjacency[i]:
+                adjacency[i].append(extra)
+                adjacency[extra].append(i)
     return nodes, adjacency
         task_id = kwargs.get("task", kwargs.get("task_id", "traffic_spike"))
         curriculum_level = int(kwargs.get("curriculum_level", 0))
+        topology_template = str(kwargs.get("topology_template", "default"))
+        trace_offset_arg = kwargs.get("trace_offset", None)
+        trace_offset = int(trace_offset_arg) if trace_offset_arg is not None else 0
         # Auto-detect curriculum level from task_id if not explicitly given
         if curriculum_level == 0:
             }
             curriculum_level = _level_map.get(task_id, 2)
+        nodes, adjacency = _build_default_graph(
+            8,
+            rng=self._rng,
+            topology_template=topology_template,
+        )
         self._sim = SimulationState(
             nodes=nodes,
             adjacency=adjacency,
             curriculum_level=curriculum_level,
             cloud_budget=max(5, 15 - curriculum_level * 2),  # harder = tighter budget
             error_budget=100.0,
+            trace_offset=trace_offset,
+            trace_offset_locked=trace_offset_arg is not None,
+            topology_template=topology_template,
         )
         # Apply task-specific setup
         # --- Trace replay: override request rate from real data ---
         if sim.trace_replay is not None:
+            trace_step = sim.trace_replay.get_step(sim.step_count, offset=sim.trace_offset)
             sim.current_request_rate = trace_step.request_rate
             sim.last_trace_p99_latency = float(
                 getattr(trace_step, "p99_latency", 0.0) or 0.0

server/tasks.py CHANGED Viewed

@@ -331,10 +331,11 @@ def _setup_alibaba_trace(env: "DistributedInfraEnvironment", rng: "random.Random
     trace = load_default_trace()
     if trace is not None:
         sim.trace_replay = trace
-        # Start replay from a random offset to vary episodes
-        offset = rng.randint(0, max(1, len(trace) - sim.max_steps))
-        # We store offset in step_count adjustment — trace_loader wraps around
-        sim.current_request_rate = trace.get_step(offset).request_rate
     else:
         # Fallback: synthetic 2x traffic if trace not generated
         sim.current_request_rate = sim.base_request_rate * 2.0

     trace = load_default_trace()
     if trace is not None:
         sim.trace_replay = trace
+        # Start replay from a deterministic benchmark offset when provided,
+        # otherwise preserve the existing stochastic task variation.
+        if not sim.trace_offset_locked:
+            sim.trace_offset = rng.randint(0, max(1, len(trace) - sim.max_steps))
+        sim.current_request_rate = trace.get_step(0, offset=sim.trace_offset).request_rate
     else:
         # Fallback: synthetic 2x traffic if trace not generated
         sim.current_request_rate = sim.base_request_rate * 2.0

server/trace_loader.py CHANGED Viewed

@@ -71,11 +71,11 @@ class TraceReplay:
     def __len__(self) -> int:
         return len(self._steps)
-    def get_step(self, step: int) -> TraceStep:
-        """Get trace data for a given step. Wraps around."""
         if not self._steps:
             return TraceStep()
-        return self._steps[step % len(self._steps)]
 # ---------------------------------------------------------------------------

     def __len__(self) -> int:
         return len(self._steps)
+    def get_step(self, step: int, offset: int = 0) -> TraceStep:
+        """Get trace data for a given step plus deterministic offset. Wraps around."""
         if not self._steps:
             return TraceStep()
+        return self._steps[(int(step) + int(offset)) % len(self._steps)]
 # ---------------------------------------------------------------------------

train_grpo_unsloth.py CHANGED Viewed

@@ -101,19 +101,10 @@ MAX_COMPLETION_LENGTH = (
 SAVE_STEPS = 100
 ALL_TASKS = [
     "traffic_spike",
     "node_failure",
     "cascading_failure",
-    "flash_crowd",
-    "thundering_herd",
-    "zombie_node",
-    "hot_shard_skew",
-    "memory_leak_slow_burn",
-    "split_brain_io_bottleneck",
-    "black_swan_az_failure",
-    "retry_storm",
-    "connection_pool_deadlock",
-    "autoscaler_flapping_trap",
 ]
 # ---------------------------------------------------------------------------

 SAVE_STEPS = 100
 ALL_TASKS = [
+    "level_1_read_logs",
     "traffic_spike",
     "node_failure",
     "cascading_failure",
 ]
 # ---------------------------------------------------------------------------