Spaces:

v4xsh
/

nervousystem-env

Sleeping

App Files Files Community

nervousystem-env / simulation /telemetry.py

vx7sh

feat(curriculum): adaptive difficulty for telemetry, masking, and secondary failures

3928ed0 about 1 month ago

raw

history blame contribute delete

18.5 kB

	"""Telemetry simulation for GPU cluster with NCCL-faithful log formats."""

	from __future__ import annotations

	from enum import Enum
	from random import Random
	from typing import Any

	from app.config import NUM_NODES
	from simulation.cluster import ClusterStateMachine


	class NCCLSubsystem(str, Enum):
	    """NCCL INFO subsystems for which diagnostic log lines can be produced.

	    Members are also ``str`` instances (``str`` mix-in), so a member compares
	    equal to its tag string. The ``value`` of each member is the subsystem tag
	    as it is written into a log line.
	    """

	    NET_IB = "NET/IB"
	    GRAPH = "GRAPH/Search"
	    WATCHDOG = "Watchdog"
	    INIT = "Init"
	    TRANSPORT = "Transport"
	    SOCKET = "Socket"


	class NCCLLogEngine:
	    """Deterministic NCCL INFO log generator.

	    Every emitted line has the shape
	    ``<hostname>:<pid>:<tid> NCCL INFO <subsystem> <message>``.

	    The messages depend on the scenario's ``failure_type`` (this class
	    branches on "oom", "desync", "congestion" and "cascade"; any other value
	    yields the healthy variants), on the rank/offset being rendered and, for
	    NET/IB, on the cluster's throughput ratio.
	    """

	    def __init__(self, seed: int) -> None:
	        """Store the seed and create the seeded RNG.

	        Args:
	            seed: Seed used for the pid in the line prefix and for the RNG.
	        """
	        self._seed = seed
	        # None of the generators in this class currently draw from this RNG.
	        self._random = Random(seed)

	    def reset(self, seed: int) -> None:
	        """Replace the seed and re-create the RNG from it."""
	        self._seed = seed
	        self._random = Random(seed)

	    def generate(
	        self,
	        cluster: ClusterStateMachine,
	        subsystem: NCCLSubsystem,
	        time_window: int,
	    ) -> list[str]:
	        """Generate subsystem-specific NCCL INFO logs.

	        One batch of lines is produced per offset in ``range(window)``; the
	        rank rendered for an offset is ``offset % NUM_NODES``.

	        Args:
	            cluster: Cluster whose scenario and training state drive the output.
	            subsystem: Subsystem to render.
	            time_window: Number of offsets to render; values below 1 are
	                treated as 1.

	        Returns:
	            The generated lines in offset order. Empty when ``subsystem`` is
	            not one of the supported members.
	        """
	        window = max(1, time_window)
	        failure_type = cluster._scenario.failure_type

	        # Resolve the builder once instead of re-testing the subsystem on
	        # every iteration. Graph and watchdog builders additionally take the
	        # offset because they derive sequence ids from it.
	        offset_builder = {
	            NCCLSubsystem.GRAPH: self._graph_logs,
	            NCCLSubsystem.WATCHDOG: self._watchdog_logs,
	        }.get(subsystem)
	        plain_builder = {
	            NCCLSubsystem.NET_IB: self._net_ib_logs,
	            NCCLSubsystem.INIT: self._init_logs,
	            NCCLSubsystem.TRANSPORT: self._transport_logs,
	            NCCLSubsystem.SOCKET: self._socket_logs,
	        }.get(subsystem)
	        if offset_builder is None and plain_builder is None:
	            return []

	        logs: list[str] = []
	        for offset in range(window):
	            rank_id = offset % NUM_NODES
	            if offset_builder is not None:
	                logs.extend(offset_builder(cluster, rank_id, failure_type, offset))
	            else:
	                logs.extend(plain_builder(cluster, rank_id, failure_type))
	        return logs

	    def _prefix(self, node_id: int, rank_offset: int = 0) -> str:
	        """Build the ``hostname:pid:tid`` prefix for a node.

	        The pid is a pure function of the seed and node id and always falls
	        in ``[1000, 9999]``; the tid is the pid plus ``rank_offset``.
	        """
	        hostname = f"gpu-node-{node_id:02d}"
	        pid = 1000 + ((self._seed * 131 + node_id * 977) % 9000)
	        tid = pid + rank_offset
	        return f"{hostname}:{pid}:{tid}"

	    def _line(
	        self,
	        node_id: int,
	        subsystem: NCCLSubsystem,
	        message: str,
	        rank_offset: int = 0,
	    ) -> str:
	        """Format one NCCL INFO line: prefix, subsystem tag, then message."""
	        prefix = self._prefix(node_id=node_id, rank_offset=rank_offset)
	        return f"{prefix} NCCL INFO {subsystem.value} {message}"

	    def _net_ib_logs(
	        self, cluster: ClusterStateMachine, rank_id: int, failure_type: str
	    ) -> list[str]:
	        """Generate NET/IB logs for throughput and interface health.

	        Reported bandwidth is 200 GB/s scaled by current/target throughput
	        (the target is floored at 1.0 to avoid dividing by zero).
	        """
	        node_id = rank_id % NUM_NODES
	        training = cluster.training
	        ratio = training.throughput_tokens_per_sec / max(
	            1.0, training.target_throughput
	        )
	        current_gbps = 200.0 * ratio

	        if failure_type in {"congestion", "cascade"}:
	            link_message = (
	                f": Link degraded on mlx5_1:1, effective bandwidth "
	                f"{current_gbps:.1f} GB/s (< 160.0 GB/s target)"
	            )
	        else:
	            link_message = (
	                f": Link stable, effective bandwidth {current_gbps:.1f} GB/s"
	            )

	        return [
	            self._line(
	                node_id,
	                NCCLSubsystem.NET_IB,
	                ": Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB",
	                rank_offset=rank_id,
	            ),
	            self._line(
	                node_id,
	                NCCLSubsystem.NET_IB,
	                link_message,
	                rank_offset=rank_id,
	            ),
	        ]

	    def _graph_logs(
	        self,
	        cluster: ClusterStateMachine,
	        rank_id: int,
	        failure_type: str,
	        offset: int,
	    ) -> list[str]:
	        """Generate graph search and topology logs.

	        Always emits the topology summary; adds a spine-link warning for
	        congestion/cascade and a sequence-id mismatch line for desync/cascade.
	        """
	        node_id = rank_id % NUM_NODES
	        rings = 4
	        logs = [
	            self._line(
	                node_id,
	                NCCLSubsystem.GRAPH,
	                f": {NUM_NODES} nodes, {NUM_NODES * 8} GPUs, {rings} rings",
	                rank_offset=rank_id,
	            )
	        ]
	        if failure_type in {"congestion", "cascade"}:
	            logs.append(
	                self._line(
	                    node_id,
	                    NCCLSubsystem.GRAPH,
	                    ": ring 2 crosses oversubscribed spine links (rack-locality violated)",
	                    rank_offset=rank_id,
	                )
	            )
	        if failure_type in {"desync", "cascade"}:
	            base_seq = 1198 + offset
	            # Drift cycles through -1, 0, +1 by rank, so some ranks report a
	            # seq_id that differs from the expected one.
	            drift = (rank_id % 3) - 1
	            logs.append(
	                self._line(
	                    node_id,
	                    NCCLSubsystem.GRAPH,
	                    (
	                        ": compiled different collectives across ranks "
	                        f"(rank={rank_id}, seq_id={base_seq + drift}, expected={base_seq})"
	                    ),
	                    rank_offset=rank_id,
	                )
	            )
	        return logs

	    def _watchdog_logs(
	        self,
	        cluster: ClusterStateMachine,
	        rank_id: int,
	        failure_type: str,
	        offset: int,
	    ) -> list[str]:
	        """Generate watchdog timeout and rank stall diagnostics.

	        oom/desync/cascade report a collective timeout on the scenario's
	        failing rank (oom/cascade additionally report the stall); every other
	        failure type reports that no timeout was detected.
	        """
	        node_id = rank_id % NUM_NODES
	        failing_rank = cluster._scenario.failing_rank_id
	        seq_id = 1198 + offset

	        if failure_type not in {"oom", "desync", "cascade"}:
	            return [
	                self._line(
	                    node_id,
	                    NCCLSubsystem.WATCHDOG,
	                    ": no collective timeout detected in current window",
	                    rank_offset=rank_id,
	                )
	            ]

	        logs = [
	            self._line(
	                node_id,
	                NCCLSubsystem.WATCHDOG,
	                (
	                    ": collective operation timeout on rank "
	                    f"{failing_rank} (seq_id={seq_id})"
	                ),
	                rank_offset=rank_id,
	            )
	        ]
	        if failure_type in {"oom", "cascade"}:
	            logs.append(
	                self._line(
	                    node_id,
	                    NCCLSubsystem.WATCHDOG,
	                    f": rank {failing_rank} stalled after CUDA Xid 79 (OOM)",
	                    rank_offset=rank_id,
	                )
	            )
	        return logs

	    def _init_logs(
	        self, cluster: ClusterStateMachine, rank_id: int, failure_type: str
	    ) -> list[str]:
	        """Generate NCCL init and compatibility diagnostics.

	        The bootstrap line is always present; the library version mismatch
	        line is added only for the "cascade" failure type.
	        """
	        node_id = rank_id % NUM_NODES
	        logs = [
	            self._line(
	                node_id,
	                NCCLSubsystem.INIT,
	                ": Bootstrap : Using eth0:10.10.0.0<0>",
	                rank_offset=rank_id,
	            )
	        ]
	        if failure_type == "cascade":
	            logs.append(
	                self._line(
	                    node_id,
	                    NCCLSubsystem.INIT,
	                    (
	                        ": Loaded NCCL 2.21.5 but expected 2.27.0 - "
	                        "LD_LIBRARY_PATH version mismatch; message truncated errors may occur"
	                    ),
	                    rank_offset=rank_id,
	                )
	            )
	        return logs

	    def _transport_logs(
	        self, cluster: ClusterStateMachine, rank_id: int, failure_type: str
	    ) -> list[str]:
	        """Generate transport-level completion and error logs.

	        oom/cascade emit two error lines, desync/congestion emit one retry
	        line, anything else emits a single healthy line.
	        """
	        node_id = rank_id % NUM_NODES
	        if failure_type in {"oom", "cascade"}:
	            messages = [
	                "Got completion with error 12 (Remote access error)",
	                "CUDA driver reported Xid 79 on receive queue",
	            ]
	        elif failure_type in {"desync", "congestion"}:
	            messages = ["send proxy retries increased due to delayed completions"]
	        else:
	            messages = ["all transport channels healthy"]
	        return [
	            self._line(
	                node_id,
	                NCCLSubsystem.TRANSPORT,
	                message,
	                rank_offset=rank_id,
	            )
	            for message in messages
	        ]

	    def _socket_logs(
	        self, cluster: ClusterStateMachine, rank_id: int, failure_type: str
	    ) -> list[str]:
	        """Generate socket connect and retry logs.

	        oom/desync/cascade report a refused connection on the scenario's
	        failing rank; every other failure type reports success.
	        """
	        node_id = rank_id % NUM_NODES
	        failing_rank = cluster._scenario.failing_rank_id
	        if failure_type in {"oom", "desync", "cascade"}:
	            message = f"socketPollConnect: Connection refused on rank {failing_rank}"
	        else:
	            message = "socketPollConnect: connection established for all peers"
	        return [
	            self._line(
	                node_id,
	                NCCLSubsystem.SOCKET,
	                message,
	                rank_offset=rank_id,
	            )
	        ]


	class TelemetryStream:
	    """Simulates surface telemetry and hidden NCCL subsystem diagnostics.

	    Surface telemetry is the short list of summary lines rebuilt by
	    :meth:`update` and exposed through :meth:`visible_logs`. Deep diagnostics
	    are produced on demand by the ``generate_*`` methods and
	    :meth:`get_investigation_output`. All randomness comes from one seeded
	    ``Random`` instance, so the sequence of calls determines the output.
	    """

	    def __init__(
	        self,
	        seed: int,
	        red_herring_probability: float = 0.30,
	        telemetry_mask_probability: float = 0.20,
	    ) -> None:
	        """Initialize telemetry stream with deterministic randomness.

	        Args:
	            seed: Seed for this stream's RNG and for the NCCL log engine.
	            red_herring_probability: Chance per update of adding a misleading
	                alert about a healthy, non-failing node.
	            telemetry_mask_probability: Chance per update of reporting all
	                nodes as nominal instead of listing degraded nodes.
	        """
	        self._seed = seed
	        self._random = Random(seed)
	        self._surface_log_buffer: list[str] = []
	        self._step = 0
	        self._nccl_engine = NCCLLogEngine(seed=seed)
	        self._red_herring_probability = red_herring_probability
	        self._telemetry_mask_probability = telemetry_mask_probability

	    def reset(self, seed: int) -> None:
	        """Reset RNG, step counter, surface buffer and the NCCL engine.

	        The configured difficulty probabilities are left as they are.
	        """
	        self._seed = seed
	        self._random = Random(seed)
	        self._surface_log_buffer = []
	        self._step = 0
	        self._nccl_engine.reset(seed=seed)

	    def configure_difficulty(
	        self,
	        red_herring_probability: float,
	        telemetry_mask_probability: float,
	    ) -> None:
	        """Apply adaptive curriculum probabilities (stored as given)."""
	        self._red_herring_probability = red_herring_probability
	        self._telemetry_mask_probability = telemetry_mask_probability

	    def update(self, cluster: ClusterStateMachine) -> None:
	        """Rebuild the surface telemetry lines from the current cluster state.

	        The RNG is consumed in a fixed order: jitter, mask roll, then (only
	        when a healthy non-failing node exists) the red-herring roll and, if
	        that roll succeeds, the node choice.
	        """
	        self._step += 1
	        self._surface_log_buffer.clear()
	        lines = self._surface_log_buffer

	        training = cluster.training
	        throughput = training.throughput_tokens_per_sec
	        target = training.target_throughput
	        # Floor the target at 1.0 so a zero target cannot divide by zero.
	        ratio = throughput / max(1.0, target)
	        jitter_pct = self._random.uniform(0.5, 2.5)

	        lines.append(
	            f"cluster_status: training={training.job_status} step={training.current_step}"
	        )
	        lines.append(
	            f"throughput: {throughput:.1f}/{target:.1f} tok/s ({ratio * 100:.1f}% of target)"
	        )

	        degraded_nodes = [
	            node.node_id for node in cluster.nodes if node.health_status != "healthy"
	        ]
	        if self._random.random() < self._telemetry_mask_probability:
	            # Masked telemetry: claim everything is fine regardless of state.
	            lines.append("node_health: all nodes nominal")
	        else:
	            lines.append(
	                f"node_health: degraded_nodes={degraded_nodes if degraded_nodes else 'none'}"
	            )

	        healthy_non_failing_nodes = [
	            node.node_id
	            for node in cluster.nodes
	            if node.health_status == "healthy"
	            and node.node_id != cluster._scenario.failing_node_id
	        ]
	        # Short-circuit keeps the RNG untouched when there is no candidate.
	        if (
	            healthy_non_failing_nodes
	            and self._random.random() < self._red_herring_probability
	        ):
	            random_healthy_node = self._random.choice(healthy_non_failing_nodes)
	            lines.append(
	                "alert: node "
	                f"{random_healthy_node} elevated retransmit rate (may be noise)"
	            )

	        if cluster.training.job_status in {"stalled", "failed"}:
	            lines.append(
	                "alert: distributed collective progress stalled; diagnostics required"
	            )
	        elif ratio < 0.8:
	            lines.append("alert: interconnect throughput below expected envelope")

	        lines.append(f"sampling_noise: telemetry jitter {jitter_pct:.2f}%")
	        # Cap the visible surface at seven lines.
	        self._surface_log_buffer = lines[:7]

	    def visible_logs(self) -> list[str]:
	        """Return a copy of the surface logs produced by the last update."""
	        return list(self._surface_log_buffer)

	    def generate_nccl_subsystem_logs(
	        self,
	        cluster: ClusterStateMachine,
	        subsystem: str,
	        time_window: int = 10,
	    ) -> list[str]:
	        """Return deep subsystem logs for targeted diagnostics.

	        Args:
	            cluster: Cluster whose state drives the generated lines.
	            subsystem: Subsystem name. Matching ignores case and surrounding
	                whitespace and treats ``-`` and ``/`` as ``_``, so "NET/IB",
	                "net-ib" and "net_ib" are equivalent. Unrecognized names fall
	                back to the watchdog subsystem.
	            time_window: Number of offsets to render.

	        Returns:
	            The generated NCCL INFO lines.
	        """
	        normalized = subsystem.strip().lower().replace("-", "_").replace("/", "_")
	        subsystem_map: dict[str, NCCLSubsystem] = {
	            "net_ib": NCCLSubsystem.NET_IB,
	            "graph": NCCLSubsystem.GRAPH,
	            # The enum's own value "GRAPH/Search" normalizes to this key;
	            # without it that spelling silently fell back to the watchdog.
	            "graph_search": NCCLSubsystem.GRAPH,
	            "watchdog": NCCLSubsystem.WATCHDOG,
	            "init": NCCLSubsystem.INIT,
	            "transport": NCCLSubsystem.TRANSPORT,
	            "socket": NCCLSubsystem.SOCKET,
	        }
	        selected = subsystem_map.get(normalized, NCCLSubsystem.WATCHDOG)
	        return self._nccl_engine.generate(
	            cluster=cluster,
	            subsystem=selected,
	            time_window=time_window,
	        )

	    def generate_nccl_logs(
	        self, cluster: ClusterStateMachine, time_window: int = 10
	    ) -> list[str]:
	        """Backward-compatible deep logs helper across major subsystems.

	        Concatenates watchdog, graph and NET/IB logs in that order, giving
	        each subsystem a third of ``time_window`` (at least 1).
	        """
	        per_subsystem_window = max(1, time_window // 3)
	        logs: list[str] = []
	        for subsystem in (
	            NCCLSubsystem.WATCHDOG,
	            NCCLSubsystem.GRAPH,
	            NCCLSubsystem.NET_IB,
	        ):
	            logs.extend(
	                self._nccl_engine.generate(
	                    cluster=cluster,
	                    subsystem=subsystem,
	                    time_window=per_subsystem_window,
	                )
	            )
	        return logs

	    def get_investigation_output(
	        self,
	        cluster: ClusterStateMachine,
	        action_type: str,
	        rank_id: int | None = None,
	    ) -> dict[str, Any]:
	        """Return action-specific deep output with realistic noise.

	        Args:
	            cluster: Cluster being investigated.
	            action_type: "inspect_flight_recorder" or "query_nccl_logs".
	            rank_id: Rank to inspect; required for "inspect_flight_recorder".

	        Returns:
	            ``{"flight_recorder": payload}`` or ``{"nccl_logs": [str, ...]}``
	            on success, otherwise ``{"error": message}`` for a missing
	            ``rank_id`` or an unsupported action.
	        """
	        if action_type == "inspect_flight_recorder":
	            if rank_id is None:
	                return {"error": "rank_id parameter required"}
	            return self._flight_recorder_output(cluster, int(rank_id))
	        if action_type == "query_nccl_logs":
	            return self._nccl_logs_output(cluster)
	        return {"error": "unsupported investigation action"}

	    def _flight_recorder_output(
	        self, cluster: ClusterStateMachine, rank_id: int
	    ) -> dict[str, Any]:
	        """Fetch flight-recorder data for a rank and append 1-2 noise entries."""
	        payload = cluster._generate_flight_recorder_data(rank_id)
	        entries = list(payload.get("entries", []))

	        wrong_ranks = [
	            node.node_id for node in cluster.nodes if node.node_id != rank_id
	        ]
	        # Shuffle first, then draw the count: this order fixes the RNG stream.
	        self._random.shuffle(wrong_ranks)
	        noise_count = self._random.randint(1, 2)

	        for wrong_rank in wrong_ranks[:noise_count]:
	            created_ns = (self._seed * 1_000_000) + (wrong_rank * 10_000)
	            entries.append(
	                {
	                    "profiling_name": "nccl:all_reduce",
	                    "rank": wrong_rank,
	                    "collective_seq_id": 2000 + wrong_rank,
	                    "p2p_seq_id": 0,
	                    "op_id": 2000 + wrong_rank,
	                    "state": "completed",
	                    "input_sizes": [[1024, 2048]],
	                    "output_sizes": [[1024, 2048]],
	                    "input_dtypes": ["Float"],
	                    "output_dtypes": ["Float"],
	                    "timeout_ms": 1800000,
	                    "time_created_ns": created_ns,
	                    "time_started_ns": created_ns + 100,
	                    "time_finished_ns": created_ns + 600,
	                    "frames": [
	                        {
	                            "name": "all_reduce",
	                            "filename": "torch/distributed/distributed_c10d.py",
	                            "line": 2891,
	                        }
	                    ],
	                }
	            )

	        payload["entries"] = entries
	        return {"flight_recorder": payload}

	    def _nccl_logs_output(self, cluster: ClusterStateMachine) -> dict[str, Any]:
	        """Return watchdog logs preceded by three misleading lines.

	        The misleading lines name a randomly chosen id other than the
	        scenario's failing rank (or the failing rank itself when there is no
	        other candidate).
	        """
	        logs = self.generate_nccl_subsystem_logs(cluster, "watchdog", time_window=10)
	        failing_rank = cluster._scenario.failing_rank_id
	        wrong_rank_candidates = [
	            node.node_id for node in cluster.nodes if node.node_id != failing_rank
	        ]
	        wrong_rank = (
	            self._random.choice(wrong_rank_candidates)
	            if wrong_rank_candidates
	            else failing_rank
	        )
	        misleading = [
	            (
	                f"[t-{i}][rank{wrong_rank}] NCCL INFO Watchdog: "
	                "brief timeout observed; auto-recovered in 12ms"
	            )
	            for i in range(3, 0, -1)
	        ]
	        # One flat list of lines, not a pair of nested lists.
	        return {"nccl_logs": [*misleading, *logs]}