Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

feather-runtime / overlay /harness /eval_agent.py

Jackoatmon

Update Feather h200 training runtime image

e317e25 verified 26 days ago

raw

history blame contribute delete

5.35 kB

	"""Eval agent: parse run.log and extract metrics from training runs."""
	import re
	from dataclasses import dataclass, field


	@dataclass
	class ExperimentResult:
	    """Metrics and status extracted from a single experiment run.

	    Float fields start at ``0.0`` and integer fields at ``0``.
	    ``crashed`` is meant to be raised when the log indicates a failure or
	    the log file is missing entirely; ``error_message`` is empty by default.
	    """

	    # Headline metric.
	    val_bpb: float = 0.0

	    # Timing, in seconds.
	    training_seconds: float = 0.0
	    total_seconds: float = 0.0

	    # Hardware usage.
	    peak_vram_mb: float = 0.0
	    mfu_percent: float = 0.0

	    # Throughput.
	    total_tokens_m: float = 0.0
	    num_steps: int = 0

	    # Model shape, as echoed by the train.py summary block.
	    num_params_m: float = 0.0
	    n_layer: int = 0
	    d_model: int = 0

	    # Secondary health metrics.
	    mhc_spectral_norm: float = 0.0
	    engram_hit_rate: float = 0.0
	    sr_bypass_rate: float = 0.0

	    # Run status.
	    crashed: bool = False
	    error_message: str = ""


	# Regex patterns keyed by ExperimentResult attribute name.
	# Format must match the ``--- Summary ---`` block printed by train.py.
	#
	# Every pattern has exactly one capture group holding the numeric text.
	# The float grammar accepts an optional sign, a decimal point and a
	# scientific-notation exponent, so ``3.5e-05`` is captured whole instead of
	# being truncated to ``3.5``. It only ever captures text that ``float()``
	# accepts (``1.2.3`` yields ``1.2``; a lone ``.`` does not match), and it
	# deliberately does not match ``nan`` / ``inf``, so such a value leaves the
	# attribute at its default.
	_FLOAT_CAPTURE = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
	_INT_CAPTURE = r"(\d+)"

	_PATTERNS: dict[str, str] = {
	    "val_bpb": rf"^val_bpb:\s+{_FLOAT_CAPTURE}",
	    "training_seconds": rf"^training_seconds:\s+{_FLOAT_CAPTURE}",
	    "total_seconds": rf"^total_seconds:\s+{_FLOAT_CAPTURE}",
	    "peak_vram_mb": rf"^peak_vram_mb:\s+{_FLOAT_CAPTURE}",
	    "mfu_percent": rf"^mfu_percent:\s+{_FLOAT_CAPTURE}",
	    "total_tokens_m": rf"^total_tokens_M:\s+{_FLOAT_CAPTURE}",
	    "num_steps": rf"^num_steps:\s+{_INT_CAPTURE}",
	    "num_params_m": rf"^num_params_M:\s+{_FLOAT_CAPTURE}",
	    "n_layer": rf"^n_layer:\s+{_INT_CAPTURE}",
	    "d_model": rf"^d_model:\s+{_INT_CAPTURE}",
	    "mhc_spectral_norm": rf"^mhc_spectral_norm:\s+{_FLOAT_CAPTURE}",
	    "engram_hit_rate": rf"^engram_hit_rate:\s+{_FLOAT_CAPTURE}",
	    "sr_bypass_rate": rf"^sr_bypass_rate:\s+{_FLOAT_CAPTURE}",
	}

	# Attributes that should be parsed as int rather than float.
	_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})


	def parse_run_log(log_path: str) -> ExperimentResult:
	    """Parse a run.log file and extract all training metrics.

	    The whole file is searched once per entry in ``_PATTERNS``; the first
	    line matching a pattern supplies that attribute's value. Attributes
	    with no matching line keep their defaults.

	    Args:
	        log_path: Path to the run.log file.

	    Returns:
	        Populated ExperimentResult. ``crashed`` is set to True when the
	        file does not exist (``error_message`` then names the path), or
	        when the text contains ``"Traceback"``, ``"FAIL"`` or ``"Error"``
	        (``error_message`` is then the last 20 lines of the log, and
	        metrics are still parsed from whatever was printed).
	    """
	    result = ExperimentResult()

	    try:
	        # Decode leniently so a stray non-UTF-8 byte in the log cannot abort
	        # parsing with UnicodeDecodeError; the metric lines are plain ASCII.
	        with open(log_path, encoding="utf-8", errors="replace") as fh:
	            content = fh.read()
	    except FileNotFoundError:
	        result.crashed = True
	        result.error_message = f"Log file not found: {log_path}"
	        return result

	    # Detect crash signals in output (case-sensitive substring checks).
	    if "Traceback" in content or "FAIL" in content or "Error" in content:
	        result.crashed = True
	        lines = content.strip().splitlines()
	        result.error_message = "\n".join(lines[-20:])

	    for attr, pattern in _PATTERNS.items():
	        match = re.search(pattern, content, re.MULTILINE)
	        if match is None:
	            continue
	        raw = match.group(1)
	        try:
	            value = int(raw) if attr in _INT_ATTRS else float(raw)
	        except ValueError:
	            # Defensive: a malformed number must not take down the whole
	            # parse; leave this attribute at its default instead.
	            continue
	        setattr(result, attr, value)

	    return result


	def check_secondary_alarms(result: ExperimentResult) -> list[str]:
	    """Report which secondary metrics breach their fixed alarm thresholds.

	    Three checks run, in this order: ``mhc_spectral_norm`` above 2.0,
	    ``engram_hit_rate`` strictly between 0 and 0.1, and ``mfu_percent``
	    strictly between 0 and 10. A value of exactly 0 never triggers the
	    latter two checks.

	    Args:
	        result: Parsed experiment result.

	    Returns:
	        Human-readable alarm strings in check order; empty when no
	        threshold is breached.
	    """
	    # Each entry pairs "did this check fire?" with the message to report.
	    checks = (
	        (
	            result.mhc_spectral_norm > 2.0,
	            f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)",
	        ),
	        (
	            0 < result.engram_hit_rate < 0.1,
	            f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)",
	        ),
	        (
	            0 < result.mfu_percent < 10,
	            f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)",
	        ),
	    )
	    return [message for fired, message in checks if fired]


	def should_keep(
	result: ExperimentResult,
	best_bpb: float,
	gates: dict \| None = None,
	) -> tuple[bool, str]:
	"""Decide whether to keep or discard an experiment.

	The primary criterion is strictly lower val_bpb than the current best.
	Optional secondary gates (passed from HarnessConfig.secondary_metrics)
	can reject an otherwise-improving result.

	Args:
	result: Parsed experiment result.
	best_bpb: Current best val_bpb across all experiments.
	gates: Optional dict mapping metric name to threshold dict with
	``"max"`` or ``"min"`` keys, e.g.
	``{"mhc_spectral_norm": {"max": 2.0}}``.

	Returns:
	Tuple of (keep: bool, reason: str).
	"""
	if result.crashed:
	return False, "crash"
	if result.val_bpb <= 0:
	return False, "invalid val_bpb"
	if result.val_bpb >= best_bpb:
	return False, "discard"

	# Secondary gate checks.
	if gates:
	gate_mhc = gates.get("mhc_spectral_norm", {}).get("max")
	if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
	return (
	False,
	f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
	)
	gate_engram = gates.get("engram_hit_rate", {}).get("min")
	if gate_engram is not None and result.engram_hit_rate < gate_engram:
	return (
	False,
	f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
	)

	return True, "keep"