# icarus112's picture
# Upload folder using huggingface_hub
# 2c30a29 verified
"""Eval agent: parse run.log and extract metrics from training runs."""
import re
from dataclasses import dataclass, field
@dataclass
class ExperimentResult:
    """Metrics and status collected from one experiment's run log.

    Numeric fields start at zero (``0.0`` for floats, ``0`` for ints), so a
    freshly constructed instance describes a run for which nothing has been
    parsed yet.  ``crashed`` marks a run whose log indicates a failure or
    whose log file is missing; ``error_message`` carries the accompanying
    text.
    """

    # --- Primary metric -------------------------------------------------
    val_bpb: float = 0.0

    # --- Timing ---------------------------------------------------------
    training_seconds: float = 0.0
    total_seconds: float = 0.0

    # --- Hardware -------------------------------------------------------
    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    # --- Throughput -----------------------------------------------------
    total_tokens_m: float = 0.0
    num_steps: int = 0

    # --- Model shape (echoed by the train.py summary block) -------------
    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    # --- Secondary health metrics ---------------------------------------
    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    # --- Status ---------------------------------------------------------
    crashed: bool = False
    error_message: str = ""
# Regex patterns keyed by ExperimentResult attribute name.
# Format must match the ``--- Summary ---`` block printed by train.py.
#
# Float values share a single capture fragment: digits with an optional
# fractional part (or a bare ``.5``), followed by an optional exponent.  A
# plain character class such as ``[\d.]+`` would stop at the ``e`` of
# ``5e-05`` (silently yielding ``5``) and would also match junk like ``...``
# that ``float()`` cannot convert.
_FLOAT_RE: str = r"((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
# Integer values are captured as a plain run of digits.
_INT_RE: str = r"(\d+)"

_PATTERNS: dict[str, str] = {
    "val_bpb": rf"^val_bpb:\s+{_FLOAT_RE}",
    "training_seconds": rf"^training_seconds:\s+{_FLOAT_RE}",
    "total_seconds": rf"^total_seconds:\s+{_FLOAT_RE}",
    "peak_vram_mb": rf"^peak_vram_mb:\s+{_FLOAT_RE}",
    "mfu_percent": rf"^mfu_percent:\s+{_FLOAT_RE}",
    # The log label uses a capital ``M``; the attribute name is lowercase.
    "total_tokens_m": rf"^total_tokens_M:\s+{_FLOAT_RE}",
    "num_steps": rf"^num_steps:\s+{_INT_RE}",
    "num_params_m": rf"^num_params_M:\s+{_FLOAT_RE}",
    "n_layer": rf"^n_layer:\s+{_INT_RE}",
    "d_model": rf"^d_model:\s+{_INT_RE}",
    "mhc_spectral_norm": rf"^mhc_spectral_norm:\s+{_FLOAT_RE}",
    "engram_hit_rate": rf"^engram_hit_rate:\s+{_FLOAT_RE}",
    "sr_bypass_rate": rf"^sr_bypass_rate:\s+{_FLOAT_RE}",
}

# Attributes that should be parsed as int rather than float.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    Args:
        log_path: Path to the run.log file.

    Returns:
        Populated ExperimentResult.  ``crashed=True`` is set when the file
        is missing or when the log text contains one of the crash markers
        ``"Traceback"``, ``"FAIL"`` or ``"Error"``.  Metrics whose line is
        absent, or whose captured value cannot be converted to a number,
        keep their default value.
    """
    result = ExperimentResult()
    try:
        # Decode explicitly as UTF-8 and replace undecodable bytes: the
        # platform default encoding varies, and a single stray byte in the
        # log should not abort parsing with UnicodeDecodeError.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result

    # Detect crash signals in output.  These are plain case-sensitive
    # substring checks; metrics are still extracted below so that a partial
    # summary remains available alongside the crash flag.
    if "Traceback" in content or "FAIL" in content or "Error" in content:
        result.crashed = True
        # Keep only the tail of the log as the error message.
        tail = content.strip().splitlines()[-20:]
        result.error_message = "\n".join(tail)

    for attr, pattern in _PATTERNS.items():
        # MULTILINE lets the ``^`` anchor in each pattern match at the start
        # of any line; the first matching line wins.
        match = re.search(pattern, content, re.MULTILINE)
        if match is None:
            continue
        convert = int if attr in _INT_ATTRS else float
        try:
            value = convert(match.group(1))
        except ValueError:
            # Malformed numeric token (e.g. "1.2.3"): leave the default in
            # place instead of letting the whole parse fail.
            continue
        setattr(result, attr, value)
    return result
def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Report which secondary health metrics breach their fixed limits.

    Three checks are made, in this order: ``mhc_spectral_norm`` above 2.0,
    ``engram_hit_rate`` strictly between 0 and 0.1, and ``mfu_percent``
    strictly between 0 and 10.  A value of exactly 0 never triggers the
    latter two (presumably because 0 is the default for a metric that was
    not found in the log -- confirm against the parser).

    Args:
        result: Parsed experiment result.

    Returns:
        Human-readable alarm strings in check order; an empty list when
        nothing is out of range.
    """
    spectral = result.mhc_spectral_norm
    hit_rate = result.engram_hit_rate
    mfu = result.mfu_percent

    # Each entry pairs "did this alarm fire?" with the message to report.
    checks = (
        (spectral > 2.0, f"mhc_spectral_norm={spectral:.4f} > 2.0 (ALARM)"),
        (
            0 < hit_rate < 0.1,
            f"engram_hit_rate={hit_rate:.4f} < 0.1 (memory underused)",
        ),
        (0 < mfu < 10, f"mfu_percent={mfu:.2f}% < 10% (GPU underutilized)"),
    )
    return [message for fired, message in checks if fired]
def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates can then reject an otherwise-improving result.
    Checks run in this order: crash flag, val_bpb validity, improvement
    over ``best_bpb``, ``mhc_spectral_norm`` gate, ``engram_hit_rate`` gate.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to a threshold dict, e.g.
            ``{"mhc_spectral_norm": {"max": 2.0}}``.  Only the ``"max"``
            bound of ``mhc_spectral_norm`` and the ``"min"`` bound of
            ``engram_hit_rate`` are consulted; a missing or ``None`` entry
            means no threshold for that metric.

    Returns:
        Tuple of (keep: bool, reason: str).  The reason is ``"keep"``,
        ``"crash"``, ``"invalid val_bpb"``, ``"discard"``, or a message
        naming the gate that was violated.
    """
    if result.crashed:
        return False, "crash"

    # Written as ``not (x > 0)`` rather than ``x <= 0`` so that NaN is also
    # rejected: every comparison with NaN is False, which would otherwise
    # let a NaN score slip past both this check and the one below.
    if not result.val_bpb > 0:
        return False, "invalid val_bpb"

    if result.val_bpb >= best_bpb:
        return False, "discard"

    # Secondary gate checks.
    if gates:
        # ``or {}`` tolerates an entry that is present but set to None.
        gate_mhc = (gates.get("mhc_spectral_norm") or {}).get("max")
        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
            return (
                False,
                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
            )

        gate_engram = (gates.get("engram_hit_rate") or {}).get("min")
        if gate_engram is not None and result.engram_hit_rate < gate_engram:
            return (
                False,
                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
            )

    return True, "keep"