"""Eval agent: parse run.log and extract metrics from training runs."""
import re
from dataclasses import dataclass, field
@dataclass
class ExperimentResult:
    """Metrics and status extracted from the log of one experiment run.

    A freshly constructed instance holds only defaults: ``0.0`` for every
    float field, ``0`` for every integer field, ``False`` for ``crashed``
    and an empty ``error_message``.  ``crashed`` is the flag used to mark
    a run whose log indicates a failure or whose log file is missing.
    """

    # --- primary metric ---
    val_bpb: float = 0.0

    # --- timing ---
    training_seconds: float = 0.0
    total_seconds: float = 0.0

    # --- hardware ---
    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    # --- throughput ---
    total_tokens_m: float = 0.0
    num_steps: int = 0

    # --- model shape, as echoed by the train.py summary block ---
    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    # --- secondary health metrics ---
    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    # --- status ---
    crashed: bool = False
    error_message: str = ""
# Capture-group fragments shared by the patterns below.
#
# ``_FLOAT_RE`` captures an unsigned decimal number with an optional
# exponent ("0.9979", "3", ".5", "3.5e-05").  A looser ``[\d.]+`` capture
# would stop at the "e" of "3.5e-05" and yield 3.5, and would also accept
# strings such as "1.2.3" that ``float()`` cannot convert.
_FLOAT_RE: str = r"((?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?)"
_INT_RE: str = r"(\d+)"

# Regex patterns keyed by ExperimentResult attribute name.
# Format must match the ``--- Summary ---`` block printed by train.py.
# Each pattern is anchored with ``^`` and is meant to be searched with
# ``re.MULTILINE`` so that it matches at the start of any log line; group 1
# is the numeric value.  Note that the log keys ``total_tokens_M`` and
# ``num_params_M`` use an upper-case ``M`` while the attribute names do not.
_PATTERNS: dict[str, str] = {
    "val_bpb": r"^val_bpb:\s+" + _FLOAT_RE,
    "training_seconds": r"^training_seconds:\s+" + _FLOAT_RE,
    "total_seconds": r"^total_seconds:\s+" + _FLOAT_RE,
    "peak_vram_mb": r"^peak_vram_mb:\s+" + _FLOAT_RE,
    "mfu_percent": r"^mfu_percent:\s+" + _FLOAT_RE,
    "total_tokens_m": r"^total_tokens_M:\s+" + _FLOAT_RE,
    "num_steps": r"^num_steps:\s+" + _INT_RE,
    "num_params_m": r"^num_params_M:\s+" + _FLOAT_RE,
    "n_layer": r"^n_layer:\s+" + _INT_RE,
    "d_model": r"^d_model:\s+" + _INT_RE,
    "mhc_spectral_norm": r"^mhc_spectral_norm:\s+" + _FLOAT_RE,
    "engram_hit_rate": r"^engram_hit_rate:\s+" + _FLOAT_RE,
    "sr_bypass_rate": r"^sr_bypass_rate:\s+" + _FLOAT_RE,
}

# Attributes that should be parsed as int rather than float.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    The file is read in one go.  It is flagged as crashed when its text
    contains any of the substrings ``"Traceback"``, ``"FAIL"`` or
    ``"Error"``; in that case the last 20 lines of the log are stored as
    the error message.  Metrics are still extracted from a crashed log, so
    whatever values were printed before the failure are preserved.

    Args:
        log_path: Path to the run.log file.

    Returns:
        Populated ExperimentResult.  Metrics that do not appear in the
        log (or whose value cannot be converted to a number) keep their
        default.  ``crashed=True`` is set when the log contains a crash
        marker or the file is missing.
    """
    result = ExperimentResult()

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding, and substitute undecodable bytes so
        # stray binary output in the log cannot abort the parse.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result

    # Detect crash signals in output.
    if any(marker in content for marker in ("Traceback", "FAIL", "Error")):
        result.crashed = True
        # Keep only the tail of the log; that is where the failure shows up.
        result.error_message = "\n".join(content.strip().splitlines()[-20:])

    for attr, pattern in _PATTERNS.items():
        # First occurrence wins; MULTILINE lets ``^`` match at any line start.
        match = re.search(pattern, content, re.MULTILINE)
        if match is None:
            continue
        convert = int if attr in _INT_ATTRS else float
        try:
            value = convert(match.group(1))
        except ValueError:
            # Defensive: a capture that is not a valid number leaves the
            # default in place instead of aborting the whole parse.
            continue
        setattr(result, attr, value)

    return result
def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Report which secondary metrics fall outside their fixed limits.

    Three checks are made, in this order: ``mhc_spectral_norm`` above 2.0,
    ``engram_hit_rate`` strictly between 0 and 0.1, and ``mfu_percent``
    strictly between 0 and 10.  A value of exactly 0 never triggers the
    latter two checks.

    Args:
        result: Parsed experiment result.

    Returns:
        One human-readable message per triggered check, in the order
        above; an empty list when nothing is out of range.
    """
    spectral = result.mhc_spectral_norm
    hit_rate = result.engram_hit_rate
    mfu = result.mfu_percent

    # (triggered?, message) pairs; order here is the order of the output.
    checks = (
        (
            spectral > 2.0,
            f"mhc_spectral_norm={spectral:.4f} > 2.0 (ALARM)",
        ),
        (
            0 < hit_rate < 0.1,
            f"engram_hit_rate={hit_rate:.4f} < 0.1 (memory underused)",
        ),
        (
            0 < mfu < 10,
            f"mfu_percent={mfu:.2f}% < 10% (GPU underutilized)",
        ),
    )
    return [message for triggered, message in checks if triggered]
def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
    can reject an otherwise-improving result.  Checks run in this order
    and the first failing one determines the reason:

    1. crashed run -> ``"crash"``
    2. val_bpb not a positive number (zero, negative or NaN)
       -> ``"invalid val_bpb"``
    3. val_bpb not strictly below ``best_bpb`` -> ``"discard"``
    4. ``mhc_spectral_norm`` above its ``"max"`` gate, if configured
    5. ``engram_hit_rate`` below its ``"min"`` gate, if configured

    Only the two gates listed above are evaluated; other entries in
    ``gates`` are ignored.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to threshold dict with
            ``"max"`` or ``"min"`` keys, e.g.
            ``{"mhc_spectral_norm": {"max": 2.0}}``.

    Returns:
        Tuple of (keep: bool, reason: str).
    """
    if result.crashed:
        return False, "crash"

    # Written as ``not (x > 0)`` rather than ``x <= 0`` so that NaN, which
    # compares False against everything, is rejected here instead of
    # slipping past both this check and the ``>= best_bpb`` check below.
    if not (result.val_bpb > 0):
        return False, "invalid val_bpb"

    if result.val_bpb >= best_bpb:
        return False, "discard"

    # Secondary gate checks.
    if gates:
        # ``or {}`` tolerates a gate entry that is present but set to None.
        gate_mhc = (gates.get("mhc_spectral_norm") or {}).get("max")
        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
            return (
                False,
                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
            )

        gate_engram = (gates.get("engram_hit_rate") or {}).get("min")
        if gate_engram is not None and result.engram_hit_rate < gate_engram:
            return (
                False,
                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
            )

    return True, "keep"
|