"""Eval agent: parse run.log and extract metrics from training runs.""" import re from dataclasses import dataclass, field @dataclass class ExperimentResult: """Parsed result from a single experiment run. All float fields default to 0.0; integer fields default to 0. The ``crashed`` flag is set when the log indicates a failure or the log file is missing entirely. """ # Primary metric val_bpb: float = 0.0 # Timing training_seconds: float = 0.0 total_seconds: float = 0.0 # Hardware peak_vram_mb: float = 0.0 mfu_percent: float = 0.0 # Throughput total_tokens_m: float = 0.0 num_steps: int = 0 # Model shape (echoed by train.py summary block) num_params_m: float = 0.0 n_layer: int = 0 d_model: int = 0 # Secondary health metrics mhc_spectral_norm: float = 0.0 engram_hit_rate: float = 0.0 sr_bypass_rate: float = 0.0 # Status crashed: bool = False error_message: str = "" # Regex patterns keyed by ExperimentResult attribute name. # Format must match the ``--- Summary ---`` block printed by train.py. _PATTERNS: dict[str, str] = { "val_bpb": r"^val_bpb:\s+([\d.]+)", "training_seconds": r"^training_seconds:\s+([\d.]+)", "total_seconds": r"^total_seconds:\s+([\d.]+)", "peak_vram_mb": r"^peak_vram_mb:\s+([\d.]+)", "mfu_percent": r"^mfu_percent:\s+([\d.]+)", "total_tokens_m": r"^total_tokens_M:\s+([\d.]+)", "num_steps": r"^num_steps:\s+(\d+)", "num_params_m": r"^num_params_M:\s+([\d.]+)", "n_layer": r"^n_layer:\s+(\d+)", "d_model": r"^d_model:\s+(\d+)", "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)", "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)", "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)", } # Attributes that should be parsed as int rather than float. _INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"}) def parse_run_log(log_path: str) -> ExperimentResult: """Parse a run.log file and extract all training metrics. Args: log_path: Absolute path to the run.log file. Returns: Populated ExperimentResult; sets ``crashed=True`` when the log contains a traceback or the file is missing. """ result = ExperimentResult() try: with open(log_path) as fh: content = fh.read() except FileNotFoundError: result.crashed = True result.error_message = f"Log file not found: {log_path}" return result # Detect crash signals in output. if "Traceback" in content or "FAIL" in content or "Error" in content: result.crashed = True lines = content.strip().splitlines() result.error_message = "\n".join(lines[-20:]) for attr, pattern in _PATTERNS.items(): match = re.search(pattern, content, re.MULTILINE) if match: raw = match.group(1) setattr(result, attr, int(raw) if attr in _INT_ATTRS else float(raw)) return result def check_secondary_alarms(result: ExperimentResult) -> list[str]: """Check secondary metrics against fixed alarm thresholds. Args: result: Parsed experiment result. Returns: List of human-readable alarm strings (empty if all clear). """ alarms: list[str] = [] if result.mhc_spectral_norm > 2.0: alarms.append( f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)" ) if 0 < result.engram_hit_rate < 0.1: alarms.append( f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)" ) if 0 < result.mfu_percent < 10: alarms.append( f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)" ) return alarms def should_keep( result: ExperimentResult, best_bpb: float, gates: dict | None = None, ) -> tuple[bool, str]: """Decide whether to keep or discard an experiment. The primary criterion is strictly lower val_bpb than the current best. Optional secondary gates (passed from HarnessConfig.secondary_metrics) can reject an otherwise-improving result. Args: result: Parsed experiment result. best_bpb: Current best val_bpb across all experiments. gates: Optional dict mapping metric name to threshold dict with ``"max"`` or ``"min"`` keys, e.g. ``{"mhc_spectral_norm": {"max": 2.0}}``. Returns: Tuple of (keep: bool, reason: str). """ if result.crashed: return False, "crash" if result.val_bpb <= 0: return False, "invalid val_bpb" if result.val_bpb >= best_bpb: return False, "discard" # Secondary gate checks. if gates: gate_mhc = gates.get("mhc_spectral_norm", {}).get("max") if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc: return ( False, f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}", ) gate_engram = gates.get("engram_hit_rate", {}).get("min") if gate_engram is not None and result.engram_hit_rate < gate_engram: return ( False, f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}", ) return True, "keep"