| """Eval agent: parse run.log and extract metrics from training runs.""" |
| import re |
| from dataclasses import dataclass, field |
|
|
|
|
@dataclass
class ExperimentResult:
    """Metrics parsed from one experiment's run log.

    Every numeric field starts at zero (``0.0`` for floats, ``0`` for
    integers), so a zero value may simply mean the metric was never set.
    ``crashed`` starts as ``False`` and ``error_message`` as an empty
    string; both are filled in by the log parser when the log indicates a
    failure or the log file is missing.
    """

    # Primary metric used for the keep/discard decision.
    # (presumably validation bits-per-byte -- confirm against the trainer)
    val_bpb: float = 0.0

    # Wall-clock timings, in seconds as the names suggest.
    training_seconds: float = 0.0
    total_seconds: float = 0.0

    # Hardware utilisation.
    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    # Training volume.  ``total_tokens_m`` is read from the ``total_tokens_M``
    # log key (presumably millions of tokens -- confirm).
    total_tokens_m: float = 0.0
    num_steps: int = 0

    # Model size / shape.  ``num_params_m`` is read from ``num_params_M``.
    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    # Secondary diagnostics checked by the alarm / gate helpers.
    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    # Failure state.
    crashed: bool = False
    error_message: str = ""
|
|
|
|
| |
| |
# Numeric literal accepted for float metrics: digits with an optional
# fraction (or a bare leading-dot fraction), followed by an optional exponent.
# The exponent matters: with a plain ``[\d.]+`` capture a value printed in
# scientific notation such as "1.5e-05" is cut at the "e" and read as 1.5.
# The whole literal is capture group 1; inner groups are non-capturing.
_FLOAT_VALUE: str = r"((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

# Unsigned integer literal, capture group 1.
_INT_VALUE: str = r"(\d+)"

# Maps an ExperimentResult attribute name to the regex that finds its value
# in the log.  Patterns are anchored with ``^`` and therefore expect to be
# searched with ``re.MULTILINE``; the value is always capture group 1.
# Note the log keys ``total_tokens_M`` / ``num_params_M`` use an upper-case
# "M" while the corresponding attributes are lower-case.
_PATTERNS: dict[str, str] = {
    "val_bpb": rf"^val_bpb:\s+{_FLOAT_VALUE}",
    "training_seconds": rf"^training_seconds:\s+{_FLOAT_VALUE}",
    "total_seconds": rf"^total_seconds:\s+{_FLOAT_VALUE}",
    "peak_vram_mb": rf"^peak_vram_mb:\s+{_FLOAT_VALUE}",
    "mfu_percent": rf"^mfu_percent:\s+{_FLOAT_VALUE}",
    "total_tokens_m": rf"^total_tokens_M:\s+{_FLOAT_VALUE}",
    "num_steps": rf"^num_steps:\s+{_INT_VALUE}",
    "num_params_m": rf"^num_params_M:\s+{_FLOAT_VALUE}",
    "n_layer": rf"^n_layer:\s+{_INT_VALUE}",
    "d_model": rf"^d_model:\s+{_INT_VALUE}",
    "mhc_spectral_norm": rf"^mhc_spectral_norm:\s+{_FLOAT_VALUE}",
    "engram_hit_rate": rf"^engram_hit_rate:\s+{_FLOAT_VALUE}",
    "sr_bypass_rate": rf"^sr_bypass_rate:\s+{_FLOAT_VALUE}",
}

# Attributes whose captured text is converted with ``int`` rather than
# ``float``.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
|
|
|
|
def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    Each metric is looked up with its regex from ``_PATTERNS`` (the first
    match in the file is used) and converted with ``int`` for names listed
    in ``_INT_ATTRS`` and ``float`` otherwise.  Metrics that are absent, or
    whose captured text is not a valid number, keep their dataclass
    defaults.  Metrics are still extracted when the run is flagged as
    crashed.

    Args:
        log_path: Path to the run.log file.

    Returns:
        Populated ExperimentResult.  ``crashed`` is True when the file is
        missing, or when the log text contains "Traceback", "FAIL" or
        "Error"; in the latter case ``error_message`` holds the last 20
        lines of the log.
    """
    result = ExperimentResult()

    try:
        # Decode explicitly as UTF-8 instead of the locale default, and
        # replace undecodable bytes: a stray byte in the log must not turn
        # metric extraction into a UnicodeDecodeError.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result

    # Plain substring checks: any of these markers anywhere in the log flags
    # the run as crashed.
    if "Traceback" in content or "FAIL" in content or "Error" in content:
        result.crashed = True
        lines = content.strip().splitlines()
        result.error_message = "\n".join(lines[-20:])

    for attr, pattern in _PATTERNS.items():
        match = re.search(pattern, content, re.MULTILINE)
        if not match:
            continue
        raw = match.group(1)
        try:
            value = int(raw) if attr in _INT_ATTRS else float(raw)
        except ValueError:
            # The pattern matched text that is not a number (e.g. "1.2.3");
            # leave the default rather than aborting the whole parse.
            continue
        setattr(result, attr, value)

    return result
|
|
|
|
def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Report which secondary metrics breach their fixed alarm thresholds.

    Three checks are made, always in this order:

    * ``mhc_spectral_norm`` strictly above 2.0;
    * ``engram_hit_rate`` strictly between 0 and 0.1;
    * ``mfu_percent`` strictly between 0 and 10.

    A value of exactly 0 never triggers the two lower-bound checks.

    Args:
        result: Parsed experiment result.

    Returns:
        One human-readable message per triggered alarm; an empty list when
        nothing is triggered.
    """
    # (triggered?, message) pairs in reporting order.
    checks = (
        (
            result.mhc_spectral_norm > 2.0,
            f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)",
        ),
        (
            0 < result.engram_hit_rate < 0.1,
            f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)",
        ),
        (
            0 < result.mfu_percent < 10,
            f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)",
        ),
    )
    return [message for triggered, message in checks if triggered]
|
|
|
|
def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
    can reject an otherwise-improving result.

    Gates apply to any numeric attribute of ``result``, not only to a fixed
    pair of metrics.  They are evaluated in the iteration order of
    ``gates``; for each metric the ``"max"`` bound is checked before the
    ``"min"`` bound, and the first violated bound determines the reason.
    Entries that name an unknown or non-numeric attribute, or whose
    threshold spec has no ``get`` method, are ignored.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to threshold dict with
            ``"max"`` and/or ``"min"`` keys, e.g.
            ``{"mhc_spectral_norm": {"max": 2.0}}``.  A metric is rejected
            when its value is strictly above ``"max"`` or strictly below
            ``"min"``.

    Returns:
        Tuple of (keep: bool, reason: str).  ``reason`` is ``"crash"``,
        ``"invalid val_bpb"``, ``"discard"``, a gate-violation message of
        the form ``"<metric> <value> > gate <max>"`` /
        ``"<metric> <value> < gate <min>"``, or ``"keep"``.
    """
    if result.crashed:
        return False, "crash"
    # Written as "not > 0" rather than "<= 0" so that a NaN val_bpb, which
    # fails every comparison, is rejected instead of slipping through.
    if not result.val_bpb > 0:
        return False, "invalid val_bpb"
    if result.val_bpb >= best_bpb:
        return False, "discard"

    for metric, bounds in (gates or {}).items():
        # Duck-typed so mapping-like config objects work as well as dicts.
        get_bound = getattr(bounds, "get", None)
        if get_bound is None:
            continue
        value = getattr(result, metric, None)
        # Only numeric metrics can be gated; bool is excluded explicitly
        # because it is a subclass of int.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue

        upper = get_bound("max")
        if upper is not None and value > upper:
            return False, f"{metric} {value:.4f} > gate {upper}"
        lower = get_bound("min")
        if lower is not None and value < lower:
            return False, f"{metric} {value:.4f} < gate {lower}"

    return True, "keep"
|
|