Spaces:
Runtime error
Runtime error
| """Eval agent: parse run.log and extract metrics from training runs.""" | |
| import re | |
| from dataclasses import dataclass, field | |
@dataclass
class ExperimentResult:
    """Parsed result from a single experiment run.

    All float fields default to 0.0; integer fields default to 0.
    The ``crashed`` flag is set when the log indicates a failure or the
    log file is missing entirely.

    Declared as a dataclass so that instances can be built with keyword
    arguments, compare field-by-field and have a readable ``repr``.
    """

    # Primary metric
    val_bpb: float = 0.0

    # Timing
    training_seconds: float = 0.0
    total_seconds: float = 0.0

    # Hardware
    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    # Throughput
    total_tokens_m: float = 0.0
    num_steps: int = 0

    # Model shape (echoed by train.py summary block)
    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    # Secondary health metrics
    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    # Status
    crashed: bool = False
    error_message: str = ""
# Regex patterns keyed by ExperimentResult attribute name.
# Format must match the ``--- Summary ---`` block printed by train.py.
#
# Float fields accept an optional sign, a decimal mantissa and an optional
# exponent, so a value printed in scientific notation (e.g. ``5e-05``) is
# captured whole instead of being truncated to its mantissa.  ``nan`` and
# ``inf`` are deliberately not matched.
_FLOAT_RE: str = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

_PATTERNS: dict[str, str] = {
    "val_bpb": r"^val_bpb:\s+" + _FLOAT_RE,
    "training_seconds": r"^training_seconds:\s+" + _FLOAT_RE,
    "total_seconds": r"^total_seconds:\s+" + _FLOAT_RE,
    "peak_vram_mb": r"^peak_vram_mb:\s+" + _FLOAT_RE,
    "mfu_percent": r"^mfu_percent:\s+" + _FLOAT_RE,
    "total_tokens_m": r"^total_tokens_M:\s+" + _FLOAT_RE,
    "num_steps": r"^num_steps:\s+(\d+)",
    "num_params_m": r"^num_params_M:\s+" + _FLOAT_RE,
    "n_layer": r"^n_layer:\s+(\d+)",
    "d_model": r"^d_model:\s+(\d+)",
    "mhc_spectral_norm": r"^mhc_spectral_norm:\s+" + _FLOAT_RE,
    "engram_hit_rate": r"^engram_hit_rate:\s+" + _FLOAT_RE,
    "sr_bypass_rate": r"^sr_bypass_rate:\s+" + _FLOAT_RE,
}

# Attributes that should be parsed as int rather than float.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    Args:
        log_path: Path to the run.log file.

    Returns:
        Populated ExperimentResult. ``crashed`` is set to True when the
        file does not exist, or when the log text contains any of the
        substrings ``Traceback``, ``FAIL`` or ``Error``; in the latter case
        ``error_message`` holds the last 20 lines of the log. Metrics are
        still extracted from a log flagged as crashed. A metric whose line
        is absent, or whose captured text is not a valid number, keeps its
        default value.
    """
    result = ExperimentResult()

    try:
        # Decode leniently: an undecodable byte in the log must not abort
        # metric extraction with a UnicodeDecodeError.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result

    # Detect crash signals in output.
    if "Traceback" in content or "FAIL" in content or "Error" in content:
        result.crashed = True
        lines = content.strip().splitlines()
        result.error_message = "\n".join(lines[-20:])

    for attr, pattern in _PATTERNS.items():
        match = re.search(pattern, content, re.MULTILINE)
        if match is None:
            continue
        convert = int if attr in _INT_ATTRS else float
        try:
            value = convert(match.group(1))
        except ValueError:
            # A permissive pattern can capture text such as "1.2.3" or "...";
            # treat that like a missing metric rather than crashing.
            continue
        setattr(result, attr, value)

    return result
def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Report which secondary health metrics breach their fixed thresholds.

    Three checks are made, in this order:

    * ``mhc_spectral_norm`` strictly above 2.0,
    * ``engram_hit_rate`` strictly between 0 and 0.1,
    * ``mfu_percent`` strictly between 0 and 10.

    A value of exactly 0 never triggers the two lower-bound checks
    (presumably because 0 is the default for a metric that was not reported).

    Args:
        result: Parsed experiment result.

    Returns:
        Human-readable alarm strings in the order above; empty when no
        threshold is breached.
    """

    def _triggered():
        # Messages are produced lazily so each one is only formatted when
        # its own condition holds.
        if result.mhc_spectral_norm > 2.0:
            yield f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)"

        hit_rate = result.engram_hit_rate
        if 0 < hit_rate < 0.1:
            yield f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"

        utilisation = result.mfu_percent
        if 0 < utilisation < 10:
            yield f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"

    return list(_triggered())
def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
    can reject an otherwise-improving result. Only two gates are evaluated:
    ``"max"`` on ``mhc_spectral_norm`` and ``"min"`` on ``engram_hit_rate``;
    any other entries in ``gates`` are ignored.

    NaN metrics fail closed: a NaN ``val_bpb`` is reported as invalid, and a
    NaN gated metric is treated as violating its gate.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to threshold dict with
            ``"max"`` or ``"min"`` keys, e.g.
            ``{"mhc_spectral_norm": {"max": 2.0}}``. A metric mapped to
            ``None`` is treated as having no threshold.

    Returns:
        Tuple of (keep: bool, reason: str). ``reason`` is ``"crash"``,
        ``"invalid val_bpb"``, ``"discard"``, a gate-violation message, or
        ``"keep"``.
    """
    if result.crashed:
        return False, "crash"

    # Phrased as ``not (x > 0)`` because every comparison with NaN is False;
    # ``x <= 0`` would let a NaN val_bpb fall through to "keep".
    if not result.val_bpb > 0:
        return False, "invalid val_bpb"

    if result.val_bpb >= best_bpb:
        return False, "discard"

    # Secondary gate checks.
    if gates:
        # ``or {}`` tolerates a metric explicitly mapped to None.
        gate_mhc = (gates.get("mhc_spectral_norm") or {}).get("max")
        if gate_mhc is not None and not result.mhc_spectral_norm <= gate_mhc:
            return (
                False,
                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
            )

        gate_engram = (gates.get("engram_hit_rate") or {}).get("min")
        if gate_engram is not None and not result.engram_hit_rate >= gate_engram:
            return (
                False,
                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
            )

    return True, "keep"