Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

File size: 5,353 Bytes

2c30a29

"""Eval agent: parse run.log and extract metrics from training runs."""
import re
from dataclasses import dataclass, field


@dataclass
class ExperimentResult:
    """Metrics and status parsed from one experiment's run log.

    Every numeric field starts at zero (``0.0`` for floats, ``0`` for
    ints), so a field that was never populated is indistinguishable from
    a genuine zero. The ``crashed`` flag is set when the log indicates a
    failure or the log file is missing entirely.

    Attributes:
        val_bpb: Primary metric of the run.
        training_seconds, total_seconds: Timing figures.
        peak_vram_mb, mfu_percent: Hardware figures.
        total_tokens_m, num_steps: Throughput figures.
        num_params_m, n_layer, d_model: Model shape, as echoed by the
            train.py summary block.
        mhc_spectral_norm, engram_hit_rate, sr_bypass_rate: Secondary
            health metrics.
        crashed: Status flag; True when the run is considered failed.
        error_message: Diagnostic text for a failed run; empty by default.
    """

    val_bpb: float = 0.0

    training_seconds: float = 0.0
    total_seconds: float = 0.0

    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    total_tokens_m: float = 0.0
    num_steps: int = 0

    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    crashed: bool = False
    error_message: str = ""


# Capture group for float-valued summary fields: a run of digits and dots,
# optionally followed by an exponent. Without the exponent part a value
# printed in scientific notation (e.g. ``1.5e-03``) would be cut at the "e"
# and silently parsed as ``1.5``.
_FLOAT_VALUE: str = r"([\d.]+(?:[eE][-+]?\d+)?)"

# Capture group for integer-valued summary fields: a plain run of digits.
_INT_VALUE: str = r"(\d+)"

# Regex patterns keyed by ExperimentResult attribute name.
# Format must match the ``--- Summary ---`` block printed by train.py.
# Each pattern is anchored with ``^`` and is meant to be used with
# ``re.MULTILINE`` so the key must sit at the start of a log line.
# Note that two log keys end in a capital ``M`` (``total_tokens_M``,
# ``num_params_M``) while the attribute names are all lower-case.
_PATTERNS: dict[str, str] = {
    "val_bpb": rf"^val_bpb:\s+{_FLOAT_VALUE}",
    "training_seconds": rf"^training_seconds:\s+{_FLOAT_VALUE}",
    "total_seconds": rf"^total_seconds:\s+{_FLOAT_VALUE}",
    "peak_vram_mb": rf"^peak_vram_mb:\s+{_FLOAT_VALUE}",
    "mfu_percent": rf"^mfu_percent:\s+{_FLOAT_VALUE}",
    "total_tokens_m": rf"^total_tokens_M:\s+{_FLOAT_VALUE}",
    "num_steps": rf"^num_steps:\s+{_INT_VALUE}",
    "num_params_m": rf"^num_params_M:\s+{_FLOAT_VALUE}",
    "n_layer": rf"^n_layer:\s+{_INT_VALUE}",
    "d_model": rf"^d_model:\s+{_INT_VALUE}",
    "mhc_spectral_norm": rf"^mhc_spectral_norm:\s+{_FLOAT_VALUE}",
    "engram_hit_rate": rf"^engram_hit_rate:\s+{_FLOAT_VALUE}",
    "sr_bypass_rate": rf"^sr_bypass_rate:\s+{_FLOAT_VALUE}",
}

# Attributes that should be parsed as int rather than float.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})


def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    The log is read as UTF-8 with undecodable bytes replaced, so stray
    binary output in the log cannot abort parsing. Each pattern in
    ``_PATTERNS`` is searched in multi-line mode and the first occurrence
    in the file is used. Attributes whose pattern does not match, or
    whose captured text cannot be converted to a number, keep their
    dataclass defaults.

    Args:
        log_path: Absolute path to the run.log file.

    Returns:
        Populated ExperimentResult; sets ``crashed=True`` when the log
        contains a crash marker ("Traceback", "FAIL" or "Error") or the
        file is missing or cannot be read. Metrics are still extracted
        from a log that contains a crash marker.
    """
    result = ExperimentResult()

    try:
        # errors="replace" keeps a single bad byte from raising
        # UnicodeDecodeError and losing the whole run's metrics.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result
    except OSError as exc:
        # Permission problems, a directory in place of the file, etc. are
        # reported the same way as a missing log instead of propagating.
        result.crashed = True
        result.error_message = f"Could not read log file {log_path}: {exc}"
        return result

    # Detect crash signals in output. These are case-sensitive substring
    # checks against the entire log, not just the summary block.
    if "Traceback" in content or "FAIL" in content or "Error" in content:
        result.crashed = True
        lines = content.strip().splitlines()
        # Keep only the tail of the log as the diagnostic message.
        result.error_message = "\n".join(lines[-20:])

    for attr, pattern in _PATTERNS.items():
        match = re.search(pattern, content, re.MULTILINE)
        if match is None:
            continue
        convert = int if attr in _INT_ATTRS else float
        try:
            value = convert(match.group(1))
        except ValueError:
            # The capture can be malformed (e.g. "1.2.3" or a lone ".");
            # leave the default in place rather than aborting the parse.
            continue
        setattr(result, attr, value)

    return result


def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Collect warnings for secondary metrics that breach fixed limits.

    A spectral norm above 2.0 always raises an alarm. The hit-rate and MFU
    checks fire only for values strictly between zero and their limit, so
    a value of exactly zero never produces an alarm.

    Args:
        result: Parsed experiment result.

    Returns:
        Alarm messages in check order; an empty list when nothing fired.
    """
    spectral_norm = result.mhc_spectral_norm
    hit_rate = result.engram_hit_rate
    mfu = result.mfu_percent

    triggered: list[str] = []

    # Upper bound on the spectral norm.
    if spectral_norm > 2.0:
        triggered.append(f"mhc_spectral_norm={spectral_norm:.4f} > 2.0 (ALARM)")

    # Lower bound on the hit rate; zero is excluded from the check.
    if 0 < hit_rate and hit_rate < 0.1:
        triggered.append(
            f"engram_hit_rate={hit_rate:.4f} < 0.1 (memory underused)"
        )

    # Lower bound on MFU; zero is excluded from the check.
    if 0 < mfu and mfu < 10:
        triggered.append(f"mfu_percent={mfu:.2f}% < 10% (GPU underutilized)")

    return triggered


def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
    can reject an otherwise-improving result.

    Checks run in this order and the first failure wins: crash flag,
    val_bpb validity (must be a positive number; NaN is rejected),
    improvement over ``best_bpb``, then the secondary gates.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to threshold dict with
               ``"max"`` or ``"min"`` keys, e.g.
               ``{"mhc_spectral_norm": {"max": 2.0}}``. Only the ``"max"``
               bound of ``mhc_spectral_norm`` and the ``"min"`` bound of
               ``engram_hit_rate`` are enforced; other entries are
               ignored. An entry whose value is ``None`` or empty is
               treated as "no gate".

    Returns:
        Tuple of (keep: bool, reason: str).
    """
    if result.crashed:
        return False, "crash"
    # Written as ``not (x > 0)`` rather than ``x <= 0`` so that NaN, which
    # compares False against everything, is rejected here instead of
    # slipping past both this check and the ``>= best_bpb`` check below.
    if not result.val_bpb > 0:
        return False, "invalid val_bpb"
    if result.val_bpb >= best_bpb:
        return False, "discard"

    # Secondary gate checks.
    if gates:
        # ``or {}`` tolerates a gate explicitly configured as None.
        gate_mhc = (gates.get("mhc_spectral_norm") or {}).get("max")
        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
            return (
                False,
                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
            )
        gate_engram = (gates.get("engram_hit_rate") or {}).get("min")
        if gate_engram is not None and result.engram_hit_rate < gate_engram:
            return (
                False,
                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
            )

    return True, "keep"