File size: 5,353 Bytes
2c30a29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""Eval agent: parse run.log and extract metrics from training runs."""
import re
from dataclasses import dataclass, field


@dataclass
class ExperimentResult:
    """Parsed result from a single experiment run.

    All float fields default to 0.0; integer fields default to 0.
    ``parse_run_log`` only overwrites a field when the matching summary
    line is found in the log, so a value of 0 can also mean "not reported".
    The ``crashed`` flag is set when the log indicates a failure or the
    log file is missing entirely.
    """

    # Primary metric. ``should_keep`` rejects values <= 0 as invalid and
    # keeps a run only when this is strictly lower than the best so far.
    val_bpb: float = 0.0

    # Timing (presumably seconds, going by the names -- confirm against
    # train.py).
    training_seconds: float = 0.0
    total_seconds: float = 0.0

    # Hardware. ``check_secondary_alarms`` warns when mfu_percent is
    # strictly between 0 and 10.
    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    # Throughput. Parsed from the ``total_tokens_M`` log key (note the
    # capital M in the log versus the lowercase attribute name).
    total_tokens_m: float = 0.0
    num_steps: int = 0

    # Model shape (echoed by train.py summary block). ``num_params_m`` is
    # parsed from the ``num_params_M`` log key.
    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    # Secondary health metrics. mhc_spectral_norm and engram_hit_rate feed
    # the alarm check and the optional gates in ``should_keep``;
    # sr_bypass_rate is parsed but not consulted by either of those checks.
    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    # Status. On a detected crash ``error_message`` holds the last 20 log
    # lines; when the file is missing it holds a "Log file not found" note.
    crashed: bool = False
    error_message: str = ""


# Capture group for one decimal number: optional sign, digits with at most
# one decimal point, optional exponent.  The looser ``[\d.]+`` accepted text
# such as "..." or "1.2.3" (which float() rejects with ValueError) and read
# scientific notation like "1e-05" as just "1".
_FLOAT: str = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"

# Regex patterns keyed by ExperimentResult attribute name.
# Format must match the ``--- Summary ---`` block printed by train.py.
# Every pattern is anchored with ``^`` and therefore has to be searched with
# re.MULTILINE; the numeric value is always capture group 1.
_PATTERNS: dict[str, str] = {
    "val_bpb": rf"^val_bpb:\s+{_FLOAT}",
    "training_seconds": rf"^training_seconds:\s+{_FLOAT}",
    "total_seconds": rf"^total_seconds:\s+{_FLOAT}",
    "peak_vram_mb": rf"^peak_vram_mb:\s+{_FLOAT}",
    "mfu_percent": rf"^mfu_percent:\s+{_FLOAT}",
    "total_tokens_m": rf"^total_tokens_M:\s+{_FLOAT}",
    "num_steps": r"^num_steps:\s+(\d+)",
    "num_params_m": rf"^num_params_M:\s+{_FLOAT}",
    "n_layer": r"^n_layer:\s+(\d+)",
    "d_model": r"^d_model:\s+(\d+)",
    "mhc_spectral_norm": rf"^mhc_spectral_norm:\s+{_FLOAT}",
    "engram_hit_rate": rf"^engram_hit_rate:\s+{_FLOAT}",
    "sr_bypass_rate": rf"^sr_bypass_rate:\s+{_FLOAT}",
}

# Attributes that should be parsed as int rather than float.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})


def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    Every pattern in ``_PATTERNS`` is searched for in the log text in
    multi-line mode and the first match, if any, is converted (``int`` for
    the attributes in ``_INT_ATTRS``, ``float`` otherwise) and stored on the
    result.  Metrics that are absent from the log, or whose captured text is
    not a valid number, keep their dataclass default.

    Args:
        log_path: Path to the run.log file.

    Returns:
        Populated ExperimentResult.  ``crashed`` is True when the file is
        missing, or when the log text contains any of the case-sensitive
        substrings ``"Traceback"``, ``"FAIL"`` or ``"Error"``; in the latter
        case ``error_message`` holds the last 20 lines of the log.  Metrics
        are still extracted from a log that was flagged as crashed.
    """
    result = ExperimentResult()

    try:
        # Pin the encoding and substitute undecodable bytes so that the
        # platform locale or stray binary output cannot abort parsing with
        # a UnicodeDecodeError.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result

    # Detect crash signals in output.  This is a plain substring heuristic,
    # so any log line containing one of these markers flags the run.
    if any(marker in content for marker in ("Traceback", "FAIL", "Error")):
        result.crashed = True
        lines = content.strip().splitlines()
        result.error_message = "\n".join(lines[-20:])

    for attr, pattern in _PATTERNS.items():
        match = re.search(pattern, content, re.MULTILINE)
        if match is None:
            continue
        convert = int if attr in _INT_ATTRS else float
        try:
            value = convert(match.group(1))
        except ValueError:
            # The capture looked numeric to the regex but is not a valid
            # number (e.g. "1.2.3"); treat the metric as not reported.
            continue
        setattr(result, attr, value)

    return result


def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Collect warnings for secondary metrics outside their fixed bounds.

    Three checks are evaluated, and reported in this order:

    * ``mhc_spectral_norm`` greater than 2.0,
    * ``engram_hit_rate`` strictly between 0 and 0.1,
    * ``mfu_percent`` strictly between 0 and 10.

    A value of exactly 0 never trips the two lower-bound checks.

    Args:
        result: Parsed experiment result.

    Returns:
        Human-readable alarm strings, one per tripped check; an empty list
        when nothing is out of bounds.
    """
    spectral = result.mhc_spectral_norm
    hit_rate = result.engram_hit_rate
    mfu = result.mfu_percent

    # (tripped?, message) pairs; tuple order fixes the order of the output.
    checks = (
        (
            spectral > 2.0,
            f"mhc_spectral_norm={spectral:.4f} > 2.0 (ALARM)",
        ),
        (
            0 < hit_rate < 0.1,
            f"engram_hit_rate={hit_rate:.4f} < 0.1 (memory underused)",
        ),
        (
            0 < mfu < 10,
            f"mfu_percent={mfu:.2f}% < 10% (GPU underutilized)",
        ),
    )
    return [message for tripped, message in checks if tripped]


def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
    can reject an otherwise-improving result.

    Checks run in this order, and the first failure decides the reason:
    crash, invalid val_bpb (zero, negative or NaN), no improvement over
    ``best_bpb``, then the secondary gates.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to threshold dict with
               ``"max"`` or ``"min"`` keys, e.g.
               ``{"mhc_spectral_norm": {"max": 2.0}}``.  Only the ``"max"``
               gate on ``mhc_spectral_norm`` and the ``"min"`` gate on
               ``engram_hit_rate`` are enforced; other entries are ignored.
               A metric whose entry is missing or ``None`` is not gated.

    Returns:
        Tuple of (keep: bool, reason: str).  ``reason`` is ``"keep"``,
        ``"crash"``, ``"invalid val_bpb"``, ``"discard"``, or a message
        naming the gate that was violated.
    """
    if result.crashed:
        return False, "crash"
    # Written as ``not (x > 0)`` instead of ``x <= 0`` so that NaN, for which
    # every comparison is False, is rejected here rather than slipping past
    # both this check and the improvement check below.
    if not result.val_bpb > 0:
        return False, "invalid val_bpb"
    if result.val_bpb >= best_bpb:
        return False, "discard"

    # Secondary gate checks.
    if gates:
        # ``or {}`` tolerates an entry that is present but set to None.
        gate_mhc = (gates.get("mhc_spectral_norm") or {}).get("max")
        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
            return (
                False,
                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
            )
        gate_engram = (gates.get("engram_hit_rate") or {}).get("min")
        if gate_engram is not None and result.engram_hit_rate < gate_engram:
            return (
                False,
                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
            )

    return True, "keep"