File size: 17,381 Bytes
22741d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/env python3
"""Continuous Feather autoresearch loop for local RTX 3060.

Protocol:
- One GPU owner, sequential runs only.
- 300s training budget, redirected logs.
- Parse val_bpb / metrics JSON from disk.
- Append TSV ledger.
- Keep searching until hard gate is reached or process is killed.

This loop mutates runtime env first because current Feather exposes most active
architecture/optimizer knobs through HYDRA_* gates. Code edits can be added as
candidate generators after the env frontier is exhausted.
"""
from __future__ import annotations

import itertools
import json
import os
import re
import shlex
import subprocess
import time
from pathlib import Path

# Working tree that holds train.py; per-run logs and the TSV ledger live under it.
ROOT = Path('/home/mikeb/work/feather')
LOGDIR = ROOT.joinpath('logs', 'autoresearch_may03')
LEDGER = ROOT.joinpath('autoresearch_may03_results.tsv')
# Hard gate: the search stops once the best val_bpb is at or below this value.
TARGET_BPB = float(os.getenv('AUTORESEARCH_TARGET_BPB', '1.60'))
# Strict autoresearch cadence: train.py gets HYDRA_TIME_BUDGET=300; wrapper only
# allows startup + final eval overhead. Do not let one candidate occupy the GPU
# for 10-12 minutes unless it is genuinely hung.
RUN_TIMEOUT = int(os.getenv('AUTORESEARCH_RUN_TIMEOUT', '430'))

# Import-time side effects: make sure the log directory exists and that a new
# ledger starts with its column header row.
LOGDIR.mkdir(parents=True, exist_ok=True)
if not LEDGER.exists():
    LEDGER.write_text(
        'ts\tcommit\tcandidate\tval_bpb\tpeak_tps\tmedian_tps\tmemory_gb\tstatus\tdescription\tlog\n'
    )

# Baseline environment applied to every training run. run_candidate() copies
# the inherited process environment, overlays this table, and then overlays
# the candidate's own delta, so a candidate only lists the keys it changes.
# All values are strings because they are passed as environment variables.
BASE = {
    'LD_LIBRARY_PATH': '/usr/lib/wsl/lib:/usr/local/cuda/lib64',
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    # Tokens are overwritten with empty strings for the child process
    # (presumably to keep runs from authenticating to the hub — confirm).
    'HF_TOKEN': '',
    'HUGGINGFACE_HUB_TOKEN': '',
    'WANDB_DISABLED': 'true',
    'HYDRA_USE_NEMOTRON': '1',
    'HYDRA_USE_FULL_BLEND': '1',
    'HYDRA_SAMPLED_SOFTMAX': '1024',
    'HYDRA_SOFTCAP_CLAMP': '1',
    'HYDRA_SEQ_LEN': '1024',
    'HYDRA_HEADDIM': '32',
    'HYDRA_EXPAND': '3',
    'HYDRA_BATCH_SIZE': '8',
    'HYDRA_TOTAL_BATCH': '16384',
    'HYDRA_D_MODEL': '160',
    'HYDRA_N_LAYER': '20',
    'HYDRA_D_STATE': '64',
    # Training budget in seconds; the wrapper timeout (RUN_TIMEOUT) is set
    # higher than this to leave room for startup and the final eval.
    'HYDRA_TIME_BUDGET': '300',
    'HYDRA_ENGRAM_N_COLUMNS': '16384',
    'HYDRA_ENGRAM_TOPK': '64',
    'HYDRA_GDN_LAYERS': '',
    'HYDRA_MTP_K': '1',
    'HYDRA_USE_MDLM': '0',
    'HYDRA_MUON_COMPILE': '0',
    'HYDRA_MUON_NS_STEPS': '2',  # promoted from TPS-11 receipt
    'HYDRA_MATRIX_LR': '0.04',
    'HYDRA_EMBED_LR': '0.6',
    'HYDRA_UNEMBED_LR': '0.004',
    'HYDRA_DT_BIAS_LR': '0.6',
    'HYDRA_LOCAL_SHARDS_ONLY': '1',
    'HYDRA_BACKGROUND_PREFETCH': '0',
    'HYDRA_STREAM_SHUFFLE_BUFFER': '256',
    'HYDRA_STREAM_PREFETCH': '16',
    'HYDRA_TOKEN_PREFETCH': '4',
    'HYDRA_TOKEN_CACHE_GB': '1',
    'HYDRA_CKPT_INTERVAL': '2000',
    'HYDRA_MID_VAL_INTERVAL': '0',
    'HYDRA_HTM_SUBSAMPLE': '128',
    'HYDRA_EVAL_BATCH': '1',
    # HYDRA_EVAL_TOKENS removed (audit 2026-05-09, issue #15): the previous
    # 1024-token eval reduced "20% factual" to a coin flip — every digit of
    # quality signal we logged was within sampling noise. Defer to the
    # prepare.EVAL_TOKENS default (~21M) or the 5M floor in eval_quality.py.
    'HYDRA_CE_CHUNK': '32',
    'HYDRA_SKIP_FACTUAL_EVAL': '1',
    'HYDRA_RESUME_CKPT': 'none',
    'UV_PYTHON': '/usr/bin/python3',
}

# Ordered from lowest-risk/promising to wider/radical. Infinite outer loop will
# revisit with perturbations after first pass.
#
# Each entry is (name, env_delta, description):
#   - name: unique key written to the ledger's `candidate` column; a name that
#     already appears in the ledger is skipped, so renaming an entry makes it
#     run again.
#   - env_delta: environment overrides layered on top of BASE for that run.
#   - description: free-text note written to the ledger and the run log.
CANDIDATES: list[tuple[str, dict[str, str], str]] = [
    # Plateau-escape candidates: stronger than tiny LR nudges. These attack
    # the 5-minute validation plateau by changing effective optimization,
    # temporal capacity, and memory pressure while keeping full architecture.
    # Real z-loss axis was tested after wiring fix: z=0.001 regressed
    # (2.0446 vs best 2.0237). Return to default z=1e-4 and mutate the
    # discovered l16/d192 basin more aggressively.
    ('basin_l16d192_lr085_emb11', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.085','HYDRA_EMBED_LR':'1.1'}, 'basin: l16d192 hotter LR default z'),
    ('basin_l16d192_lr10_emb13', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.10','HYDRA_EMBED_LR':'1.3'}, 'basin: l16d192 max hot LR default z'),
    ('basin_l16d192_lr065_emb09', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.065','HYDRA_EMBED_LR':'0.9'}, 'basin: l16d192 moderate LR default z'),
    ('basin_l16d192_ns1p5_nope_ns2_fasttb', {'HYDRA_TOTAL_BATCH':'24576','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 TB24576 more updates default z'),
    ('basin_l16d192_dstate48', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_D_STATE':'48','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 smaller d_state faster updates'),
    ('basin_l16d192_dstate80', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_D_STATE':'80','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 d_state80 capacity'),
    ('basin_l18d160_hot_defaultz', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'18','HYDRA_D_MODEL':'160','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: valid deeper l18d160 default z'),
    # High-leverage evolutionary front around the discovered winner l16/d192.
    # This is no longer tiny-knob search: change shape + optimizer together.
    ('evo_l16d192_lr075_10', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'evo: l16d192 with hotter LR for 300s descent'),
    ('evo_l16d192_lr05_07', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.05','HYDRA_EMBED_LR':'0.7'}, 'evo: l16d192 slightly cooler stability'),
    ('evo_l16d208', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'208','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16 wider d208'),
    ('evo_l14d224', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'14','HYDRA_D_MODEL':'224','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l14 d224 speed/capacity trade'),
    ('evo_l12d256', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'12','HYDRA_D_MODEL':'256','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l12 d256 wide-frontier probe'),
    ('evo_l10d288', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'10','HYDRA_D_MODEL':'288','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l10 d288 radical width probe'),
    ('evo_l16d192_k768', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_SAMPLED_SOFTMAX':'768','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 lower sampled softmax for more updates'),
    ('evo_l16d192_k512', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_SAMPLED_SOFTMAX':'512','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 K512 throughput/calibration probe'),
    ('evo_l16d192_tb16384', {'HYDRA_TOTAL_BATCH':'16384','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 smaller TB more optimizer steps'),
    ('escape_tb32768_z001_ns2_lr_hi', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'plateau escape: faster 300s descent with champion TB/zloss'),
    ('escape_tb32768_z001_ns2_lr_lo', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.025','HYDRA_EMBED_LR':'0.45'}, 'plateau escape: lower LR calibration'),
    ('escape_tb32768_ns2_dstate96', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_D_STATE':'96'}, 'plateau escape: extra SSM state capacity'),
    ('escape_tb32768_ns2_l18_d176', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'18','HYDRA_D_MODEL':'176'}, 'plateau escape: trade depth for width at similar budget'),
    ('escape_tb32768_ns2_l16_d192', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192'}, 'plateau escape: stronger width trade'),
    ('escape_tb32768_ns2_gdn3', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_GDN_LAYERS':'3,7,11'}, 'plateau escape: reintroduce known GDN quality axis'),
    ('escape_tb32768_ns2_gdn5', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_GDN_LAYERS':'0,4,8,12,16'}, 'plateau escape: distributed 5-GDN quality axis'),
    ('escape_tb32768_ns2_enk128', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_ENGRAM_TOPK':'128'}, 'plateau escape: wider engram read'),
    ('escape_tb32768_ns2_dr64', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_SDR_DELTA_RANK':'64'}, 'plateau escape: wider SDR STE pipe despite prior weak amp'),
    ('escape_tb32768_ns3_lr_hi', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'plateau escape: stable NS3 plus faster LR'),
    ('ns2_lr_m003', {'HYDRA_MATRIX_LR':'0.03'}, 'slightly lower matrix LR stabilizer'),
    ('ns2_lr_m005', {'HYDRA_MATRIX_LR':'0.05'}, 'slightly higher matrix LR for faster 300s descent'),
    ('ns2_embed04', {'HYDRA_EMBED_LR':'0.4'}, 'lower embed LR calibration'),
    ('ns2_embed08', {'HYDRA_EMBED_LR':'0.8'}, 'higher embed LR fast lexical fit'),
    ('ns2_dt03', {'HYDRA_DT_BIAS_LR':'0.3'}, 'lower dt-bias LR stability'),
    ('ns2_dt10', {'HYDRA_DT_BIAS_LR':'1.0'}, 'higher dt-bias adaptation'),
    ('ns2_dstate96', {'HYDRA_D_STATE':'96'}, 'more SSM state capacity'),
    ('ns2_dstate128', {'HYDRA_D_STATE':'128'}, 'max SSM state capacity probe'),
    ('ns2_enk128', {'HYDRA_ENGRAM_TOPK':'128'}, 'wider engram retrieval'),
    ('ns2_enk32', {'HYDRA_ENGRAM_TOPK':'32'}, 'narrower engram retrieval / less noise'),
    ('ns2_htm64', {'HYDRA_HTM_SUBSAMPLE':'64'}, 'more frequent HTM update'),
    ('ns2_htm256', {'HYDRA_HTM_SUBSAMPLE':'256'}, 'less HTM overhead/noise'),
    ('ns2_gdn_3_7_11', {'HYDRA_GDN_LAYERS':'3,7,11'}, 'retest 3-GDN trend on NS2'),
    ('ns2_gdn_0_4_8_12_16', {'HYDRA_GDN_LAYERS':'0,4,8,12,16'}, '5-GDN distributed depth'),
    ('ns2_gdn_0_1_2', {'HYDRA_GDN_LAYERS':'0,1,2'}, 'early GDN locality'),
    ('ns2_l18', {'HYDRA_N_LAYER':'18'}, 'shallower depth for more updates in budget'),
    ('ns2_l22', {'HYDRA_N_LAYER':'22'}, 'deeper temporal hierarchy if fits'),
    ('ns2_d176', {'HYDRA_D_MODEL':'176'}, 'slightly wider model'),
    ('ns2_d192', {'HYDRA_D_MODEL':'192'}, 'wider model capacity probe'),
    ('ns3_gdn_3_7_11', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_GDN_LAYERS':'3,7,11'}, 'known GDN axis with stable Muon NS3'),
    ('ns3_tb32768_z001', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001'}, 'champion-ish optimizer defaults'),
]

# Patterns used to scrape training logs.
# NOTE: `[0-9.]+` accepts any run of digits and dots, so a capture is not
# guaranteed to be a valid float literal (e.g. "1.2." would match).

# Per-step progress line: must start a line with `step=<digits> ` (re.M makes
# `^` match at every line start). Group 1 is the first `bpb=` number on that
# line, group 2 the first `tps=` number after it.
STEP_RE = re.compile(r'^step=\d+ .*?bpb=([0-9.]+).*?tps=([0-9.]+)', re.M)
# Validation result: group 1 is the number following `val_bpb:`.
VAL_RE = re.compile(r'val_bpb:\s*([0-9.]+)')
# Metrics blob: group 1 spans from the first `{` after the `[METRICS_JSON]`
# tag to the last `}` on the same line (greedy, `.` does not cross newlines).
METRICS_RE = re.compile(r'\[METRICS_JSON\]\s*(\{.*\})')


def current_commit() -> str:
    """Return the abbreviated hash of HEAD for the git repository at ROOT.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    git_cmd = ['git', 'rev-parse', '--short', 'HEAD']
    finished = subprocess.run(
        git_cmd, cwd=ROOT, stdout=subprocess.PIPE, text=True, check=True
    )
    return finished.stdout.strip()


def completed_names() -> set[str]:
    """Return every candidate name already recorded in the ledger.

    The first ledger line is treated as the header and skipped. A row counts
    regardless of its status column, so crashed runs are considered done too.
    Returns an empty set when the ledger file does not exist.
    """
    if not LEDGER.exists():
        return set()
    data_rows = LEDGER.read_text(errors='ignore').splitlines()[1:]
    columns_per_row = (row.split('\t') for row in data_rows)
    # Column index 2 is the candidate name; shorter rows are ignored.
    return {cols[2] for cols in columns_per_row if len(cols) >= 3}


def best_seen() -> float:
    """Return the lowest positive val_bpb known so far, or 999.0 if none.

    Two sources are consulted:
      * the TSV ledger (fourth column of every data row), and
      * a fixed list of one-off receipt logs under ROOT, scanned with VAL_RE.

    Values from both sources get the same treatment: text that is not a
    valid float is skipped, and only values strictly greater than zero count
    (the ledger stores 0.0 for runs that produced no validation result).
    """
    raw_values: list[str] = []

    # Parse the TSV ledger first. Its rows are not `val_bpb:` log lines.
    if LEDGER.exists():
        for line in LEDGER.read_text(errors='ignore').splitlines()[1:]:
            parts = line.split('\t')
            if len(parts) >= 4:
                raw_values.append(parts[3])

    # Also seed from known one-off receipts.
    receipts = (
        ROOT / 'run_tps11_ns2.log',
        ROOT / 'run_tps7_bs10.log',
        ROOT / 'run_tps1_htm256.log',
    )
    for path in receipts:
        if not path.exists():
            continue
        txt = path.read_text(errors='ignore')
        raw_values.extend(m.group(1) for m in VAL_RE.finditer(txt))

    best = 999.0
    for raw in raw_values:
        try:
            value = float(raw)
        except ValueError:
            # VAL_RE can capture digit/dot runs such as "2.02." that are not
            # floats; one bad receipt line must not stop the loop at startup.
            continue
        if value > 0:
            best = min(best, value)
    return best


def _parsable_floats(captures):
    """Convert regex captures to floats, dropping text that is not a float."""
    numbers = []
    for text in captures:
        try:
            numbers.append(float(text))
        except ValueError:
            # The log patterns capture `[0-9.]+`, which also matches things
            # like "2.02." or "..."; skip those instead of raising.
            continue
    return numbers


def parse_log(path: Path):
    """Extract the result of one training run from its log file.

    Args:
        path: log file to read; a missing file is treated as an empty log.

    Returns:
        A tuple ``(val_bpb, peak_tps, median_tps, memory_gb, status, metrics)``:
          * val_bpb: last parsable ``val_bpb:`` value, or 0.0 if there is none.
          * peak_tps / median_tps: max and (upper) median of the positive
            ``tps=`` values on ``step=`` lines, or 0.0 if there are none.
          * memory_gb: ``peak_vram_mb`` from the last ``[METRICS_JSON]`` blob
            divided by 1024, or 0.0 when unavailable.
          * status: ``'ok'`` when a val_bpb was found, otherwise
            ``'crash_oom'``, ``'crash'`` or ``'no_val'`` based on log text.
          * metrics: the decoded metrics dict, or None.

    Malformed numbers or JSON in the log never raise: the caller runs this
    after a full training run, and an exception here would lose the ledger row.
    """
    txt = path.read_text(errors='ignore') if path.exists() else ''

    vals = _parsable_floats(VAL_RE.findall(txt))
    step_tps = _parsable_floats(tps_text for _bpb_text, tps_text in STEP_RE.findall(txt))
    tps = [v for v in step_tps if v > 0]
    peak_tps = max(tps) if tps else 0.0
    med_tps = sorted(tps)[len(tps) // 2] if tps else 0.0

    mem_gb = 0.0
    metrics = None
    blobs = METRICS_RE.findall(txt)
    if blobs:
        # Only the last metrics blob is used.
        try:
            metrics = json.loads(blobs[-1])
        except ValueError:
            # json.JSONDecodeError is a ValueError; treat as "no metrics".
            metrics = None
        if metrics is not None:
            try:
                mem_gb = float(metrics.get('peak_vram_mb', 0.0)) / 1024.0
            except (TypeError, ValueError, OverflowError):
                # Non-numeric peak_vram_mb: keep the metrics, report 0 GB.
                mem_gb = 0.0

    if vals:
        return vals[-1], peak_tps, med_tps, mem_gb, 'ok', metrics
    # The case-insensitive test already covers every lower-case
    # "out of memory" variant, including the CUDA driver message.
    if 'out of memory' in txt.lower() or 'OutOfMemory' in txt:
        return 0.0, peak_tps, med_tps, mem_gb, 'crash_oom', metrics
    if 'Traceback' in txt or 'RuntimeError' in txt or 'AssertionError' in txt:
        return 0.0, peak_tps, med_tps, mem_gb, 'crash', metrics
    return 0.0, peak_tps, med_tps, mem_gb, 'no_val', metrics


def append(row: list[str]) -> None:
    """Append one record to the ledger as a single tab-separated line.

    Args:
        row: column values, already formatted as strings.
    """
    record = '\t'.join(row) + '\n'
    with LEDGER.open('a') as ledger_file:
        ledger_file.write(record)


def perturb_candidates(round_idx: int):
    """Yield the perturbation grid for one post-first-pass round.

    The grid is the full cross product of 6 matrix LRs, 4 embed LRs, 4 z-loss
    weights and 4 GDN layer layouts (384 entries), always with
    HYDRA_MUON_NS_STEPS pinned to '2'. The settings are the same for every
    round; ``round_idx`` only changes the generated name, which is what lets
    a later round be scheduled again after an earlier one is in the ledger.

    Args:
        round_idx: round number embedded (zero-padded to 2 digits) in names.

    Yields:
        ``(name, env_delta, description)`` tuples in the same shape as the
        fixed candidate list.
    """
    matrix_lrs = ['0.025', '0.03', '0.035', '0.04', '0.045', '0.05']
    embed_lrs = ['0.45', '0.55', '0.6', '0.7']
    z_weights = ['0.0001', '0.0005', '0.001', '0.002']
    gdn_layouts = ['', '3,7,11', '0,4,8,12,16', '0,1,2']

    grid = itertools.product(matrix_lrs, embed_lrs, z_weights, gdn_layouts)
    for idx, combo in enumerate(grid):
        matrix_lr, embed_lr, z_weight, gdn_layout = combo
        # An empty layout means no GDN layers; show it as "none" in the text.
        gdn_label = gdn_layout or "none"
        env_delta = {
            'HYDRA_MUON_NS_STEPS': '2',
            'HYDRA_MATRIX_LR': matrix_lr,
            'HYDRA_EMBED_LR': embed_lr,
            'HYDRA_Z_LOSS_WEIGHT': z_weight,
            'HYDRA_GDN_LAYERS': gdn_layout,
        }
        description = f'auto grid ns2 mlr={matrix_lr} embed={embed_lr} z={z_weight} gdn={gdn_label}'
        yield f'auto_r{round_idx:02d}_{idx:03d}', env_delta, description


def run_candidate(name: str, delta: dict[str, str], desc: str, best: float):
    """Train one candidate, record the outcome in the ledger, and report it.

    The child runs ``train.py`` from ROOT with the inherited environment,
    overlaid by BASE and then by ``delta``. Its stdout/stderr go to a fresh
    timestamped log file in LOGDIR, which is then parsed for the result.

    Args:
        name: candidate identifier; used in the log file name and ledger.
        delta: environment overrides specific to this candidate.
        desc: human-readable description (tabs are replaced in the ledger).
        best: best val_bpb seen before this run.

    Returns:
        ``(best, status)`` where ``best`` is the new best val_bpb (unchanged
        unless this run beat it) and ``status`` is one of ``'keep'``,
        ``'discard'``, ``'timeout'``, ``'crash_oom'``, ``'crash'``, ``'no_val'``.

    Raises:
        subprocess.CalledProcessError: if the git commit cannot be resolved;
            this happens before any GPU time is spent.
    """
    ts = time.strftime('%Y%m%d_%H%M%S')
    log = LOGDIR / f'{ts}_{name}.log'
    # Resolve the commit up front: it is the code the child is about to run,
    # and a git failure after the run would discard a finished measurement.
    commit = current_commit()
    env = os.environ.copy()
    env.update(BASE)
    env.update(delta)
    cmd = ['taskset','-c','0-15', './.venv/bin/python', '-u', 'train.py']
    print(f'[{time.strftime("%F %T")}] RUN {name} best={best:.6f} desc={desc}', flush=True)
    timed_out = False
    with log.open('w') as f:
        f.write(f'=== {name} ===\n')
        f.write(f'desc={desc}\n')
        f.write('env_delta=' + json.dumps(delta, sort_keys=True) + '\n')
        # Flush the header before the child starts writing to the same file.
        f.flush()
        try:
            rc = subprocess.run(cmd, cwd=ROOT, env=env, stdout=f, stderr=subprocess.STDOUT, timeout=RUN_TIMEOUT).returncode
        except subprocess.TimeoutExpired:
            timed_out = True
            f.write('\n[TIMEOUT]\n')
        else:
            # Keep the exit code as a receipt; it is not otherwise recorded.
            f.write(f'\n[EXIT rc={rc}]\n')
    val, peak, med, mem, status0, metrics = parse_log(log)
    if status0 == 'ok':
        status = 'keep' if val < best else 'discard'
    elif timed_out and status0 == 'no_val':
        # Distinguish "killed by the wrapper timeout" from "exited silently".
        status = 'timeout'
    else:
        status = status0
    append([
        time.strftime('%F_%T'), commit, name, f'{val:.6f}', f'{peak:.0f}', f'{med:.0f}', f'{mem:.2f}', status, desc.replace('\t',' '), str(log)
    ])
    print(f'[{time.strftime("%F %T")}] DONE {name} val={val:.6f} peak={peak:.0f} med={med:.0f} mem={mem:.2f} status={status} log={log}', flush=True)
    return val if status == 'keep' else best, status


def main():
    """Run candidates sequentially until the hard gate is reached.

    Round 0 walks the fixed CANDIDATES list; every later round walks the
    generated perturbation grid. Candidates whose name is already in the
    ledger are skipped. The loop returns when the best val_bpb is at or below
    TARGET_BPB, including when that is already true at startup.

    With ``AUTORESEARCH_ONE_SHOT=1`` the process runs at most one candidate
    and exits. Each invocation starts again at round 0, so when a round has
    nothing left to run the search moves on to the next round within the
    same invocation; otherwise the perturbation grid would never be reached
    in one-shot mode.
    """
    best = best_seen()
    one_shot = os.environ.get('AUTORESEARCH_ONE_SHOT', '0') == '1'
    print(f'START autoresearch may03 best_seen={best:.6f} target={TARGET_BPB:.6f} one_shot={one_shot}', flush=True)
    if best <= TARGET_BPB:
        # Gate already satisfied by earlier results: do not spend GPU time.
        print(f'HARDGATE_REACHED best={best:.6f} target={TARGET_BPB:.6f}', flush=True)
        return
    round_idx = 0
    done = completed_names()
    while True:
        stream = CANDIDATES if round_idx == 0 else perturb_candidates(round_idx)
        for name, delta, desc in stream:
            if name in done:
                print(f'[{time.strftime("%F %T")}] SKIP {name} already ledgered', flush=True)
                continue
            best, _status = run_candidate(name, delta, desc, best)
            done.add(name)
            if best <= TARGET_BPB:
                print(f'HARDGATE_REACHED best={best:.6f} target={TARGET_BPB:.6f}', flush=True)
                return
            # Let CUDA/WSL settle and reduce fragmentation.
            # NOTE(review): this starts a separate interpreter, so the
            # empty_cache() call presumably only affects that short-lived
            # process rather than memory of the finished run — confirm it is
            # still wanted beyond acting as a settle delay.
            subprocess.run(['bash','-lc','python3 - <<"PY"\nimport torch\ntorch.cuda.empty_cache() if torch.cuda.is_available() else None\nPY'], cwd=ROOT, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            if one_shot:
                print(f'ONE_SHOT_DONE best={best:.6f}', flush=True)
                return
            time.sleep(10)
        if one_shot and round_idx == 0:
            # No remaining unledgered candidates in the fixed queue; fall
            # through to the perturbation rounds so this tick still runs one
            # candidate instead of exiting without doing any work.
            print(f'ONE_SHOT_NO_FIXED_CANDIDATE best={best:.6f}', flush=True)
        round_idx += 1

# Start the search loop only when executed as a script, not when imported.
if __name__ == '__main__':
    main()