#!/usr/bin/env python3 from __future__ import annotations """Continuous Feather autoresearch loop for local RTX 3060. Protocol: - One GPU owner, sequential runs only. - 300s training budget, redirected logs. - Parse val_bpb / metrics JSON from disk. - Append TSV ledger. - Keep searching until hard gate is reached or process is killed. This loop mutates runtime env first because current Feather exposes most active architecture/optimizer knobs through HYDRA_* gates. Code edits can be added as candidate generators after the env frontier is exhausted. """ import itertools import json import os import re import shlex import subprocess import time from pathlib import Path ROOT = Path('/home/mikeb/work/feather') LOGDIR = ROOT / 'logs' / 'autoresearch_may03' LEDGER = ROOT / 'autoresearch_may03_results.tsv' TARGET_BPB = float(os.environ.get('AUTORESEARCH_TARGET_BPB', '1.60')) # Strict autoresearch cadence: train.py gets HYDRA_TIME_BUDGET=300; wrapper only # allows startup + final eval overhead. Do not let one candidate occupy the GPU # for 10-12 minutes unless it is genuinely hung. RUN_TIMEOUT = int(os.environ.get('AUTORESEARCH_RUN_TIMEOUT', '430')) LOGDIR.mkdir(parents=True, exist_ok=True) if not LEDGER.exists(): LEDGER.write_text('ts\tcommit\tcandidate\tval_bpb\tpeak_tps\tmedian_tps\tmemory_gb\tstatus\tdescription\tlog\n') BASE = { 'LD_LIBRARY_PATH': '/usr/lib/wsl/lib:/usr/local/cuda/lib64', 'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True', 'HF_TOKEN': '', 'HUGGINGFACE_HUB_TOKEN': '', 'WANDB_DISABLED': 'true', 'HYDRA_USE_NEMOTRON': '1', 'HYDRA_USE_FULL_BLEND': '1', 'HYDRA_SAMPLED_SOFTMAX': '1024', 'HYDRA_SOFTCAP_CLAMP': '1', 'HYDRA_SEQ_LEN': '1024', 'HYDRA_HEADDIM': '32', 'HYDRA_EXPAND': '3', 'HYDRA_BATCH_SIZE': '8', 'HYDRA_TOTAL_BATCH': '16384', 'HYDRA_D_MODEL': '160', 'HYDRA_N_LAYER': '20', 'HYDRA_D_STATE': '64', 'HYDRA_TIME_BUDGET': '300', 'HYDRA_ENGRAM_N_COLUMNS': '16384', 'HYDRA_ENGRAM_TOPK': '64', 'HYDRA_GDN_LAYERS': '', 'HYDRA_MTP_K': '1', 'HYDRA_USE_MDLM': '0', 'HYDRA_MUON_COMPILE': '0', 'HYDRA_MUON_NS_STEPS': '2', # promoted from TPS-11 receipt 'HYDRA_MATRIX_LR': '0.04', 'HYDRA_EMBED_LR': '0.6', 'HYDRA_UNEMBED_LR': '0.004', 'HYDRA_DT_BIAS_LR': '0.6', 'HYDRA_LOCAL_SHARDS_ONLY': '1', 'HYDRA_BACKGROUND_PREFETCH': '0', 'HYDRA_STREAM_SHUFFLE_BUFFER': '256', 'HYDRA_STREAM_PREFETCH': '16', 'HYDRA_TOKEN_PREFETCH': '4', 'HYDRA_TOKEN_CACHE_GB': '1', 'HYDRA_CKPT_INTERVAL': '2000', 'HYDRA_MID_VAL_INTERVAL': '0', 'HYDRA_HESTIA_INTERVAL': '999999', 'HYDRA_HTM_SUBSAMPLE': '128', 'HYDRA_EVAL_BATCH': '1', 'HYDRA_EVAL_TOKENS': '1024', 'HYDRA_CE_CHUNK': '32', 'HYDRA_SKIP_FACTUAL_EVAL': '1', 'HYDRA_RESUME_CKPT': 'none', 'UV_PYTHON': '/usr/bin/python3', } # Ordered from lowest-risk/promising to wider/radical. Infinite outer loop will # revisit with perturbations after first pass. CANDIDATES: list[tuple[str, dict[str, str], str]] = [ # Plateau-escape candidates: stronger than tiny LR nudges. These attack # the 5-minute validation plateau by changing effective optimization, # temporal capacity, and memory pressure while keeping full architecture. # Real z-loss axis was tested after wiring fix: z=0.001 regressed # (2.0446 vs best 2.0237). Return to default z=1e-4 and mutate the # discovered l16/d192 basin more aggressively. ('basin_l16d192_lr085_emb11', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.085','HYDRA_EMBED_LR':'1.1'}, 'basin: l16d192 hotter LR default z'), ('basin_l16d192_lr10_emb13', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.10','HYDRA_EMBED_LR':'1.3'}, 'basin: l16d192 max hot LR default z'), ('basin_l16d192_lr065_emb09', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.065','HYDRA_EMBED_LR':'0.9'}, 'basin: l16d192 moderate LR default z'), ('basin_l16d192_ns1p5_nope_ns2_fasttb', {'HYDRA_TOTAL_BATCH':'24576','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 TB24576 more updates default z'), ('basin_l16d192_dstate48', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_D_STATE':'48','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 smaller d_state faster updates'), ('basin_l16d192_dstate80', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_D_STATE':'80','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 d_state80 capacity'), ('basin_l18d160_hot_defaultz', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'18','HYDRA_D_MODEL':'160','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: valid deeper l18d160 default z'), # High-leverage evolutionary front around the discovered winner l16/d192. # This is no longer tiny-knob search: change shape + optimizer together. ('evo_l16d192_lr075_10', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'evo: l16d192 with hotter LR for 300s descent'), ('evo_l16d192_lr05_07', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.05','HYDRA_EMBED_LR':'0.7'}, 'evo: l16d192 slightly cooler stability'), ('evo_l16d208', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'208','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16 wider d208'), ('evo_l14d224', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'14','HYDRA_D_MODEL':'224','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l14 d224 speed/capacity trade'), ('evo_l12d256', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'12','HYDRA_D_MODEL':'256','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l12 d256 wide-frontier probe'), ('evo_l10d288', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'10','HYDRA_D_MODEL':'288','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l10 d288 radical width probe'), ('evo_l16d192_k768', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_SAMPLED_SOFTMAX':'768','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 lower sampled softmax for more updates'), ('evo_l16d192_k512', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_SAMPLED_SOFTMAX':'512','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 K512 throughput/calibration probe'), ('evo_l16d192_tb16384', {'HYDRA_TOTAL_BATCH':'16384','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 smaller TB more optimizer steps'), ('escape_tb32768_z001_ns2_lr_hi', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'plateau escape: faster 300s descent with champion TB/zloss'), ('escape_tb32768_z001_ns2_lr_lo', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.025','HYDRA_EMBED_LR':'0.45'}, 'plateau escape: lower LR calibration'), ('escape_tb32768_ns2_dstate96', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_D_STATE':'96'}, 'plateau escape: extra SSM state capacity'), ('escape_tb32768_ns2_l18_d176', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'18','HYDRA_D_MODEL':'176'}, 'plateau escape: trade depth for width at similar budget'), ('escape_tb32768_ns2_l16_d192', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192'}, 'plateau escape: stronger width trade'), ('escape_tb32768_ns2_gdn3', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_GDN_LAYERS':'3,7,11'}, 'plateau escape: reintroduce known GDN quality axis'), ('escape_tb32768_ns2_gdn5', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_GDN_LAYERS':'0,4,8,12,16'}, 'plateau escape: distributed 5-GDN quality axis'), ('escape_tb32768_ns2_enk128', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_ENGRAM_TOPK':'128'}, 'plateau escape: wider engram read'), ('escape_tb32768_ns2_dr64', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_SDR_DELTA_RANK':'64'}, 'plateau escape: wider SDR STE pipe despite prior weak amp'), ('escape_tb32768_ns3_lr_hi', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'plateau escape: stable NS3 plus faster LR'), ('ns2_lr_m003', {'HYDRA_MATRIX_LR':'0.03'}, 'slightly lower matrix LR stabilizer'), ('ns2_lr_m005', {'HYDRA_MATRIX_LR':'0.05'}, 'slightly higher matrix LR for faster 300s descent'), ('ns2_embed04', {'HYDRA_EMBED_LR':'0.4'}, 'lower embed LR calibration'), ('ns2_embed08', {'HYDRA_EMBED_LR':'0.8'}, 'higher embed LR fast lexical fit'), ('ns2_dt03', {'HYDRA_DT_BIAS_LR':'0.3'}, 'lower dt-bias LR stability'), ('ns2_dt10', {'HYDRA_DT_BIAS_LR':'1.0'}, 'higher dt-bias adaptation'), ('ns2_dstate96', {'HYDRA_D_STATE':'96'}, 'more SSM state capacity'), ('ns2_dstate128', {'HYDRA_D_STATE':'128'}, 'max SSM state capacity probe'), ('ns2_enk128', {'HYDRA_ENGRAM_TOPK':'128'}, 'wider engram retrieval'), ('ns2_enk32', {'HYDRA_ENGRAM_TOPK':'32'}, 'narrower engram retrieval / less noise'), ('ns2_htm64', {'HYDRA_HTM_SUBSAMPLE':'64'}, 'more frequent HTM update'), ('ns2_htm256', {'HYDRA_HTM_SUBSAMPLE':'256'}, 'less HTM overhead/noise'), ('ns2_gdn_3_7_11', {'HYDRA_GDN_LAYERS':'3,7,11'}, 'retest 3-GDN trend on NS2'), ('ns2_gdn_0_4_8_12_16', {'HYDRA_GDN_LAYERS':'0,4,8,12,16'}, '5-GDN distributed depth'), ('ns2_gdn_0_1_2', {'HYDRA_GDN_LAYERS':'0,1,2'}, 'early GDN locality'), ('ns2_l18', {'HYDRA_N_LAYER':'18'}, 'shallower depth for more updates in budget'), ('ns2_l22', {'HYDRA_N_LAYER':'22'}, 'deeper temporal hierarchy if fits'), ('ns2_d176', {'HYDRA_D_MODEL':'176'}, 'slightly wider model'), ('ns2_d192', {'HYDRA_D_MODEL':'192'}, 'wider model capacity probe'), ('ns3_gdn_3_7_11', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_GDN_LAYERS':'3,7,11'}, 'known GDN axis with stable Muon NS3'), ('ns3_tb32768_z001', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001'}, 'champion-ish optimizer defaults'), ] STEP_RE = re.compile(r'^step=\d+ .*?bpb=([0-9.]+).*?tps=([0-9.]+)', re.M) VAL_RE = re.compile(r'val_bpb:\s*([0-9.]+)') METRICS_RE = re.compile(r'\[METRICS_JSON\]\s*(\{.*\})') def current_commit() -> str: return subprocess.check_output(['git','rev-parse','--short','HEAD'], cwd=ROOT, text=True).strip() def completed_names() -> set[str]: done: set[str] = set() if not LEDGER.exists(): return done for line in LEDGER.read_text(errors='ignore').splitlines()[1:]: parts = line.split('\t') if len(parts) >= 3: done.add(parts[2]) return done def best_seen() -> float: best = 999.0 # Parse the TSV ledger first. Its rows are not `val_bpb:` log lines. if LEDGER.exists(): for line in LEDGER.read_text(errors='ignore').splitlines()[1:]: parts = line.split('\t') if len(parts) >= 4: try: v = float(parts[3]) except ValueError: continue if v > 0: best = min(best, v) # Also seed from known one-off receipts. for path in [ROOT/'run_tps11_ns2.log', ROOT/'run_tps7_bs10.log', ROOT/'run_tps1_htm256.log']: if not path.exists(): continue txt = path.read_text(errors='ignore') for m in VAL_RE.finditer(txt): best = min(best, float(m.group(1))) return best def parse_log(path: Path): txt = path.read_text(errors='ignore') if path.exists() else '' vals = [float(m.group(1)) for m in VAL_RE.finditer(txt)] pairs = [(float(a), float(b)) for a,b in STEP_RE.findall(txt)] tps = [b for _, b in pairs if b > 0] peak_tps = max(tps) if tps else 0.0 med_tps = sorted(tps)[len(tps)//2] if tps else 0.0 mem_gb = 0.0 metrics = None mm = list(METRICS_RE.finditer(txt)) if mm: try: metrics = json.loads(mm[-1].group(1)) mem_gb = float(metrics.get('peak_vram_mb', 0.0)) / 1024.0 except Exception: pass if vals: return vals[-1], peak_tps, med_tps, mem_gb, 'ok', metrics if 'out of memory' in txt.lower() or 'OutOfMemory' in txt or 'CUDA driver error: out of memory' in txt: return 0.0, peak_tps, med_tps, mem_gb, 'crash_oom', metrics if 'Traceback' in txt or 'RuntimeError' in txt or 'AssertionError' in txt: return 0.0, peak_tps, med_tps, mem_gb, 'crash', metrics return 0.0, peak_tps, med_tps, mem_gb, 'no_val', metrics def append(row: list[str]) -> None: with LEDGER.open('a') as f: f.write('\t'.join(row) + '\n') def perturb_candidates(round_idx: int): # Deterministic widening after first pass: combine the best-known NS2 with # small LR/zloss/GDN/engram perturbations. Keeps generating work forever. lrs = ['0.025','0.03','0.035','0.04','0.045','0.05'] embeds = ['0.45','0.55','0.6','0.7'] zloss = ['0.0001','0.0005','0.001','0.002'] gdns = ['', '3,7,11', '0,4,8,12,16', '0,1,2'] for i, (mlr, elr, zl, gdn) in enumerate(itertools.product(lrs, embeds, zloss, gdns)): name = f'auto_r{round_idx:02d}_{i:03d}' yield name, { 'HYDRA_MUON_NS_STEPS': '2', 'HYDRA_MATRIX_LR': mlr, 'HYDRA_EMBED_LR': elr, 'HYDRA_Z_LOSS_WEIGHT': zl, 'HYDRA_GDN_LAYERS': gdn, }, f'auto grid ns2 mlr={mlr} embed={elr} z={zl} gdn={gdn or "none"}' def run_candidate(name: str, delta: dict[str, str], desc: str, best: float): ts = time.strftime('%Y%m%d_%H%M%S') log = LOGDIR / f'{ts}_{name}.log' env = os.environ.copy() env.update(BASE) env.update(delta) cmd = ['taskset','-c','0-15', './.venv/bin/python', '-u', 'train.py'] print(f'[{time.strftime("%F %T")}] RUN {name} best={best:.6f} desc={desc}', flush=True) with log.open('w') as f: f.write(f'=== {name} ===\n') f.write(f'desc={desc}\n') f.write('env_delta=' + json.dumps(delta, sort_keys=True) + '\n') f.flush() try: rc = subprocess.run(cmd, cwd=ROOT, env=env, stdout=f, stderr=subprocess.STDOUT, timeout=RUN_TIMEOUT).returncode except subprocess.TimeoutExpired: rc = 124 f.write('\n[TIMEOUT]\n') val, peak, med, mem, status0, metrics = parse_log(log) if status0 == 'ok': status = 'keep' if val < best else 'discard' else: status = status0 append([ time.strftime('%F_%T'), current_commit(), name, f'{val:.6f}', f'{peak:.0f}', f'{med:.0f}', f'{mem:.2f}', status, desc.replace('\t',' '), str(log) ]) print(f'[{time.strftime("%F %T")}] DONE {name} val={val:.6f} peak={peak:.0f} med={med:.0f} mem={mem:.2f} status={status} log={log}', flush=True) return val if status == 'keep' else best, status def main(): best = best_seen() one_shot = os.environ.get('AUTORESEARCH_ONE_SHOT', '0') == '1' print(f'START autoresearch may03 best_seen={best:.6f} target={TARGET_BPB:.6f} one_shot={one_shot}', flush=True) round_idx = 0 done = completed_names() while True: stream = CANDIDATES if round_idx == 0 else list(perturb_candidates(round_idx)) for name, delta, desc in stream: if name in done: print(f'[{time.strftime("%F %T")}] SKIP {name} already ledgered', flush=True) continue best, status = run_candidate(name, delta, desc, best) done.add(name) if best <= TARGET_BPB: print(f'HARDGATE_REACHED best={best:.6f} target={TARGET_BPB:.6f}', flush=True) return # Let CUDA/WSL settle and reduce fragmentation. subprocess.run(['bash','-lc','python3 - <<"PY"\nimport torch\ntorch.cuda.empty_cache() if torch.cuda.is_available() else None\nPY'], cwd=ROOT, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) if one_shot: print(f'ONE_SHOT_DONE best={best:.6f}', flush=True) return time.sleep(10) round_idx += 1 if one_shot: # No remaining unledgered candidates in the fixed queue; allow the # perturbation generator on the next cron tick instead of looping in # a long-lived process. print(f'ONE_SHOT_NO_FIXED_CANDIDATE best={best:.6f}', flush=True) return if __name__ == '__main__': main()