#!/usr/bin/env python3
"""Continuous Feather autoresearch loop for local RTX 3060.

Protocol:
- One GPU owner, sequential runs only.
- 300s training budget, redirected logs.
- Parse val_bpb / metrics JSON from disk.
- Append TSV ledger.
- Keep searching until hard gate is reached or process is killed.

This loop mutates runtime env first because current Feather exposes most active
architecture/optimizer knobs through HYDRA_* gates. Code edits can be added as
candidate generators after the env frontier is exhausted.
"""

# The docstring must be the first statement to become the module's __doc__;
# a future import is still legal directly after it.
from __future__ import annotations

import itertools
import json
import os
import re
import shlex
import subprocess
import time
from pathlib import Path

# Filesystem layout: repository root, per-run log directory, and the TSV ledger
# that records one row per finished candidate.
ROOT = Path('/home/mikeb/work/feather')
LOGDIR = ROOT / 'logs' / 'autoresearch_may03'
LEDGER = ROOT / 'autoresearch_may03_results.tsv'
# Hard gate: main() stops once the best val_bpb is at or below this value.
# Overridable via the AUTORESEARCH_TARGET_BPB environment variable.
TARGET_BPB = float(os.environ.get('AUTORESEARCH_TARGET_BPB', '1.60'))
# Strict autoresearch cadence: train.py gets HYDRA_TIME_BUDGET=300; wrapper only
# allows startup + final eval overhead. Do not let one candidate occupy the GPU
# for 10-12 minutes unless it is genuinely hung.
# Wall-clock limit in seconds passed to subprocess.run() for each candidate;
# overridable via AUTORESEARCH_RUN_TIMEOUT.
RUN_TIMEOUT = int(os.environ.get('AUTORESEARCH_RUN_TIMEOUT', '430'))

# Import-time side effects: make sure the log directory exists and, on first
# use, create the ledger with its header row (column order must match the rows
# written by run_candidate()).
LOGDIR.mkdir(parents=True, exist_ok=True)
if not LEDGER.exists():
    LEDGER.write_text('ts\tcommit\tcandidate\tval_bpb\tpeak_tps\tmedian_tps\tmemory_gb\tstatus\tdescription\tlog\n')

# Baseline environment applied to every training run. run_candidate() layers it
# over the inherited process environment and then applies the candidate's own
# delta on top, so any key here can be overridden per candidate.
# HF_TOKEN / HUGGINGFACE_HUB_TOKEN are deliberately set to empty strings.
BASE = {
    'LD_LIBRARY_PATH': '/usr/lib/wsl/lib:/usr/local/cuda/lib64',
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    'HF_TOKEN': '',
    'HUGGINGFACE_HUB_TOKEN': '',
    'WANDB_DISABLED': 'true',
    'HYDRA_USE_NEMOTRON': '1',
    'HYDRA_USE_FULL_BLEND': '1',
    'HYDRA_SAMPLED_SOFTMAX': '1024',
    'HYDRA_SOFTCAP_CLAMP': '1',
    'HYDRA_SEQ_LEN': '1024',
    'HYDRA_HEADDIM': '32',
    'HYDRA_EXPAND': '3',
    'HYDRA_BATCH_SIZE': '8',
    'HYDRA_TOTAL_BATCH': '16384',
    'HYDRA_D_MODEL': '160',
    'HYDRA_N_LAYER': '20',
    'HYDRA_D_STATE': '64',
    'HYDRA_TIME_BUDGET': '300',
    'HYDRA_ENGRAM_N_COLUMNS': '16384',
    'HYDRA_ENGRAM_TOPK': '64',
    'HYDRA_GDN_LAYERS': '',
    'HYDRA_MTP_K': '1',
    'HYDRA_USE_MDLM': '0',
    'HYDRA_MUON_COMPILE': '0',
    'HYDRA_MUON_NS_STEPS': '2',  # promoted from TPS-11 receipt
    'HYDRA_MATRIX_LR': '0.04',
    'HYDRA_EMBED_LR': '0.6',
    'HYDRA_UNEMBED_LR': '0.004',
    'HYDRA_DT_BIAS_LR': '0.6',
    'HYDRA_LOCAL_SHARDS_ONLY': '1',
    'HYDRA_BACKGROUND_PREFETCH': '0',
    'HYDRA_STREAM_SHUFFLE_BUFFER': '256',
    'HYDRA_STREAM_PREFETCH': '16',
    'HYDRA_TOKEN_PREFETCH': '4',
    'HYDRA_TOKEN_CACHE_GB': '1',
    'HYDRA_CKPT_INTERVAL': '2000',
    'HYDRA_MID_VAL_INTERVAL': '0',
    'HYDRA_HESTIA_INTERVAL': '999999',
    'HYDRA_HTM_SUBSAMPLE': '128',
    'HYDRA_EVAL_BATCH': '1',
    'HYDRA_EVAL_TOKENS': '1024',
    'HYDRA_CE_CHUNK': '32',
    'HYDRA_SKIP_FACTUAL_EVAL': '1',
    'HYDRA_RESUME_CKPT': 'none',
    'UV_PYTHON': '/usr/bin/python3',
}

# Fixed first-pass queue, ordered from lowest-risk/promising to wider/radical.
# After this pass the outer loop in main() switches to generated perturbations.
# Each entry is (name, env delta, description): the delta is layered over BASE
# for that run, the name is the ledger key used to skip already-recorded
# candidates, and the description is copied into the log and the ledger.
CANDIDATES: list[tuple[str, dict[str, str], str]] = [
    # Plateau-escape candidates: stronger than tiny LR nudges. These attack
    # the 5-minute validation plateau by changing effective optimization,
    # temporal capacity, and memory pressure while keeping full architecture.
    # Real z-loss axis was tested after wiring fix: z=0.001 regressed
    # (2.0446 vs best 2.0237). Return to default z=1e-4 and mutate the
    # discovered l16/d192 basin more aggressively.
    ('basin_l16d192_lr085_emb11', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.085','HYDRA_EMBED_LR':'1.1'}, 'basin: l16d192 hotter LR default z'),
    ('basin_l16d192_lr10_emb13', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.10','HYDRA_EMBED_LR':'1.3'}, 'basin: l16d192 max hot LR default z'),
    ('basin_l16d192_lr065_emb09', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.065','HYDRA_EMBED_LR':'0.9'}, 'basin: l16d192 moderate LR default z'),
    ('basin_l16d192_ns1p5_nope_ns2_fasttb', {'HYDRA_TOTAL_BATCH':'24576','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 TB24576 more updates default z'),
    ('basin_l16d192_dstate48', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_D_STATE':'48','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 smaller d_state faster updates'),
    ('basin_l16d192_dstate80', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_D_STATE':'80','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: l16d192 d_state80 capacity'),
    ('basin_l18d160_hot_defaultz', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_N_LAYER':'18','HYDRA_D_MODEL':'160','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'basin: valid deeper l18d160 default z'),
    # High-leverage evolutionary front around the discovered winner l16/d192.
    # This is no longer tiny-knob search: change shape + optimizer together.
    ('evo_l16d192_lr075_10', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.075','HYDRA_EMBED_LR':'1.0'}, 'evo: l16d192 with hotter LR for 300s descent'),
    ('evo_l16d192_lr05_07', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.05','HYDRA_EMBED_LR':'0.7'}, 'evo: l16d192 slightly cooler stability'),
    ('evo_l16d208', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'208','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16 wider d208'),
    ('evo_l14d224', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'14','HYDRA_D_MODEL':'224','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l14 d224 speed/capacity trade'),
    ('evo_l12d256', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'12','HYDRA_D_MODEL':'256','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l12 d256 wide-frontier probe'),
    ('evo_l10d288', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'10','HYDRA_D_MODEL':'288','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l10 d288 radical width probe'),
    ('evo_l16d192_k768', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_SAMPLED_SOFTMAX':'768','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 lower sampled softmax for more updates'),
    ('evo_l16d192_k512', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_SAMPLED_SOFTMAX':'512','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 K512 throughput/calibration probe'),
    ('evo_l16d192_tb16384', {'HYDRA_TOTAL_BATCH':'16384','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'evo: l16d192 smaller TB more optimizer steps'),
    ('escape_tb32768_z001_ns2_lr_hi', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'plateau escape: faster 300s descent with champion TB/zloss'),
    ('escape_tb32768_z001_ns2_lr_lo', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.025','HYDRA_EMBED_LR':'0.45'}, 'plateau escape: lower LR calibration'),
    ('escape_tb32768_ns2_dstate96', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_D_STATE':'96'}, 'plateau escape: extra SSM state capacity'),
    ('escape_tb32768_ns2_l18_d176', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'18','HYDRA_D_MODEL':'176'}, 'plateau escape: trade depth for width at similar budget'),
    ('escape_tb32768_ns2_l16_d192', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_N_LAYER':'16','HYDRA_D_MODEL':'192'}, 'plateau escape: stronger width trade'),
    ('escape_tb32768_ns2_gdn3', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_GDN_LAYERS':'3,7,11'}, 'plateau escape: reintroduce known GDN quality axis'),
    ('escape_tb32768_ns2_gdn5', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_GDN_LAYERS':'0,4,8,12,16'}, 'plateau escape: distributed 5-GDN quality axis'),
    ('escape_tb32768_ns2_enk128', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_ENGRAM_TOPK':'128'}, 'plateau escape: wider engram read'),
    ('escape_tb32768_ns2_dr64', {'HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_SDR_DELTA_RANK':'64'}, 'plateau escape: wider SDR STE pipe despite prior weak amp'),
    ('escape_tb32768_ns3_lr_hi', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001','HYDRA_MATRIX_LR':'0.06','HYDRA_EMBED_LR':'0.8'}, 'plateau escape: stable NS3 plus faster LR'),
    ('ns2_lr_m003', {'HYDRA_MATRIX_LR':'0.03'}, 'slightly lower matrix LR stabilizer'),
    ('ns2_lr_m005', {'HYDRA_MATRIX_LR':'0.05'}, 'slightly higher matrix LR for faster 300s descent'),
    ('ns2_embed04', {'HYDRA_EMBED_LR':'0.4'}, 'lower embed LR calibration'),
    ('ns2_embed08', {'HYDRA_EMBED_LR':'0.8'}, 'higher embed LR fast lexical fit'),
    ('ns2_dt03', {'HYDRA_DT_BIAS_LR':'0.3'}, 'lower dt-bias LR stability'),
    ('ns2_dt10', {'HYDRA_DT_BIAS_LR':'1.0'}, 'higher dt-bias adaptation'),
    ('ns2_dstate96', {'HYDRA_D_STATE':'96'}, 'more SSM state capacity'),
    ('ns2_dstate128', {'HYDRA_D_STATE':'128'}, 'max SSM state capacity probe'),
    ('ns2_enk128', {'HYDRA_ENGRAM_TOPK':'128'}, 'wider engram retrieval'),
    ('ns2_enk32', {'HYDRA_ENGRAM_TOPK':'32'}, 'narrower engram retrieval / less noise'),
    ('ns2_htm64', {'HYDRA_HTM_SUBSAMPLE':'64'}, 'more frequent HTM update'),
    ('ns2_htm256', {'HYDRA_HTM_SUBSAMPLE':'256'}, 'less HTM overhead/noise'),
    ('ns2_gdn_3_7_11', {'HYDRA_GDN_LAYERS':'3,7,11'}, 'retest 3-GDN trend on NS2'),
    ('ns2_gdn_0_4_8_12_16', {'HYDRA_GDN_LAYERS':'0,4,8,12,16'}, '5-GDN distributed depth'),
    ('ns2_gdn_0_1_2', {'HYDRA_GDN_LAYERS':'0,1,2'}, 'early GDN locality'),
    ('ns2_l18', {'HYDRA_N_LAYER':'18'}, 'shallower depth for more updates in budget'),
    ('ns2_l22', {'HYDRA_N_LAYER':'22'}, 'deeper temporal hierarchy if fits'),
    ('ns2_d176', {'HYDRA_D_MODEL':'176'}, 'slightly wider model'),
    ('ns2_d192', {'HYDRA_D_MODEL':'192'}, 'wider model capacity probe'),
    ('ns3_gdn_3_7_11', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_GDN_LAYERS':'3,7,11'}, 'known GDN axis with stable Muon NS3'),
    ('ns3_tb32768_z001', {'HYDRA_MUON_NS_STEPS':'3','HYDRA_TOTAL_BATCH':'32768','HYDRA_Z_LOSS_WEIGHT':'0.001'}, 'champion-ish optimizer defaults'),
]

# Patterns used to scrape training logs.
# STEP_RE: a line starting with `step=<digits> `; captures the first `bpb=`
# number and the first `tps=` number after it (re.M makes `^` match per line).
STEP_RE = re.compile(r'^step=\d+ .*?bpb=([0-9.]+).*?tps=([0-9.]+)', re.M)
# VAL_RE: captures the number following `val_bpb:`.
VAL_RE = re.compile(r'val_bpb:\s*([0-9.]+)')
# METRICS_RE: captures the `{...}` payload after a `[METRICS_JSON]` marker; `.`
# does not cross newlines here, so the payload must sit on a single line.
METRICS_RE = re.compile(r'\[METRICS_JSON\]\s*(\{.*\})')


def current_commit() -> str:
    """Return the abbreviated hash of the commit currently checked out in ROOT.

    Raises:
        subprocess.CalledProcessError: if `git rev-parse` exits non-zero.
    """
    git_cmd = ['git', 'rev-parse', '--short', 'HEAD']
    short_hash = subprocess.check_output(git_cmd, cwd=ROOT, text=True)
    return short_hash.strip()


def completed_names() -> set[str]:
    """Return the candidate names already recorded in the ledger.

    The first ledger line is treated as the header and skipped; rows with
    fewer than three tab-separated fields are ignored. A missing ledger
    yields an empty set.
    """
    if not LEDGER.exists():
        return set()
    rows = LEDGER.read_text(errors='ignore').splitlines()[1:]
    split_rows = (row.split('\t') for row in rows)
    # The third column holds the candidate name.
    return {fields[2] for fields in split_rows if len(fields) >= 3}


def best_seen() -> float:
    """Return the lowest positive val_bpb known so far.

    Sources, in order:
    - the fourth column of every ledger row (header skipped);
    - every `val_bpb:` match in a fixed list of one-off receipt logs.

    Non-numeric tokens and non-positive values are ignored in both sources;
    failed runs are ledgered as 0.0 and must never count as a best score.
    Returns the sentinel 999.0 when nothing usable is found.
    """
    best = 999.0
    # Parse the TSV ledger first. Its rows are not `val_bpb:` log lines.
    if LEDGER.exists():
        for line in LEDGER.read_text(errors='ignore').splitlines()[1:]:
            parts = line.split('\t')
            if len(parts) < 4:
                continue
            try:
                v = float(parts[3])
            except ValueError:
                continue
            if v > 0:
                best = min(best, v)
    # Also seed from known one-off receipts.
    for path in [ROOT/'run_tps11_ns2.log', ROOT/'run_tps7_bs10.log', ROOT/'run_tps1_htm256.log']:
        if not path.exists():
            continue
        txt = path.read_text(errors='ignore')
        for m in VAL_RE.finditer(txt):
            # VAL_RE captures `[0-9.]+`, which may include a sentence-ending
            # period or be junk such as `.`; apply the same filtering as for
            # ledger rows so a stray token cannot crash startup or fake a
            # 0.0 "best" that would trip the hard gate.
            try:
                v = float(m.group(1).rstrip('.'))
            except ValueError:
                continue
            if v > 0:
                best = min(best, v)
    return best


def _parse_floats(tokens):
    """Convert regex-captured numeric tokens to floats, dropping malformed ones.

    The log regexes capture `[0-9.]+`, which can include a sentence-ending
    period (`2.0237.`) or be junk such as `.`. A trailing period is stripped
    and anything `float()` still rejects is skipped instead of raising.
    """
    parsed = []
    for token in tokens:
        try:
            parsed.append(float(token.rstrip('.')))
        except ValueError:
            continue
    return parsed


def parse_log(path: Path):
    """Scrape one training log and summarise the run.

    Args:
        path: log file to read; a missing file is treated as empty text.

    Returns:
        A 6-tuple `(val_bpb, peak_tps, median_tps, mem_gb, status, metrics)`:
        - val_bpb: last parsed `val_bpb:` value, or 0.0 when none was found;
        - peak_tps / median_tps: max and (upper) median of the positive `tps=`
          values on `step=` lines, 0.0 when there are none;
        - mem_gb: `peak_vram_mb` from the last `[METRICS_JSON]` payload
          divided by 1024, or 0.0 when unavailable;
        - status: 'ok' if a val_bpb was found, else 'crash_oom', 'crash' or
          'no_val' depending on markers found in the text;
        - metrics: the decoded last metrics payload, or None.
    """
    txt = path.read_text(errors='ignore') if path.exists() else ''
    vals = _parse_floats(m.group(1) for m in VAL_RE.finditer(txt))
    # Only the tps capture (group 2) of each step line is used.
    tps = [t for t in _parse_floats(m.group(2) for m in STEP_RE.finditer(txt)) if t > 0]
    peak_tps = max(tps, default=0.0)
    med_tps = sorted(tps)[len(tps)//2] if tps else 0.0
    mem_gb = 0.0
    metrics = None
    payloads = METRICS_RE.findall(txt)
    if payloads:
        try:
            metrics = json.loads(payloads[-1])
            mem_gb = float(metrics.get('peak_vram_mb', 0.0)) / 1024.0
        except (ValueError, TypeError, AttributeError, OverflowError):
            # Best effort: an undecodable payload or an unexpected shape/value
            # leaves mem_gb at 0.0 (and metrics at whatever was decoded).
            pass
    if vals:
        return vals[-1], peak_tps, med_tps, mem_gb, 'ok', metrics
    # The lowercase test also covers 'CUDA driver error: out of memory'.
    if 'out of memory' in txt.lower() or 'OutOfMemory' in txt:
        return 0.0, peak_tps, med_tps, mem_gb, 'crash_oom', metrics
    if 'Traceback' in txt or 'RuntimeError' in txt or 'AssertionError' in txt:
        return 0.0, peak_tps, med_tps, mem_gb, 'crash', metrics
    return 0.0, peak_tps, med_tps, mem_gb, 'no_val', metrics


def append(row: list[str]) -> None:
    """Append `row` to the ledger as one tab-separated, newline-terminated line."""
    with LEDGER.open('a') as ledger:
        fields = '\t'.join(row)
        ledger.write(f'{fields}\n')


def perturb_candidates(round_idx: int):
    """Yield the generated grid of candidates for one perturbation round.

    The grid is the full cross product of matrix LR, embed LR, z-loss weight
    and GDN layer layout, with the GDN layout varying fastest. Each item is a
    `(name, env_delta, description)` tuple shaped like a CANDIDATES entry.
    `round_idx` only enters the candidate name (`auto_rRR_NNN`), so every
    round walks the same grid under fresh names.
    """
    matrix_lr_grid = ['0.025','0.03','0.035','0.04','0.045','0.05']
    embed_lr_grid = ['0.45','0.55','0.6','0.7']
    z_loss_grid = ['0.0001','0.0005','0.001','0.002']
    gdn_grid = ['', '3,7,11', '0,4,8,12,16', '0,1,2']
    grid = itertools.product(matrix_lr_grid, embed_lr_grid, z_loss_grid, gdn_grid)
    for idx, combo in enumerate(grid):
        mlr, elr, zl, gdn = combo
        delta = {
            'HYDRA_MUON_NS_STEPS': '2',
            'HYDRA_MATRIX_LR': mlr,
            'HYDRA_EMBED_LR': elr,
            'HYDRA_Z_LOSS_WEIGHT': zl,
            'HYDRA_GDN_LAYERS': gdn,
        }
        # An empty layout means "no GDN layers"; spell that out in the text.
        gdn_label = gdn if gdn else 'none'
        desc = f'auto grid ns2 mlr={mlr} embed={elr} z={zl} gdn={gdn_label}'
        yield f'auto_r{round_idx:02d}_{idx:03d}', delta, desc


def run_candidate(name: str, delta: dict[str, str], desc: str, best: float):
    """Train one candidate, ledger the outcome, and return the updated best.

    Args:
        name: candidate identifier; used in the log file name and the ledger.
        delta: environment overrides layered on top of BASE for this run.
        desc: human-readable description copied to the log and the ledger.
        best: best val_bpb so far; the run is a 'keep' only if it beats this.

    Returns:
        `(best, status)`: `best` becomes this run's val_bpb when the status is
        'keep', otherwise it is returned unchanged. `status` is 'keep',
        'discard', or the non-ok status reported by parse_log().
    """
    stamp = time.strftime('%Y%m%d_%H%M%S')
    log = LOGDIR / f'{stamp}_{name}.log'
    # Precedence, lowest to highest: inherited environment, BASE, delta.
    env = {**os.environ, **BASE, **delta}
    cmd = ['taskset','-c','0-15', './.venv/bin/python', '-u', 'train.py']
    print(f'[{time.strftime("%F %T")}] RUN {name} best={best:.6f} desc={desc}', flush=True)
    with log.open('w') as sink:
        sink.writelines([
            f'=== {name} ===\n',
            f'desc={desc}\n',
            'env_delta=' + json.dumps(delta, sort_keys=True) + '\n',
        ])
        # Flush the header before the child starts writing to the same file.
        sink.flush()
        try:
            subprocess.run(cmd, cwd=ROOT, env=env, stdout=sink, stderr=subprocess.STDOUT, timeout=RUN_TIMEOUT)
        except subprocess.TimeoutExpired:
            sink.write('\n[TIMEOUT]\n')
    val, peak, med, mem, parsed_status, _metrics = parse_log(log)
    if parsed_status != 'ok':
        status = parsed_status
    elif val < best:
        status = 'keep'
    else:
        status = 'discard'
    ledger_row = [
        time.strftime('%F_%T'),
        current_commit(),
        name,
        f'{val:.6f}',
        f'{peak:.0f}',
        f'{med:.0f}',
        f'{mem:.2f}',
        status,
        desc.replace('\t',' '),  # keep the TSV columns aligned
        str(log),
    ]
    append(ledger_row)
    print(f'[{time.strftime("%F %T")}] DONE {name} val={val:.6f} peak={peak:.0f} med={med:.0f} mem={mem:.2f} status={status} log={log}', flush=True)
    if status == 'keep':
        return val, status
    return best, status


def main():
    """Run candidates sequentially until the hard gate is reached.

    Round 0 walks the fixed CANDIDATES queue; every later round walks the
    grid from perturb_candidates(). Names already present in the ledger are
    skipped. The loop returns when the best val_bpb is at or below
    TARGET_BPB, which is also checked once before any run is launched.

    With AUTORESEARCH_ONE_SHOT=1 exactly one unledgered candidate is run per
    invocation. When the fixed queue is already fully ledgered the search
    continues into the perturbation rounds within the same invocation,
    because every invocation restarts at round 0 and would otherwise never
    reach the generated candidates.
    """
    best = best_seen()
    one_shot = os.environ.get('AUTORESEARCH_ONE_SHOT', '0') == '1'
    print(f'START autoresearch may03 best_seen={best:.6f} target={TARGET_BPB:.6f} one_shot={one_shot}', flush=True)
    if best <= TARGET_BPB:
        # Target already met by earlier results: do not spend a GPU run.
        print(f'HARDGATE_REACHED best={best:.6f} target={TARGET_BPB:.6f}', flush=True)
        return
    round_idx = 0
    done = completed_names()
    while True:
        # The generator is consumed lazily; there is no need to build the grid.
        stream = CANDIDATES if round_idx == 0 else perturb_candidates(round_idx)
        for name, delta, desc in stream:
            if name in done:
                print(f'[{time.strftime("%F %T")}] SKIP {name} already ledgered', flush=True)
                continue
            best, _status = run_candidate(name, delta, desc, best)
            done.add(name)
            if best <= TARGET_BPB:
                print(f'HARDGATE_REACHED best={best:.6f} target={TARGET_BPB:.6f}', flush=True)
                return
            # Let CUDA/WSL settle and reduce fragmentation.
            # NOTE(review): this runs in a fresh interpreter after the trainer
            # has exited, so it presumably cannot release memory the trainer
            # held - confirm whether it is still needed. Failures are ignored.
            subprocess.run(['bash','-lc','python3 - <<"PY"\nimport torch\ntorch.cuda.empty_cache() if torch.cuda.is_available() else None\nPY'], cwd=ROOT, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            if one_shot:
                print(f'ONE_SHOT_DONE best={best:.6f}', flush=True)
                return
            time.sleep(10)
        if one_shot and round_idx == 0:
            # No remaining unledgered candidates in the fixed queue. Fall
            # through to the perturbation generator now: the one-shot return
            # above still limits this invocation to a single training run.
            print(f'ONE_SHOT_NO_FIXED_CANDIDATE best={best:.6f}', flush=True)
        round_idx += 1

# Start the search loop only when executed as a script, not when imported.
if __name__ == '__main__':
    main()