#!/usr/bin/env python3
"""HYDRA Autoresearch Mutation Loop.

Runs baseline training -> evaluates -> picks ONE mutation at a time ->
trains -> evaluates -> keeps if quality improves AND tps >= floor.
Repeats until all mutations exhausted or Ctrl+C.

State persisted in .omc/autoresearch_config.json for resume support.

Usage:
    python scripts/autoresearch.py              # run full loop
    python scripts/autoresearch.py --dry-run    # show plan, don't train
    python scripts/autoresearch.py --baseline   # only run baseline eval
"""

from __future__ import annotations

import argparse
import copy
import json
import math
import os
import re
import signal
import subprocess
import sys
import time
from pathlib import Path

_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

# ---------------------------------------------------------------------------
# Mutation catalog (ordered by expected impact)
# ---------------------------------------------------------------------------

# Each mutation is a single "KEY=VALUE" environment override handed to the
# training (and eval) subprocess; "name" is the identifier stored in the state.
MUTATIONS = [
    # Learning dynamics — env vars verified in hydra/config.py
    {"name": "lr_matrix_0.012", "env": "HYDRA_MATRIX_LR=0.012"},  # default 0.12
    {"name": "lr_matrix_0.06", "env": "HYDRA_MATRIX_LR=0.06"},  # half default
    {"name": "lr_matrix_0.24", "env": "HYDRA_MATRIX_LR=0.24"},  # double default
    {"name": "lr_floor_50pct", "env": "HYDRA_LR_MIN_MULT=0.5"},  # default 0.0
    {"name": "lr_floor_20pct", "env": "HYDRA_LR_MIN_MULT=0.2"},  # default 0.0
    {"name": "embed_lr_0.5", "env": "HYDRA_EMBED_LR=0.5"},  # default 1.0
    {"name": "embed_lr_2.0", "env": "HYDRA_EMBED_LR=2.0"},  # default 1.0
    {"name": "unembed_lr_0.01", "env": "HYDRA_UNEMBED_LR=0.01"},  # default 0.005
    # Architecture — env vars verified in hydra/config.py
    {"name": "d_model_384", "env": "HYDRA_D_MODEL=384"},  # default 256
    {"name": "d_model_192", "env": "HYDRA_D_MODEL=192"},  # smaller
    {"name": "d_state_128", "env": "HYDRA_D_STATE=128"},  # default 64
    {"name": "d_state_32", "env": "HYDRA_D_STATE=32"},  # smaller
    {"name": "n_layer_6", "env": "HYDRA_N_LAYER=6"},  # default 4
    {"name": "n_layer_3", "env": "HYDRA_N_LAYER=3"},  # fewer
    {"name": "headdim_16", "env": "HYDRA_HEADDIM=16"},  # default 32 -> more heads
    {"name": "headdim_64", "env": "HYDRA_HEADDIM=64"},  # default 32 -> fewer heads
    {"name": "expand_3", "env": "HYDRA_EXPAND=3"},  # default 2
    {"name": "engram_2048", "env": "HYDRA_ENGRAM_N_COLUMNS=2048"},  # default 1024
    {"name": "engram_4096", "env": "HYDRA_ENGRAM_N_COLUMNS=4096"},  # default 1024
    {"name": "engram_512", "env": "HYDRA_ENGRAM_N_COLUMNS=512"},  # smaller
    # Batch size
    {"name": "batch_32k", "env": "HYDRA_TOTAL_BATCH=32768"},  # default 32768 (verify)
    {"name": "batch_16k", "env": "HYDRA_TOTAL_BATCH=16384"},  # smaller batch
    {"name": "batch_65k", "env": "HYDRA_TOTAL_BATCH=65536"},  # larger batch
    # Regularization — env vars verified in hydra/model.py + hydra/config.py
    {"name": "dropout_0.05", "env": "HYDRA_DROPOUT=0.05"},  # default 0.2
    {"name": "dropout_0.1", "env": "HYDRA_DROPOUT=0.1"},  # default 0.2
    {"name": "dropout_0.3", "env": "HYDRA_DROPOUT=0.3"},  # higher
]

# ---------------------------------------------------------------------------
# State management
# ---------------------------------------------------------------------------

STATE_DIR = os.path.join(_PROJECT_ROOT, ".omc")
STATE_FILE = os.path.join(STATE_DIR, "autoresearch_config.json")

DEFAULT_STATE = {
    "baseline_quality": None,
    "baseline_tps": None,
    "current_gen": 0,
    "mutations_tested": [],
    "mutations_kept": [],
    "tps_floor": 62000,
    "time_budget": 600,
    "history": [],
}


def load_state() -> dict:
    """Load state from disk or return a fresh copy of the defaults.

    Keys missing from an older state file are backfilled from DEFAULT_STATE.
    Defaults are deep-copied so that appending to the returned lists never
    mutates DEFAULT_STATE itself.
    """
    if not os.path.exists(STATE_FILE):
        return copy.deepcopy(DEFAULT_STATE)
    with open(STATE_FILE, "r", encoding="utf-8") as f:
        state = json.load(f)
    # Backfill missing keys from defaults
    for key, default in DEFAULT_STATE.items():
        if key not in state:
            state[key] = copy.deepcopy(default)
    return state


def save_state(state: dict) -> None:
    """Persist state to disk.

    The JSON is written to a sibling temp file and then moved into place with
    os.replace, so an interrupted write cannot leave a truncated state file.
    """
    os.makedirs(STATE_DIR, exist_ok=True)
    tmp_path = STATE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)
    os.replace(tmp_path, STATE_FILE)


# ---------------------------------------------------------------------------
# Training subprocess
# ---------------------------------------------------------------------------

EVAL_TIMEOUT_S = 120  # 2 min max for eval

_STEP_RE = re.compile(r"step=(\d+)")
_TPS_RE = re.compile(r"tps=(\d+)")
_BPB_RE = re.compile(r"bpb=([\d.]+)")
_SUMMARY_KEYS = (
    "val_bpb",
    "training_seconds",
    "peak_vram_mb",
    "mfu_percent",
    "total_tokens_M",
    "num_steps",
    "factual_english_score",
    "factual_english_hits",
)
_SUMMARY_RE = re.compile(rf"^({'|'.join(_SUMMARY_KEYS)}):\s+([\d.]+)")
_EVAL_KV_RE = re.compile(r"^([\w]+)=([\d.eE+-]+)$")


def _python_executable() -> str:
    """Return the project venv interpreter, or sys.executable if it is absent."""
    venv_python = os.path.join(_PROJECT_ROOT, ".venv", "bin", "python")
    return venv_python if os.path.exists(venv_python) else sys.executable


def build_env(extra_env: str | None = None) -> dict[str, str]:
    """Build environment for training subprocess.

    Args:
        extra_env: Optional single "KEY=VALUE" override (one mutation).

    Returns:
        A copy of os.environ with the CUDA library directories prepended to
        LD_LIBRARY_PATH (when not already listed) and the override applied.

    Raises:
        ValueError: If extra_env is given but is not of the form "KEY=VALUE".
    """
    env = os.environ.copy()

    # Ensure CUDA paths. Compare whole path entries (not substrings) and drop
    # empty entries: an empty LD_LIBRARY_PATH element is interpreted by the
    # dynamic loader as the current directory.
    ld_paths = ["/usr/lib/wsl/lib", "/usr/local/cuda/lib64"]
    entries = [p for p in env.get("LD_LIBRARY_PATH", "").split(":") if p]
    for p in ld_paths:
        if p not in entries:
            entries.insert(0, p)
    env["LD_LIBRARY_PATH"] = ":".join(entries)

    # Apply mutation env var
    if extra_env:
        key, sep, val = extra_env.partition("=")
        if not sep or not key:
            raise ValueError(f"extra_env must look like KEY=VALUE, got {extra_env!r}")
        env[key] = val
    return env


def run_training(time_budget: int, extra_env: str | None = None) -> dict | None:
    """Run train.py with given time budget and optional env override.

    The child's combined stdout/stderr is streamed line by line; a progress
    line is echoed for every step number divisible by 50, and any line that
    mentions val_bpb or factual_english_score is echoed as well.

    Returns dict with parsed metrics, or None on failure (process could not be
    started or exited with a non-zero code).
    """
    env = build_env(extra_env)
    env["HYDRA_TIME_BUDGET"] = str(time_budget)
    cmd = [_python_executable(), "-u", "train.py"]

    try:
        proc = subprocess.Popen(
            cmd,
            cwd=_PROJECT_ROOT,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",  # undecodable bytes must not abort the run
            bufsize=1,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f" [ERROR] Failed to start training: {e}")
        return None

    output_lines: list[str] = []
    # Leaving the `with` block closes the pipe and waits for the child.
    with proc:
        try:
            for line in proc.stdout:
                line = line.rstrip()
                output_lines.append(line)
                if line.startswith("step="):
                    # Print progress every 50 steps
                    m = _STEP_RE.search(line)
                    if m and int(m.group(1)) % 50 == 0:
                        tps_m = _TPS_RE.search(line)
                        bpb_m = _BPB_RE.search(line)
                        tps = tps_m.group(1) if tps_m else "?"
                        bpb = bpb_m.group(1) if bpb_m else "?"
                        print(f" step={m.group(1)} tps={tps} bpb={bpb}", flush=True)
                elif "val_bpb" in line or "factual_english_score" in line:
                    print(f" {line}", flush=True)
        except KeyboardInterrupt:
            proc.terminate()
            raise

    if proc.returncode != 0:
        print(f" [ERROR] Training exited with code {proc.returncode}")
        # Print last 10 lines for debugging
        for line in output_lines[-10:]:
            print(f" {line}")
        return None

    return _parse_training_output(output_lines)


def _parse_training_output(lines: list[str]) -> dict:
    """Extract metrics from training output lines.

    Recognizes "key: value" summary lines for the keys in _SUMMARY_KEYS and
    takes "tps" from the last line starting with "step=" that carries a
    tps=<int> field. Values that do not parse as floats are skipped.
    """
    metrics: dict[str, float] = {}
    for line in lines:
        # Key: value pairs from summary block
        m = _SUMMARY_RE.match(line.strip())
        if m:
            try:
                metrics[m.group(1)] = float(m.group(2))
            except ValueError:
                # e.g. "1.2.3" matches [\d.]+ but is not a number
                pass
        # TPS from last step line (later lines overwrite earlier ones)
        if line.startswith("step="):
            tps_m = _TPS_RE.search(line)
            if tps_m:
                metrics["tps"] = float(tps_m.group(1))
    return metrics


# ---------------------------------------------------------------------------
# Eval integration
# ---------------------------------------------------------------------------


def run_eval_after_training(extra_env: str | None = None) -> dict | None:
    """Run eval_quality.py after training.

    Parses "key=value" lines with numeric values from the script's stdout.

    Returns metrics dict or None (timeout, launch failure, non-zero exit, or
    no parsable metric lines).
    """
    env = build_env(extra_env)
    cmd = [
        _python_executable(),
        os.path.join(_PROJECT_ROOT, "scripts", "eval_quality.py"),
    ]

    try:
        result = subprocess.run(
            cmd,
            cwd=_PROJECT_ROOT,
            env=env,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=EVAL_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired:
        print(f" [ERROR] Eval timed out ({EVAL_TIMEOUT_S}s)")
        return None
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f" [ERROR] Eval failed: {e}")
        return None

    if result.returncode != 0:
        print(f" [ERROR] Eval exited with code {result.returncode}")
        for line in result.stdout.split("\n")[-10:]:
            print(f" {line}")
        for line in result.stderr.split("\n")[-5:]:
            print(f" {line}")
        return None

    # Parse key=value output
    metrics = {}
    for line in result.stdout.split("\n"):
        m = _EVAL_KV_RE.match(line.strip())
        if m:
            try:
                metrics[m.group(1)] = float(m.group(2))
            except ValueError:
                pass
    return metrics if metrics else None


# ---------------------------------------------------------------------------
# Git operations
# ---------------------------------------------------------------------------


def git_commit(message: str) -> bool:
    """Stage all changes and commit.

    Returns True on success. Any failure (non-zero git exit, timeout, or git
    not being runnable at all) is reported as a warning and returns False so
    that the mutation loop keeps going.
    """
    try:
        subprocess.run(
            ["git", "add", "-A"],
            cwd=_PROJECT_ROOT,
            check=True,
            capture_output=True,
            timeout=30,
        )
        subprocess.run(
            ["git", "commit", "-m", message],
            cwd=_PROJECT_ROOT,
            check=True,
            capture_output=True,
            timeout=30,
        )
        return True
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
        print(f" [WARN] Git commit failed: {e}")
        return False


# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------

_SHUTDOWN = False


def _handle_sigint(signum, frame):
    """SIGINT handler: first Ctrl+C requests a graceful stop, second exits."""
    global _SHUTDOWN
    if _SHUTDOWN:
        print("\n[AUTORESEARCH] Double Ctrl+C — force exit")
        sys.exit(1)
    _SHUTDOWN = True
    print("\n[AUTORESEARCH] Ctrl+C received — finishing current gen then saving state...")


def _history_entry(
    *,
    gen: int,
    mutation: str,
    quality: float,
    reference: float,
    delta: str,
    tps: float,
    kept: bool,
    eval_metrics: dict | None = None,
    bpb: float | None = None,
) -> dict:
    """Build one history record; eval metrics that are absent default to 0."""
    metrics = eval_metrics or {}
    return {
        "gen": gen,
        "mutation": mutation,
        "quality_score": quality,
        "baseline_score": reference,
        "delta": delta,
        "tps": tps,
        "ppl": metrics.get("ppl", 0),
        "bleu4": metrics.get("bleu4", 0),
        "rouge_l": metrics.get("rouge_l", 0),
        "factual": metrics.get("factual", 0),
        "bpb": metrics.get("bpb", 0) if bpb is None else bpb,
        "repetition_rate": metrics.get("repetition_rate", 0),
        "kept": kept,
    }


def _record_generation(state: dict, name: str, gen: int, entry: dict) -> None:
    """Mark a mutation as tested, append its history entry, and save state."""
    state["mutations_tested"].append(name)
    state["current_gen"] = gen
    state["history"].append(entry)
    save_state(state)


def _percent_change(value: float, reference: float) -> float:
    """Relative change of value vs reference in percent (safe for reference 0)."""
    return ((value - reference) / max(abs(reference), 1e-6)) * 100


def main():
    """Entry point: parse CLI args, run/resume baseline, then the mutation loop."""
    signal.signal(signal.SIGINT, _handle_sigint)

    parser = argparse.ArgumentParser(description="HYDRA autoresearch mutation loop")
    parser.add_argument("--dry-run", action="store_true", help="Show plan, don't train")
    parser.add_argument("--baseline", action="store_true", help="Only run baseline")
    parser.add_argument(
        "--time-budget",
        type=int,
        default=None,
        help="Time budget per run (s) (default: persisted value, initially 600)",
    )
    parser.add_argument(
        "--tps-floor",
        type=int,
        default=None,
        help="Minimum acceptable TPS (default: persisted value, initially 62000)",
    )
    args = parser.parse_args()

    state = load_state()
    # Only override persisted settings when given explicitly, so a resumed run
    # keeps the budget/floor the earlier generations were measured with.
    if args.time_budget is not None:
        state["time_budget"] = args.time_budget
    if args.tps_floor is not None:
        state["tps_floor"] = args.tps_floor

    tested = set(state["mutations_tested"])
    remaining = [m for m in MUTATIONS if m["name"] not in tested]

    print("=" * 70)
    print("HYDRA AUTORESEARCH MUTATION LOOP")
    print("=" * 70)
    print(f"Time budget per run: {state['time_budget']}s")
    print(f"TPS floor: {state['tps_floor']}")
    print(f"Current gen: {state['current_gen']}")
    print(f"Mutations tested: {len(tested)}/{len(MUTATIONS)}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Remaining: {[m['name'] for m in remaining]}")
    print()

    if args.dry_run:
        print("[DRY RUN] Would test these mutations in order:")
        for i, m in enumerate(remaining):
            print(f" {i + 1}. {m['name']} ({m['env']})")
        return

    # -----------------------------------------------------------------------
    # Baseline (Gen 0)
    # -----------------------------------------------------------------------
    if state["baseline_quality"] is None:
        print("[GEN 0] Running baseline training + evaluation...")
        train_metrics = run_training(state["time_budget"])
        if train_metrics is None:
            print("[FAIL] Baseline training failed")
            save_state(state)
            return

        print("[GEN 0] Running quality evaluation...")
        eval_metrics = run_eval_after_training()
        if eval_metrics is None:
            print("[FAIL] Baseline eval failed")
            save_state(state)
            return

        baseline_tps = train_metrics.get("tps", 0)
        baseline_quality = eval_metrics.get("quality_score", 0)
        state["baseline_quality"] = baseline_quality
        state["baseline_tps"] = baseline_tps
        state["current_gen"] = 0
        state["history"].append(
            _history_entry(
                gen=0,
                mutation="baseline",
                quality=baseline_quality,
                reference=baseline_quality,
                delta="0.0%",
                tps=baseline_tps,
                kept=True,
                eval_metrics=eval_metrics,
            )
        )
        save_state(state)
        print(f"[GEN 0] BASELINE: quality={baseline_quality:.4f} tps={baseline_tps:.0f}")
        if args.baseline:
            return
    else:
        print(
            f"[RESUME] Baseline quality={state['baseline_quality']:.4f} "
            f"tps={state['baseline_tps'] or 0:.0f}"
        )
        if args.baseline:
            return

    # -----------------------------------------------------------------------
    # Mutation loop
    # -----------------------------------------------------------------------
    current_quality = state["baseline_quality"]
    # Track best quality so far (from last kept mutation, not just baseline)
    kept_entries = [h for h in state["history"] if h.get("kept")]
    if kept_entries:
        current_quality = kept_entries[-1]["quality_score"]

    # NOTE(review): each mutation is trained with only its own env override —
    # overrides from previously kept mutations are not re-applied — yet
    # current_quality advances to the kept mutation's score. Confirm whether
    # kept mutations are meant to stack.
    for mutation in remaining:
        if _SHUTDOWN:
            print("[AUTORESEARCH] Shutdown requested — saving state")
            save_state(state)
            return

        gen = state["current_gen"] + 1
        name = mutation["name"]
        env_str = mutation["env"]

        print(f"\n[GEN {gen}] Testing {name} ({env_str})...")
        print(f" Current best quality: {current_quality:.4f}")

        # Train with mutation
        print(f" Training ({state['time_budget']}s)...", flush=True)
        train_metrics = run_training(state["time_budget"], extra_env=env_str)

        if train_metrics is None:
            if _SHUTDOWN:
                # Ctrl+C from a terminal also reaches the child process, so
                # this failure is an interruption, not a verdict on the
                # mutation: leave it untested so a resume retries it.
                print(f" [ABORT] {name} interrupted — will be retried on resume")
                save_state(state)
                return
            print(f" [SKIP] Training failed for {name}")
            _record_generation(
                state,
                name,
                gen,
                _history_entry(
                    gen=gen,
                    mutation=name,
                    quality=0,
                    reference=current_quality,
                    delta="FAIL",
                    tps=0,
                    kept=False,
                ),
            )
            continue

        tps = train_metrics.get("tps", 0)

        # TPS floor check
        if tps < state["tps_floor"]:
            print(f" [REJECT] TPS={tps:.0f} < floor={state['tps_floor']} — skipping eval")
            _record_generation(
                state,
                name,
                gen,
                _history_entry(
                    gen=gen,
                    mutation=name,
                    quality=0,
                    reference=current_quality,
                    delta=f"TPS_FAIL({tps:.0f})",
                    tps=tps,
                    kept=False,
                    bpb=train_metrics.get("val_bpb", 0),
                ),
            )
            continue

        # Evaluate
        print(f" Evaluating...", flush=True)
        eval_metrics = run_eval_after_training(extra_env=env_str)

        if eval_metrics is None:
            if _SHUTDOWN:
                print(f" [ABORT] {name} interrupted — will be retried on resume")
                save_state(state)
                return
            print(f" [SKIP] Eval failed for {name}")
            _record_generation(
                state,
                name,
                gen,
                _history_entry(
                    gen=gen,
                    mutation=name,
                    quality=0,
                    reference=current_quality,
                    delta="EVAL_FAIL",
                    tps=tps,
                    kept=False,
                ),
            )
            continue

        quality = eval_metrics.get("quality_score", 0)
        delta_str = f"{_percent_change(quality, current_quality):+.1f}%"

        kept = quality > current_quality and tps >= state["tps_floor"]
        status = "KEEP" if kept else "DISCARD"

        entry = _history_entry(
            gen=gen,
            mutation=name,
            quality=quality,
            reference=current_quality,
            delta=delta_str,
            tps=tps,
            kept=kept,
            eval_metrics=eval_metrics,
        )

        print(f"\n[GEN {gen}] {name}: quality={quality:.4f} ({delta_str}) tps={tps:.0f} -> {status}")

        if kept:
            current_quality = quality
            state["mutations_kept"].append(name)

        # Persist before committing so that the commit contains this
        # generation's state and a git failure cannot lose the result.
        _record_generation(state, name, gen, entry)
        if kept:
            git_commit(f"autoresearch: gen {gen} — {name} quality {delta_str}")

    # -----------------------------------------------------------------------
    # Summary
    # -----------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("AUTORESEARCH COMPLETE")
    print("=" * 70)
    print(f"Total generations: {state['current_gen']}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Final quality: {current_quality:.4f}")
    if state["baseline_quality"]:
        total_delta = _percent_change(current_quality, state["baseline_quality"])
        print(f"Total improvement: {total_delta:+.1f}%")
    print()

    # Print history table
    print(f"{'Gen':>4} {'Mutation':>20} {'Quality':>8} {'Delta':>8} {'TPS':>7} {'PPL':>8} {'BPB':>7} {'Kept':>5}")
    print("-" * 75)
    for h in state["history"]:
        print(
            f"{h['gen']:4d} {h['mutation']:>20s} {h['quality_score']:8.4f} "
            f"{h['delta']:>8s} {h.get('tps', 0):7.0f} {h.get('ppl', 0):8.2f} "
            f"{h.get('bpb', 0):7.4f} {' YES' if h['kept'] else ' NO'}"
        )


if __name__ == "__main__":
    main()