| |
| """HYDRA Autoresearch Mutation Loop. |
| |
| Runs baseline training -> evaluates -> picks ONE mutation at a time -> |
| trains -> evaluates -> keeps if quality improves AND tps >= floor. |
| Repeats until all mutations exhausted or Ctrl+C. |
| |
| State persisted in .omc/autoresearch_config.json for resume support. |
| |
| Usage: |
| python scripts/autoresearch.py # run full loop |
| python scripts/autoresearch.py --dry-run # show plan, don't train |
| python scripts/autoresearch.py --baseline # only run baseline eval |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import math |
| import os |
| import re |
| import signal |
| import subprocess |
| import sys |
| import time |
| from pathlib import Path |
|
|
# Repository root: this script is run as scripts/autoresearch.py, so the root
# is the parent of the directory that contains this file.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.dirname(_SCRIPT_DIR)

# Put the repository root at the front of the import path, but only once.
if sys.path.count(_PROJECT_ROOT) == 0:
    sys.path.insert(0, _PROJECT_ROOT)
|
|
| |
| |
| |
|
|
# (name, "ENV_VAR=value") pairs, in the order they are tried.  Each mutation is
# a single environment-variable override passed to the training subprocess.
_MUTATION_SPECS = (
    # Learning rates
    ("lr_matrix_0.012", "HYDRA_MATRIX_LR=0.012"),
    ("lr_matrix_0.06", "HYDRA_MATRIX_LR=0.06"),
    ("lr_matrix_0.24", "HYDRA_MATRIX_LR=0.24"),
    ("lr_floor_50pct", "HYDRA_LR_MIN_MULT=0.5"),
    ("lr_floor_20pct", "HYDRA_LR_MIN_MULT=0.2"),
    ("embed_lr_0.5", "HYDRA_EMBED_LR=0.5"),
    ("embed_lr_2.0", "HYDRA_EMBED_LR=2.0"),
    ("unembed_lr_0.01", "HYDRA_UNEMBED_LR=0.01"),
    # Model shape
    ("d_model_384", "HYDRA_D_MODEL=384"),
    ("d_model_192", "HYDRA_D_MODEL=192"),
    ("d_state_128", "HYDRA_D_STATE=128"),
    ("d_state_32", "HYDRA_D_STATE=32"),
    ("n_layer_6", "HYDRA_N_LAYER=6"),
    ("n_layer_3", "HYDRA_N_LAYER=3"),
    ("headdim_16", "HYDRA_HEADDIM=16"),
    ("headdim_64", "HYDRA_HEADDIM=64"),
    ("expand_3", "HYDRA_EXPAND=3"),
    ("engram_2048", "HYDRA_ENGRAM_N_COLUMNS=2048"),
    ("engram_4096", "HYDRA_ENGRAM_N_COLUMNS=4096"),
    ("engram_512", "HYDRA_ENGRAM_N_COLUMNS=512"),
    # Batch size
    ("batch_32k", "HYDRA_TOTAL_BATCH=32768"),
    ("batch_16k", "HYDRA_TOTAL_BATCH=16384"),
    ("batch_65k", "HYDRA_TOTAL_BATCH=65536"),
    # Dropout
    ("dropout_0.05", "HYDRA_DROPOUT=0.05"),
    ("dropout_0.1", "HYDRA_DROPOUT=0.1"),
    ("dropout_0.3", "HYDRA_DROPOUT=0.3"),
)

# Public form consumed by the loop: a list of {"name": ..., "env": ...} dicts.
MUTATIONS = [{"name": name, "env": env} for name, env in _MUTATION_SPECS]
|
|
| |
| |
| |
|
|
# Resume state is kept in <project root>/.omc/autoresearch_config.json.
STATE_DIR = os.path.join(_PROJECT_ROOT, ".omc")
STATE_FILE = os.path.join(STATE_DIR, "autoresearch_config.json")

# Shape of a fresh state.  load_state() returns this when no state file exists
# and uses it to fill in keys that are missing from a state file on disk.
DEFAULT_STATE = dict(
    baseline_quality=None,
    baseline_tps=None,
    current_gen=0,
    mutations_tested=[],
    mutations_kept=[],
    tps_floor=62000,
    time_budget=600,
    history=[],
)
|
|
|
|
def load_state() -> dict:
    """Load the persisted loop state, or build a fresh default state.

    Returns:
        The dict stored in ``STATE_FILE`` with any key missing from it filled
        in from ``DEFAULT_STATE``, or a copy of ``DEFAULT_STATE`` when the
        file does not exist.

    Raises:
        json.JSONDecodeError: if the state file exists but is not valid JSON.

    Every default value is deep-copied.  ``DEFAULT_STATE`` contains lists
    (``mutations_tested``, ``mutations_kept``, ``history``); a shallow copy
    would hand callers the very same list objects, so appending to the
    returned state would silently modify the module-level template.
    """
    import copy  # function-scope import: only this function needs it

    try:
        with open(STATE_FILE, "r", encoding="utf-8") as f:
            state = json.load(f)
    except FileNotFoundError:
        return copy.deepcopy(DEFAULT_STATE)

    # Back-fill keys that are absent from the file on disk.
    for key, default in DEFAULT_STATE.items():
        if key not in state:
            state[key] = copy.deepcopy(default)
    return state
|
|
|
|
def save_state(state: dict) -> None:
    """Persist ``state`` to ``STATE_FILE`` as indented JSON.

    Args:
        state: JSON-serializable loop state.

    Raises:
        TypeError: if ``state`` contains a value ``json`` cannot serialize.
        OSError: if the directory or file cannot be written.

    The JSON is written to a sibling temporary file and then moved over the
    real file with ``os.replace``.  Opening ``STATE_FILE`` directly in "w"
    mode truncates it first, so an interrupt or a serialization error in the
    middle of the write would destroy the previous state and break resume.
    If the write fails, the temporary file may be left behind; the next
    successful save overwrites it.
    """
    os.makedirs(STATE_DIR, exist_ok=True)

    tmp_path = STATE_FILE + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)

    # Same directory as the target, so the rename stays on one filesystem.
    os.replace(tmp_path, STATE_FILE)
|
|
|
|
| |
| |
| |
|
|
def build_env(extra_env: str | None = None) -> dict[str, str]:
    """Build the environment for a training or evaluation subprocess.

    Args:
        extra_env: Optional single override in ``"KEY=VALUE"`` form.  Only the
            first ``=`` separates key from value, so values may contain ``=``.

    Returns:
        A copy of ``os.environ`` with ``/usr/local/cuda/lib64`` and
        ``/usr/lib/wsl/lib`` present in ``LD_LIBRARY_PATH`` and the override
        applied.  ``os.environ`` itself is not modified.

    Raises:
        ValueError: if ``extra_env`` is not of the form ``KEY=VALUE``.
    """
    env = os.environ.copy()

    # Work on whole path entries.  A substring test would treat
    # "/usr/lib/wsl/lib64" as already providing "/usr/lib/wsl/lib", and string
    # concatenation onto an empty value leaves a trailing ":" - an empty entry,
    # which the dynamic loader interprets as the current directory.
    existing = env.get("LD_LIBRARY_PATH", "")
    entries = existing.split(":") if existing else []
    for lib_dir in ("/usr/lib/wsl/lib", "/usr/local/cuda/lib64"):
        if lib_dir not in entries:
            entries.insert(0, lib_dir)
    env["LD_LIBRARY_PATH"] = ":".join(entries)

    if extra_env:
        key, sep, val = extra_env.partition("=")
        if not sep or not key:
            raise ValueError(
                f"extra_env must look like KEY=VALUE, got {extra_env!r}"
            )
        env[key] = val

    return env
|
|
|
|
def run_training(time_budget: int, extra_env: str | None = None) -> dict | None:
    """Run ``train.py`` once and return the metrics parsed from its output.

    Args:
        time_budget: Seconds of training, passed to the child through the
            ``HYDRA_TIME_BUDGET`` environment variable.
        extra_env: Optional ``"KEY=VALUE"`` override forwarded to ``build_env``.

    Returns:
        The dict produced by ``_parse_training_output``, or ``None`` when the
        process could not be started or exited with a non-zero status.

    The child's combined stdout/stderr is read line by line.  Every 50th
    ``step=`` line and every line mentioning ``val_bpb`` or
    ``factual_english_score`` is echoed as progress.  If reading is aborted by
    any exception (including ``KeyboardInterrupt`` and ``SystemExit``), the
    child is terminated before the exception propagates, and its output pipe
    is always closed.
    """
    env = build_env(extra_env)
    env["HYDRA_TIME_BUDGET"] = str(time_budget)

    cmd = [os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"), "-u", "train.py"]

    try:
        proc = subprocess.Popen(
            cmd,
            cwd=_PROJECT_ROOT,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # line-buffered so progress is seen as it happens
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f" [ERROR] Failed to start training: {e}")
        return None

    # Compiled once instead of once per output line.
    step_re = re.compile(r"step=(\d+)")
    tps_re = re.compile(r"tps=(\d+)")
    bpb_re = re.compile(r"bpb=([\d.]+)")

    output_lines: list[str] = []

    # The context manager closes the pipe and waits for the child on exit.
    with proc:
        try:
            for raw in proc.stdout:
                line = raw.rstrip()
                output_lines.append(line)
                if line.startswith("step="):
                    # Echo a condensed progress line every 50 steps.
                    m = step_re.search(line)
                    if m and int(m.group(1)) % 50 == 0:
                        tps_m = tps_re.search(line)
                        bpb_m = bpb_re.search(line)
                        tps = tps_m.group(1) if tps_m else "?"
                        bpb = bpb_m.group(1) if bpb_m else "?"
                        print(f" step={m.group(1)} tps={tps} bpb={bpb}", flush=True)
                elif "val_bpb" in line or "factual_english_score" in line:
                    print(f" {line}", flush=True)
        except BaseException:
            # Do not leave a training process behind, and do not let the
            # context manager block forever waiting for a healthy child.
            proc.terminate()
            raise

    if proc.returncode != 0:
        print(f" [ERROR] Training exited with code {proc.returncode}")
        # Show the tail of the output to help diagnose the failure.
        for line in output_lines[-10:]:
            print(f" {line}")
        return None

    return _parse_training_output(output_lines)
|
|
|
|
# Summary keys looked for in the training output, in "<key>: <number>" form.
_SUMMARY_KEYS = (
    "val_bpb", "training_seconds", "peak_vram_mb", "mfu_percent",
    "total_tokens_M", "num_steps", "factual_english_score",
    "factual_english_hits",
)

# One pre-compiled pattern for all keys.  The number part accepts an optional
# sign and exponent so "1e-05" is read as 1e-05 rather than truncated to 1.
_SUMMARY_RE = re.compile(
    r"^(" + "|".join(_SUMMARY_KEYS) + r"):\s+([-+]?[\d.]+(?:[eE][-+]?\d+)?)"
)
_TPS_RE = re.compile(r"tps=(\d+)")


def _parse_training_output(lines: list[str]) -> dict:
    """Extract metrics from training output lines.

    Args:
        lines: Output lines of one training run, in order.

    Returns:
        A dict mapping each summary key found (see ``_SUMMARY_KEYS``) to its
        value as ``float``, plus ``"tps"`` taken from the last ``step=`` line
        that carries a ``tps=<int>`` field.  Keys that never appear are simply
        absent.  When a key appears more than once the last occurrence wins.

    A summary value that is not a valid number (for example ``1.2.3``) is
    skipped instead of raising ``ValueError``.
    """
    metrics: dict[str, float] = {}

    for line in lines:
        summary = _SUMMARY_RE.match(line.strip())
        if summary:
            try:
                metrics[summary.group(1)] = float(summary.group(2))
            except ValueError:
                pass  # malformed number: ignore this line, keep parsing

        # Later step lines overwrite earlier ones, leaving the final TPS.
        if line.startswith("step="):
            tps_m = _TPS_RE.search(line)
            if tps_m:
                metrics["tps"] = float(tps_m.group(1))

    return metrics
|
|
|
|
| |
| |
| |
|
|
def run_eval_after_training(extra_env: str | None = None) -> dict | None:
    """Run ``scripts/eval_quality.py`` and collect the metrics it prints.

    Args:
        extra_env: Optional ``"KEY=VALUE"`` override forwarded to ``build_env``.

    Returns:
        A dict of ``name -> float`` for every stdout line of the exact form
        ``name=number``, or ``None`` if the script times out (120 s), cannot
        be run, exits non-zero, or prints no such line.
    """
    child_env = build_env(extra_env)
    python_bin = os.path.join(_PROJECT_ROOT, ".venv", "bin", "python")
    eval_script = os.path.join(_PROJECT_ROOT, "scripts", "eval_quality.py")

    try:
        completed = subprocess.run(
            [python_bin, eval_script],
            cwd=_PROJECT_ROOT,
            env=child_env,
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired:
        print(" [ERROR] Eval timed out (120s)")
        return None
    except Exception as exc:
        print(f" [ERROR] Eval failed: {exc}")
        return None

    if completed.returncode != 0:
        print(f" [ERROR] Eval exited with code {completed.returncode}")
        # Tail of stdout (10 lines) followed by tail of stderr (5 lines).
        for stream, tail in ((completed.stdout, 10), (completed.stderr, 5)):
            for row in stream.split("\n")[-tail:]:
                print(f" {row}")
        return None

    # Keep only lines that are exactly "key=value" with a numeric-looking value.
    key_value = re.compile(r"^([\w]+)=([\d.eE+-]+)$")
    metrics = {}
    for row in completed.stdout.split("\n"):
        hit = key_value.match(row.strip())
        if hit is None:
            continue
        name, raw_value = hit.groups()
        try:
            metrics[name] = float(raw_value)
        except ValueError:
            # Matched the character class but is not a number (e.g. "1e+").
            continue

    return metrics or None
|
|
|
|
| |
| |
| |
|
|
def git_commit(message: str) -> bool:
    """Stage all changes in the project root and commit them.

    Args:
        message: Commit message.

    Returns:
        ``True`` if both ``git add -A`` and ``git commit`` succeeded, ``False``
        otherwise.  This function never raises for a git failure: a non-zero
        exit, a timeout (30 s per command) and an ``OSError`` such as a missing
        ``git`` executable or working directory are all reported as a warning,
        so a commit problem cannot abort the caller's loop.
    """
    commands = (
        ["git", "add", "-A"],
        ["git", "commit", "-m", message],
    )
    for cmd in commands:
        try:
            subprocess.run(cmd, cwd=_PROJECT_ROOT, check=True,
                           capture_output=True, timeout=30)
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired,
                OSError) as e:
            print(f" [WARN] Git commit failed: {e}")
            # Output is captured as bytes; show git's own explanation if any.
            stderr = getattr(e, "stderr", None)
            if stderr:
                print(f" {stderr.decode(errors='replace').strip()}")
            return False
    return True
|
|
|
|
| |
| |
| |
|
|
# Set by the SIGINT handler; the main loop checks it between generations.
_SHUTDOWN = False


def _handle_sigint(signum, frame):
    """SIGINT handler: the first Ctrl+C requests shutdown, the second exits.

    The first call sets the module-level ``_SHUTDOWN`` flag and returns.
    A call made while the flag is already set exits the process with status 1.
    """
    global _SHUTDOWN
    if not _SHUTDOWN:
        _SHUTDOWN = True
        print("\n[AUTORESEARCH] Ctrl+C received — finishing current gen then saving state...")
        return

    print("\n[AUTORESEARCH] Double Ctrl+C — force exit")
    sys.exit(1)
|
|
|
|
# Per-run evaluation fields stored in every history entry (0 when unknown).
_EVAL_FIELDS = ("ppl", "bleu4", "rouge_l", "factual", "bpb", "repetition_rate")


def _history_entry(gen, mutation, *, quality, reference, delta, tps, kept,
                   eval_metrics=None, bpb=None) -> dict:
    """Build one history record with the same keys and key order every time."""
    scores = eval_metrics or {}
    entry = {
        "gen": gen,
        "mutation": mutation,
        "quality_score": quality,
        "baseline_score": reference,
        "delta": delta,
        "tps": tps,
    }
    for field in _EVAL_FIELDS:
        entry[field] = scores.get(field, 0)
    if bpb is not None:
        # Explicit override, used when only the training-time bpb is known.
        entry["bpb"] = bpb
    entry["kept"] = kept
    return entry


def _record_generation(state: dict, gen: int, name: str, entry: dict) -> None:
    """Mark mutation ``name`` as tested, append its entry and persist state."""
    state["mutations_tested"].append(name)
    state["current_gen"] = gen
    state["history"].append(entry)
    save_state(state)


def _run_baseline(state: dict) -> bool:
    """Train and evaluate the unmodified config; return False on failure."""
    print("[GEN 0] Running baseline training + evaluation...")
    train_metrics = run_training(state["time_budget"])
    if train_metrics is None:
        print("[FAIL] Baseline training failed")
        save_state(state)
        return False

    print("[GEN 0] Running quality evaluation...")
    eval_metrics = run_eval_after_training()
    if eval_metrics is None:
        print("[FAIL] Baseline eval failed")
        save_state(state)
        return False

    baseline_tps = train_metrics.get("tps", 0)
    baseline_quality = eval_metrics.get("quality_score", 0)

    state["baseline_quality"] = baseline_quality
    state["baseline_tps"] = baseline_tps
    state["current_gen"] = 0
    state["history"].append(_history_entry(
        0, "baseline",
        quality=baseline_quality, reference=baseline_quality,
        delta="0.0%", tps=baseline_tps, kept=True,
        eval_metrics=eval_metrics,
    ))
    save_state(state)
    print(f"[GEN 0] BASELINE: quality={baseline_quality:.4f} tps={baseline_tps:.0f}")
    return True


def _test_mutation(state: dict, gen: int, mutation: dict,
                   current_quality: float) -> dict | None:
    """Train and evaluate one mutation and return its history entry.

    Returns ``None`` when training or evaluation failed while a shutdown was
    pending.  Such a failure is most likely the user's Ctrl+C reaching the
    child process, so the mutation must not be recorded as tested.
    """
    name = mutation["name"]
    env_str = mutation["env"]

    print(f"\n[GEN {gen}] Testing {name} ({env_str})...")
    print(f" Current best quality: {current_quality:.4f}")

    print(f" Training ({state['time_budget']}s)...", flush=True)
    train_metrics = run_training(state["time_budget"], extra_env=env_str)
    if train_metrics is None:
        if _SHUTDOWN:
            return None
        print(f" [SKIP] Training failed for {name}")
        return _history_entry(gen, name, quality=0, reference=current_quality,
                              delta="FAIL", tps=0, kept=False)

    tps = train_metrics.get("tps", 0)

    # Too slow: reject without spending time on evaluation.
    if tps < state["tps_floor"]:
        print(f" [REJECT] TPS={tps:.0f} < floor={state['tps_floor']} — skipping eval")
        return _history_entry(gen, name, quality=0, reference=current_quality,
                              delta=f"TPS_FAIL({tps:.0f})", tps=tps, kept=False,
                              bpb=train_metrics.get("val_bpb", 0))

    print(" Evaluating...", flush=True)
    eval_metrics = run_eval_after_training(extra_env=env_str)
    if eval_metrics is None:
        if _SHUTDOWN:
            return None
        print(f" [SKIP] Eval failed for {name}")
        return _history_entry(gen, name, quality=0, reference=current_quality,
                              delta="EVAL_FAIL", tps=tps, kept=False)

    quality = eval_metrics.get("quality_score", 0)
    delta_pct = ((quality - current_quality) / max(abs(current_quality), 1e-6)) * 100
    delta_str = f"{delta_pct:+.1f}%"

    # The TPS floor was already enforced above, so only quality decides here.
    kept = quality > current_quality
    status = "KEEP" if kept else "DISCARD"
    print(f"\n[GEN {gen}] {name}: quality={quality:.4f} ({delta_str}) tps={tps:.0f} -> {status}")

    return _history_entry(gen, name, quality=quality, reference=current_quality,
                          delta=delta_str, tps=tps, kept=kept,
                          eval_metrics=eval_metrics)


def _print_summary(state: dict, current_quality: float) -> None:
    """Print the final totals and one table row per history entry."""
    print("\n" + "=" * 70)
    print("AUTORESEARCH COMPLETE")
    print("=" * 70)
    print(f"Total generations: {state['current_gen']}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Final quality: {current_quality:.4f}")
    if state["baseline_quality"]:
        total_delta = ((current_quality - state["baseline_quality"]) /
                       max(abs(state["baseline_quality"]), 1e-6)) * 100
        print(f"Total improvement: {total_delta:+.1f}%")
    print()

    print(f"{'Gen':>4} {'Mutation':>20} {'Quality':>8} {'Delta':>8} {'TPS':>7} {'PPL':>8} {'BPB':>7} {'Kept':>5}")
    print("-" * 75)
    for h in state["history"]:
        # .get() throughout: entries loaded from an older state file may lack
        # some fields, and a missing field should not crash the summary.
        print(f"{h.get('gen', 0):4d} {h.get('mutation', ''):>20s} {h.get('quality_score', 0):8.4f} "
              f"{h.get('delta', ''):>8s} {h.get('tps', 0):7.0f} {h.get('ppl', 0):8.2f} "
              f"{h.get('bpb', 0):7.4f} {' YES' if h.get('kept') else ' NO'}")


def main():
    """Command-line entry point for the mutation loop.

    Steps:
      1. Install the SIGINT handler, parse arguments, load the saved state and
         overwrite its ``time_budget`` / ``tps_floor`` with the CLI values.
      2. Print the plan; with ``--dry-run`` list the remaining mutations and
         stop.
      3. Run the baseline if the state has none; with ``--baseline`` stop.
      4. For every mutation not yet tested: train, reject if TPS is below the
         floor, otherwise evaluate and keep it when quality beats the quality
         of the most recently kept entry.  A kept mutation is git-committed.
         State is saved after every generation.
      5. Print a summary table.

    A shutdown request (Ctrl+C) is honored between generations.  If a run
    fails while a shutdown is pending, the mutation is left untested so that
    it is retried on resume instead of being recorded as a failure.

    NOTE(review): each mutation is trained with only its own environment
    override; previously kept mutations are not re-applied to later runs.
    Confirm that this is intended.
    """
    signal.signal(signal.SIGINT, _handle_sigint)

    parser = argparse.ArgumentParser(description="HYDRA autoresearch mutation loop")
    parser.add_argument("--dry-run", action="store_true", help="Show plan, don't train")
    parser.add_argument("--baseline", action="store_true", help="Only run baseline")
    parser.add_argument("--time-budget", type=int, default=600, help="Time budget per run (s)")
    parser.add_argument("--tps-floor", type=int, default=62000, help="Minimum acceptable TPS")
    args = parser.parse_args()

    state = load_state()
    state["time_budget"] = args.time_budget
    state["tps_floor"] = args.tps_floor

    tested = set(state["mutations_tested"])
    remaining = [m for m in MUTATIONS if m["name"] not in tested]

    print("=" * 70)
    print("HYDRA AUTORESEARCH MUTATION LOOP")
    print("=" * 70)
    print(f"Time budget per run: {state['time_budget']}s")
    print(f"TPS floor: {state['tps_floor']}")
    print(f"Current gen: {state['current_gen']}")
    print(f"Mutations tested: {len(tested)}/{len(MUTATIONS)}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Remaining: {[m['name'] for m in remaining]}")
    print()

    if args.dry_run:
        print("[DRY RUN] Would test these mutations in order:")
        for i, m in enumerate(remaining):
            print(f" {i + 1}. {m['name']} ({m['env']})")
        return

    # Baseline: run it once, or report the stored one when resuming.
    if state["baseline_quality"] is None:
        if not _run_baseline(state):
            return
    else:
        print(f"[RESUME] Baseline quality={state['baseline_quality']:.4f} tps={state['baseline_tps']:.0f}")
    if args.baseline:
        return

    # The bar to beat is the most recently kept entry (the baseline counts).
    kept_entries = [h for h in state["history"] if h.get("kept")]
    if kept_entries:
        current_quality = kept_entries[-1]["quality_score"]
    else:
        current_quality = state["baseline_quality"]

    for mutation in remaining:
        if _SHUTDOWN:
            print("[AUTORESEARCH] Shutdown requested — saving state")
            save_state(state)
            return

        gen = state["current_gen"] + 1
        name = mutation["name"]

        entry = _test_mutation(state, gen, mutation, current_quality)
        if entry is None:
            print(f" [INTERRUPTED] {name} did not finish — it will be retried on resume")
            save_state(state)
            return

        if entry["kept"]:
            current_quality = entry["quality_score"]
            state["mutations_kept"].append(name)
            git_commit(f"autoresearch: gen {gen} — {name} quality {entry['delta']}")

        _record_generation(state, gen, name, entry)

    _print_summary(state, current_quality)
|
|
|
|
# Run the loop only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|