Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """HYDRA Autoresearch Mutation Loop. | |
| Runs baseline training -> evaluates -> picks ONE mutation at a time -> | |
| trains -> evaluates -> keeps if quality improves AND tps >= floor. | |
| Repeats until all mutations exhausted or Ctrl+C. | |
| State persisted in .omc/autoresearch_config.json for resume support. | |
| Usage: | |
| python scripts/autoresearch.py # run full loop | |
| python scripts/autoresearch.py --dry-run # show plan, don't train | |
| python scripts/autoresearch.py --baseline # only run baseline eval | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import math | |
| import os | |
| import re | |
| import signal | |
| import subprocess | |
| import sys | |
| import time | |
| from pathlib import Path | |
# Project root is the parent of the directory that contains this script.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put the project root at the front of the import path unless already listed.
if sys.path.count(_PROJECT_ROOT) == 0:
    sys.path.insert(0, _PROJECT_ROOT)
| # --------------------------------------------------------------------------- | |
| # Mutation catalog (ordered by expected impact) | |
| # --------------------------------------------------------------------------- | |
# Each catalog entry is {"name": <label>, "env": "<KEY>=<VALUE>"}; entries are
# tried in the order listed. The (name, env) pairs below are expanded into
# dicts by the comprehension.
MUTATIONS = [
    {"name": mutation_name, "env": env_assignment}
    for mutation_name, env_assignment in (
        # Learning dynamics — env vars verified in hydra/config.py
        ("lr_matrix_0.012", "HYDRA_MATRIX_LR=0.012"),  # default 0.12
        ("lr_matrix_0.06", "HYDRA_MATRIX_LR=0.06"),  # half default
        ("lr_matrix_0.24", "HYDRA_MATRIX_LR=0.24"),  # double default
        ("lr_floor_50pct", "HYDRA_LR_MIN_MULT=0.5"),  # default 0.0
        ("lr_floor_20pct", "HYDRA_LR_MIN_MULT=0.2"),  # default 0.0
        ("embed_lr_0.5", "HYDRA_EMBED_LR=0.5"),  # default 1.0
        ("embed_lr_2.0", "HYDRA_EMBED_LR=2.0"),  # default 1.0
        ("unembed_lr_0.01", "HYDRA_UNEMBED_LR=0.01"),  # default 0.005
        # Architecture — env vars verified in hydra/config.py
        ("d_model_384", "HYDRA_D_MODEL=384"),  # default 256
        ("d_model_192", "HYDRA_D_MODEL=192"),  # smaller
        ("d_state_128", "HYDRA_D_STATE=128"),  # default 64
        ("d_state_32", "HYDRA_D_STATE=32"),  # smaller
        ("n_layer_6", "HYDRA_N_LAYER=6"),  # default 4
        ("n_layer_3", "HYDRA_N_LAYER=3"),  # fewer
        ("headdim_16", "HYDRA_HEADDIM=16"),  # default 32 -> more heads
        ("headdim_64", "HYDRA_HEADDIM=64"),  # default 32 -> fewer heads
        ("expand_3", "HYDRA_EXPAND=3"),  # default 2
        ("engram_2048", "HYDRA_ENGRAM_N_COLUMNS=2048"),  # default 1024
        ("engram_4096", "HYDRA_ENGRAM_N_COLUMNS=4096"),  # default 1024
        ("engram_512", "HYDRA_ENGRAM_N_COLUMNS=512"),  # smaller
        # Batch size
        ("batch_32k", "HYDRA_TOTAL_BATCH=32768"),  # default 32768 (verify)
        ("batch_16k", "HYDRA_TOTAL_BATCH=16384"),  # smaller batch
        ("batch_65k", "HYDRA_TOTAL_BATCH=65536"),  # larger batch
        # Regularization — env vars verified in hydra/model.py + hydra/config.py
        ("dropout_0.05", "HYDRA_DROPOUT=0.05"),  # default 0.2
        ("dropout_0.1", "HYDRA_DROPOUT=0.1"),  # default 0.2
        ("dropout_0.3", "HYDRA_DROPOUT=0.3"),  # higher
    )
]
| # --------------------------------------------------------------------------- | |
| # State management | |
| # --------------------------------------------------------------------------- | |
# Loop state is persisted as JSON at <project root>/.omc/autoresearch_config.json.
STATE_DIR = os.path.join(_PROJECT_ROOT, ".omc")
STATE_FILE = os.path.join(STATE_DIR, "autoresearch_config.json")

# Template for a fresh run; also the source of values for keys missing from a
# state file loaded from disk.
DEFAULT_STATE = dict(
    baseline_quality=None,
    baseline_tps=None,
    current_gen=0,
    mutations_tested=[],
    mutations_kept=[],
    tps_floor=62000,
    time_budget=600,
    history=[],
)
def load_state() -> dict:
    """Load the persisted loop state, or build a fresh default state.

    When ``STATE_FILE`` exists it is parsed as JSON and any key absent from it
    is filled in from ``DEFAULT_STATE``. Otherwise a copy of ``DEFAULT_STATE``
    is returned.

    Returns:
        A state dict that shares no mutable object with ``DEFAULT_STATE``, so
        appending to its lists never alters the module-level defaults.
    """
    import copy  # local import: only this function needs it

    if not os.path.exists(STATE_FILE):
        # Deep copy: a shallow dict() copy would hand out the very same list
        # objects that DEFAULT_STATE holds.
        return copy.deepcopy(DEFAULT_STATE)

    with open(STATE_FILE, "r") as f:
        state = json.load(f)

    # Backfill missing keys from defaults (copied, for the same reason).
    for key, default in DEFAULT_STATE.items():
        if key not in state:
            state[key] = copy.deepcopy(default)
    return state
def save_state(state: dict) -> None:
    """Persist ``state`` to ``STATE_FILE`` as indented JSON.

    ``STATE_DIR`` is created when missing. The JSON is first written to a
    temporary sibling file and then moved over ``STATE_FILE`` with
    ``os.replace``, so an interruption or a serialization error part-way
    through cannot leave a truncated state file behind.

    Args:
        state: JSON-serializable loop state.

    Raises:
        TypeError: If ``state`` contains a value ``json.dump`` cannot encode.
        OSError: If the directory or file cannot be written.
    """
    os.makedirs(STATE_DIR, exist_ok=True)
    tmp_path = STATE_FILE + ".tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(state, f, indent=2)
        os.replace(tmp_path, STATE_FILE)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up the partial file left by a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
| # --------------------------------------------------------------------------- | |
| # Training subprocess | |
| # --------------------------------------------------------------------------- | |
def build_env(extra_env: str | None = None) -> dict[str, str]:
    """Build the environment for a training or eval subprocess.

    Starts from a copy of ``os.environ`` (which is left untouched), makes sure
    the two CUDA library directories are on ``LD_LIBRARY_PATH``, then applies
    one optional ``KEY=VALUE`` override.

    Args:
        extra_env: A single ``KEY=VALUE`` assignment; ``None`` or an empty
            string means no override. Only the first ``=`` separates key from
            value, so the value may itself contain ``=``.

    Returns:
        The new environment mapping.

    Raises:
        ValueError: If ``extra_env`` is non-empty but contains no ``=``.
    """
    env = os.environ.copy()

    # Ensure CUDA paths. Each missing directory is prepended, so the one
    # listed last ends up first in the result.
    existing = env.get("LD_LIBRARY_PATH", "")
    for lib_dir in ("/usr/lib/wsl/lib", "/usr/local/cuda/lib64"):
        # Compare against whole path entries: a substring test would treat
        # "/usr/lib/wsl/lib" as present when only "/usr/lib/wsl/lib64" is.
        if lib_dir not in existing.split(":"):
            # Avoid a trailing ":" when there was nothing to prepend to; an
            # empty entry makes the loader search the current directory.
            existing = f"{lib_dir}:{existing}" if existing else lib_dir
    env["LD_LIBRARY_PATH"] = existing

    # Apply mutation env var
    if extra_env:
        key, sep, val = extra_env.partition("=")
        if not sep:
            raise ValueError(f"extra_env must look like KEY=VALUE, got {extra_env!r}")
        env[key] = val
    return env
def run_training(time_budget: int, extra_env: str | None = None) -> dict | None:
    """Run ``train.py`` in a subprocess and collect its metrics.

    The child is started with the project's ``.venv`` interpreter from
    ``_PROJECT_ROOT``, with the environment from ``build_env(extra_env)`` plus
    ``HYDRA_TIME_BUDGET``. Its combined stdout/stderr is read line by line;
    every 50th ``step=`` line and any line mentioning ``val_bpb`` or
    ``factual_english_score`` is echoed as progress.

    Args:
        time_budget: Value exported to the child as ``HYDRA_TIME_BUDGET``.
        extra_env: Optional ``KEY=VALUE`` override passed to ``build_env``.

    Returns:
        The dict produced by ``_parse_training_output``, or ``None`` when the
        process could not be started or exited with a non-zero code.
    """
    env = build_env(extra_env)
    env["HYDRA_TIME_BUDGET"] = str(time_budget)
    cmd = [os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"), "-u", "train.py"]
    try:
        proc = subprocess.Popen(
            cmd,
            cwd=_PROJECT_ROOT,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )
    except Exception as e:
        print(f" [ERROR] Failed to start training: {e}")
        return None

    output_lines: list[str] = []
    try:
        for raw_line in proc.stdout:
            line = raw_line.rstrip()
            output_lines.append(line)
            if line.startswith("step="):
                # Print progress every 50 steps
                m = re.search(r"step=(\d+)", line)
                if m and int(m.group(1)) % 50 == 0:
                    tps_m = re.search(r"tps=(\d+)", line)
                    bpb_m = re.search(r"bpb=([\d.]+)", line)
                    tps = tps_m.group(1) if tps_m else "?"
                    bpb = bpb_m.group(1) if bpb_m else "?"
                    print(f" step={m.group(1)} tps={tps} bpb={bpb}", flush=True)
            elif "val_bpb" in line or "factual_english_score" in line:
                print(f" {line}", flush=True)
        proc.wait()
    except BaseException:
        # Covers KeyboardInterrupt and also the SystemExit raised by the
        # force-exit signal handler: never leave the trainer running behind us.
        proc.terminate()
        proc.wait()
        raise
    finally:
        # Release the pipe on every path.
        proc.stdout.close()

    if proc.returncode != 0:
        print(f" [ERROR] Training exited with code {proc.returncode}")
        # Print last 10 lines for debugging
        for line in output_lines[-10:]:
            print(f" {line}")
        return None
    return _parse_training_output(output_lines)
def _parse_training_output(lines: list[str]) -> dict:
    """Extract metrics from training output lines.

    Two kinds of lines are recognised:

    * Summary lines ``<key>: <number>`` (leading/trailing whitespace ignored)
      for a fixed set of keys; the number may use scientific notation.
    * Lines starting with ``step=``; the integer after ``tps=`` is stored
      under ``"tps"``.

    When a key occurs more than once the last occurrence wins, so ``"tps"``
    reflects the final step line.

    Args:
        lines: Output lines of the training process, without line endings.

    Returns:
        Mapping from metric name to float. Keys never seen are absent, and a
        summary value that is not a valid number is skipped instead of
        raising.
    """
    summary_keys = (
        "val_bpb", "training_seconds", "peak_vram_mb", "mfu_percent",
        "total_tokens_M", "num_steps", "factual_english_score",
        "factual_english_hits",
    )
    # One compiled alternation instead of one regex per key per line. The
    # optional exponent keeps "1e-3" from being read as 1.0.
    summary_re = re.compile(
        r"^(" + "|".join(summary_keys) + r"):\s+([\d.]+(?:[eE][-+]?\d+)?)"
    )

    metrics: dict[str, float] = {}
    for line in lines:
        # Key=value pairs from summary block
        m = summary_re.match(line.strip())
        if m:
            try:
                metrics[m.group(1)] = float(m.group(2))
            except ValueError:
                # "[\d.]+" also matches non-numbers such as "1.2.3" or "...";
                # ignore the malformed value rather than abort the parse.
                pass
        # TPS from last step line
        if line.startswith("step="):
            tps_m = re.search(r"tps=(\d+)", line)
            if tps_m:
                metrics["tps"] = float(tps_m.group(1))
    return metrics
| # --------------------------------------------------------------------------- | |
| # Eval integration | |
| # --------------------------------------------------------------------------- | |
def run_eval_after_training(extra_env: str | None = None) -> dict | None:
    """Run ``scripts/eval_quality.py`` and parse its ``name=number`` output.

    The script is executed with the project's ``.venv`` interpreter from
    ``_PROJECT_ROOT`` under the environment from ``build_env(extra_env)``,
    with a 120 second limit.

    Args:
        extra_env: Optional ``KEY=VALUE`` override passed to ``build_env``.

    Returns:
        Mapping of metric name to float, or ``None`` when the script times
        out, cannot be run, exits non-zero, or prints no parsable metric.
    """
    child_env = build_env(extra_env)
    python_bin = os.path.join(_PROJECT_ROOT, ".venv", "bin", "python")
    eval_script = os.path.join(_PROJECT_ROOT, "scripts", "eval_quality.py")

    try:
        completed = subprocess.run(
            [python_bin, eval_script],
            cwd=_PROJECT_ROOT,
            env=child_env,
            capture_output=True,
            text=True,
            timeout=120,  # 2 min max for eval
        )
    except subprocess.TimeoutExpired:
        print(" [ERROR] Eval timed out (120s)")
        return None
    except Exception as e:
        print(f" [ERROR] Eval failed: {e}")
        return None

    if completed.returncode != 0:
        print(f" [ERROR] Eval exited with code {completed.returncode}")
        # Tail of both streams for debugging: 10 stdout lines, 5 stderr lines.
        stdout_tail = completed.stdout.split("\n")[-10:]
        stderr_tail = completed.stderr.split("\n")[-5:]
        for tail_line in stdout_tail + stderr_tail:
            print(f" {tail_line}")
        return None

    # Keep only lines that consist of exactly one name=number pair.
    parsed = {}
    for raw in completed.stdout.split("\n"):
        hit = re.match(r"^([\w]+)=([\d.eE+-]+)$", raw.strip())
        if hit is None:
            continue
        metric_name, number_text = hit.groups()
        try:
            parsed[metric_name] = float(number_text)
        except ValueError:
            # The character class admits strings like "1e+" that are not floats.
            continue
    return parsed or None
| # --------------------------------------------------------------------------- | |
| # Git operations | |
| # --------------------------------------------------------------------------- | |
def git_commit(message: str) -> bool:
    """Stage all changes under ``_PROJECT_ROOT`` and commit them.

    Runs ``git add -A`` followed by ``git commit -m <message>``, each with a
    30 second timeout. Committing is best-effort: every failure is reported
    as a warning and signalled through the return value, never raised.

    Args:
        message: Commit message.

    Returns:
        ``True`` when both git commands succeed; ``False`` when either exits
        non-zero, times out, or cannot be launched at all (git missing or the
        working directory not accessible).
    """
    try:
        for git_args in (["git", "add", "-A"], ["git", "commit", "-m", message]):
            subprocess.run(
                git_args,
                cwd=_PROJECT_ROOT, check=True, capture_output=True, timeout=30,
            )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
        # OSError covers FileNotFoundError when the git executable or the cwd
        # does not exist; without it the whole loop would crash here.
        print(f" [WARN] Git commit failed: {e}")
        return False
    return True
| # --------------------------------------------------------------------------- | |
| # Main loop | |
| # --------------------------------------------------------------------------- | |
# Becomes True after the first Ctrl+C.
_SHUTDOWN = False


def _handle_sigint(signum, frame):
    """SIGINT handler: the first signal sets ``_SHUTDOWN``, the second exits.

    Args:
        signum: Signal number (unused).
        frame: Interrupted stack frame (unused).
    """
    global _SHUTDOWN
    if not _SHUTDOWN:
        _SHUTDOWN = True
        print("\n[AUTORESEARCH] Ctrl+C received — finishing current gen then saving state...")
        return
    # A shutdown was already pending: stop immediately with exit status 1.
    print("\n[AUTORESEARCH] Double Ctrl+C — force exit")
    sys.exit(1)
def _history_entry(gen, mutation, quality, baseline, delta, tps, kept,
                   eval_metrics=None, bpb=None):
    """Build one history record; eval metrics that are missing default to 0."""
    em = eval_metrics or {}
    return {
        "gen": gen,
        "mutation": mutation,
        "quality_score": quality,
        "baseline_score": baseline,
        "delta": delta,
        "tps": tps,
        "ppl": em.get("ppl", 0),
        "bleu4": em.get("bleu4", 0),
        "rouge_l": em.get("rouge_l", 0),
        "factual": em.get("factual", 0),
        # An explicit bpb (e.g. the training val_bpb) wins over the eval value.
        "bpb": em.get("bpb", 0) if bpb is None else bpb,
        "repetition_rate": em.get("repetition_rate", 0),
        "kept": kept,
    }


def _record_result(state, gen, name, entry):
    """Mark mutation ``name`` as tested for generation ``gen`` and save state."""
    state["mutations_tested"].append(name)
    state["current_gen"] = gen
    state["history"].append(entry)
    save_state(state)


def _abort_interrupted(state, name):
    """Save state without recording ``name`` so a resumed run retries it."""
    print(f" [ABORT] {name} interrupted by shutdown request — not recorded, will retry on resume")
    save_state(state)


def _print_summary(state, current_quality):
    """Print the final totals and the per-generation history table."""
    print("\n" + "=" * 70)
    print("AUTORESEARCH COMPLETE")
    print("=" * 70)
    print(f"Total generations: {state['current_gen']}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Final quality: {current_quality:.4f}")
    # "is not None" rather than truthiness: a baseline of exactly 0.0 is valid
    # and the denominator below is already guarded against zero.
    if state["baseline_quality"] is not None:
        total_delta = ((current_quality - state["baseline_quality"]) /
                       max(abs(state["baseline_quality"]), 1e-6)) * 100
        print(f"Total improvement: {total_delta:+.1f}%")
    print()
    # Print history table
    print(f"{'Gen':>4} {'Mutation':>20} {'Quality':>8} {'Delta':>8} {'TPS':>7} {'PPL':>8} {'BPB':>7} {'Kept':>5}")
    print("-" * 75)
    for h in state["history"]:
        print(f"{h['gen']:4d} {h['mutation']:>20s} {h['quality_score']:8.4f} "
              f"{h['delta']:>8s} {h['tps']:7.0f} {h['ppl']:8.2f} "
              f"{h.get('bpb', 0):7.4f} {' YES' if h['kept'] else ' NO'}")


def main():
    """Run the autoresearch loop: baseline first, then one mutation per gen.

    Flow:
      1. Install the SIGINT handler, parse CLI arguments, load persisted state
         and overwrite its ``time_budget`` / ``tps_floor`` with the CLI values.
      2. ``--dry-run`` prints the untested mutations and returns.
      3. If no baseline is stored, train + evaluate without any mutation and
         store the result as generation 0. ``--baseline`` returns here.
      4. For every untested mutation: train with its env override, reject it
         when TPS is below the floor, otherwise evaluate and keep it when its
         quality score beats the best kept score so far. State is saved after
         every generation; kept mutations are additionally committed to git.
      5. Print a summary table.

    A run whose training or eval fails while a shutdown is pending is treated
    as interrupted: it is not added to ``mutations_tested`` and will be
    retried when the loop is resumed.
    """
    signal.signal(signal.SIGINT, _handle_sigint)
    parser = argparse.ArgumentParser(description="HYDRA autoresearch mutation loop")
    parser.add_argument("--dry-run", action="store_true", help="Show plan, don't train")
    parser.add_argument("--baseline", action="store_true", help="Only run baseline")
    parser.add_argument("--time-budget", type=int, default=600, help="Time budget per run (s)")
    parser.add_argument("--tps-floor", type=int, default=62000, help="Minimum acceptable TPS")
    args = parser.parse_args()

    state = load_state()
    state["time_budget"] = args.time_budget
    state["tps_floor"] = args.tps_floor
    tested = set(state["mutations_tested"])
    remaining = [m for m in MUTATIONS if m["name"] not in tested]

    print("=" * 70)
    print("HYDRA AUTORESEARCH MUTATION LOOP")
    print("=" * 70)
    print(f"Time budget per run: {state['time_budget']}s")
    print(f"TPS floor: {state['tps_floor']}")
    print(f"Current gen: {state['current_gen']}")
    print(f"Mutations tested: {len(tested)}/{len(MUTATIONS)}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Remaining: {[m['name'] for m in remaining]}")
    print()

    if args.dry_run:
        print("[DRY RUN] Would test these mutations in order:")
        for i, m in enumerate(remaining):
            print(f" {i + 1}. {m['name']} ({m['env']})")
        return

    # -----------------------------------------------------------------------
    # Baseline (Gen 0)
    # -----------------------------------------------------------------------
    if state["baseline_quality"] is None:
        print("[GEN 0] Running baseline training + evaluation...")
        train_metrics = run_training(state["time_budget"])
        if train_metrics is None:
            print("[FAIL] Baseline training failed")
            save_state(state)
            return
        print("[GEN 0] Running quality evaluation...")
        eval_metrics = run_eval_after_training()
        if eval_metrics is None:
            print("[FAIL] Baseline eval failed")
            save_state(state)
            return
        baseline_tps = train_metrics.get("tps", 0)
        baseline_quality = eval_metrics.get("quality_score", 0)
        state["baseline_quality"] = baseline_quality
        state["baseline_tps"] = baseline_tps
        state["current_gen"] = 0
        state["history"].append(_history_entry(
            0, "baseline", baseline_quality, baseline_quality, "0.0%",
            baseline_tps, True, eval_metrics=eval_metrics,
        ))
        save_state(state)
        print(f"[GEN 0] BASELINE: quality={baseline_quality:.4f} tps={baseline_tps:.0f}")
    else:
        print(f"[RESUME] Baseline quality={state['baseline_quality']:.4f} tps={state['baseline_tps']:.0f}")
    if args.baseline:
        return

    # -----------------------------------------------------------------------
    # Mutation loop
    # -----------------------------------------------------------------------
    # Best quality so far comes from the last kept entry (the baseline entry
    # counts as kept), falling back to the stored baseline.
    current_quality = state["baseline_quality"]
    kept_entries = [h for h in state["history"] if h.get("kept")]
    if kept_entries:
        current_quality = kept_entries[-1]["quality_score"]

    # NOTE(review): each run receives only its own env override; previously
    # kept mutations are not re-applied to later runs. Confirm whether kept
    # mutations are meant to accumulate.
    for mutation in remaining:
        if _SHUTDOWN:
            print("[AUTORESEARCH] Shutdown requested — saving state")
            save_state(state)
            return

        gen = state["current_gen"] + 1
        name = mutation["name"]
        env_str = mutation["env"]
        print(f"\n[GEN {gen}] Testing {name} ({env_str})...")
        print(f" Current best quality: {current_quality:.4f}")

        # Train with mutation
        print(f" Training ({state['time_budget']}s)...", flush=True)
        train_metrics = run_training(state["time_budget"], extra_env=env_str)
        if train_metrics is None:
            if _SHUTDOWN:
                # The trainer most likely died from the same Ctrl+C; that is
                # not evidence against the mutation.
                _abort_interrupted(state, name)
                return
            print(f" [SKIP] Training failed for {name}")
            _record_result(state, gen, name, _history_entry(
                gen, name, 0, current_quality, "FAIL", 0, False, bpb=0))
            continue

        tps = train_metrics.get("tps", 0)
        # TPS floor check
        if tps < state["tps_floor"]:
            print(f" [REJECT] TPS={tps:.0f} < floor={state['tps_floor']} — skipping eval")
            _record_result(state, gen, name, _history_entry(
                gen, name, 0, current_quality, f"TPS_FAIL({tps:.0f})", tps, False,
                bpb=train_metrics.get("val_bpb", 0)))
            continue

        # Evaluate
        print(" Evaluating...", flush=True)
        eval_metrics = run_eval_after_training(extra_env=env_str)
        if eval_metrics is None:
            if _SHUTDOWN:
                _abort_interrupted(state, name)
                return
            print(f" [SKIP] Eval failed for {name}")
            _record_result(state, gen, name, _history_entry(
                gen, name, 0, current_quality, "EVAL_FAIL", tps, False, bpb=0))
            continue

        quality = eval_metrics.get("quality_score", 0)
        delta_pct = ((quality - current_quality) / max(abs(current_quality), 1e-6)) * 100
        delta_str = f"{delta_pct:+.1f}%"
        kept = quality > current_quality and tps >= state["tps_floor"]
        status = "KEEP" if kept else "DISCARD"
        entry = _history_entry(
            gen, name, quality, current_quality, delta_str, tps, kept,
            eval_metrics=eval_metrics,
        )
        print(f"\n[GEN {gen}] {name}: quality={quality:.4f} ({delta_str}) tps={tps:.0f} -> {status}")

        if kept:
            current_quality = quality
            state["mutations_kept"].append(name)
        _record_result(state, gen, name, entry)
        if kept:
            # Commit only after the state file is written, so the commit
            # contains this generation's result rather than the previous one.
            git_commit(f"autoresearch: gen {gen} — {name} quality {delta_str}")

    # -----------------------------------------------------------------------
    # Summary
    # -----------------------------------------------------------------------
    _print_summary(state, current_quality)


if __name__ == "__main__":
    main()