"""Tier-1 evaluation driver: baseline vs looped Laguna on knowledge-MC tasks. Runs a matrix of loop configs (baseline / layer / block / naive) over one or more multiple-choice tasks and reports acc, acc_norm, and the delta vs baseline (pp). Real run (GPU): uv run python scripts/tf_run_eval.py --model poolside/Laguna-XS.2 \ --tasks arc_challenge,openbookqa --dtype bfloat16 --device cuda Local plumbing check (CPU, tiny random model + REAL tokenizer + REAL dataset): uv run python scripts/tf_run_eval.py --tiny --tasks arc_challenge --limit 20 The --tiny path exercises the entire real code path (tokenizer, dataset loading, scoring, config matrix) against a tiny random-weight model; metrics are meaningless but it proves the GPU run is turn-key. """ from __future__ import annotations import argparse import json import sys import time from datetime import datetime from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) import torch from looped_laguna import LoopConfig, load_model_and_tokenizer from looped_laguna.eval import DATASET_LOADERS, format_results, run_matrix DEFAULT_TOKENIZER = str(Path(__file__).resolve().parent.parent / "laguna_src") def build_configs(num_layers: int, ks: list[int], width: int, center: float, controls: bool = True) -> dict: """Config matrix, named `{mode}-{strategy}-K{k}` (mode x strategy is a clean 2x2). strategy: rk = damped K-stage Runge-Kutta (the proposed method) naive = undamped x<-g(x) repeated K times (control, expected to collapse) mode: layer = iterate each window layer in place (required for MoE) block = iterate the whole window as a unit (control, routing thrash) We sweep K for the proposed config (layer-rk). With `controls=True` we also add the other three corners of the 2x2 at K_max (block-rk isolates mode; layer-naive isolates strategy; block-naive is the canonical "naive looped transformer"). Pass controls=False for a clean K-sweep (baseline + layer-rk only) that doesn't duplicate the controls a breadth run already covers. """ window = LoopConfig.from_depth_fraction(num_layers, center_frac=center, width=width).window configs: dict[str, LoopConfig | None] = {"baseline": None} for k in ks: configs[f"layer-rk-K{k}"] = LoopConfig(window=window, K=k, mode="layer") if controls: k_max = max(ks) configs[f"block-rk-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block") configs[f"layer-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="layer", naive=True) configs[f"block-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block", naive=True) return configs def main() -> None: p = argparse.ArgumentParser(description="Tier-1 eval: baseline vs looped Laguna on knowledge-MC tasks.") p.add_argument("--model", default="poolside/Laguna-XS.2", help="HF model id or local path of the real model (ignored with --tiny)") p.add_argument("--tokenizer", default=DEFAULT_TOKENIZER, help="tokenizer path/id (default: the vendored laguna_src)") p.add_argument("--tasks", default="arc_challenge", help="comma-separated task names (see DATASET_LOADERS in eval.py)") p.add_argument("--limit", type=int, default=None, help="max examples per task (default: all)") p.add_argument("--ks", default="2,3", help="comma-separated loop counts K to sweep for layer-rk") p.add_argument("--no-controls", dest="controls", action="store_false", help="only run baseline + layer-rk per K (skip block/naive controls), e.g. for a K-sweep") p.add_argument("--width", type=int, default=4, help="loop window width in layers") p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)") p.add_argument("--dtype", default="bfloat16", help="model dtype: bfloat16/float16/float32") p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto") p.add_argument("--batch-size", type=int, default=16, help="padded-batch size for scoring (bump to 32-64 on big GPUs; lower for long tasks)") p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model instead of real weights (plumbing check)") p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model") p.add_argument("--output", default=None, help="results JSON path (default: auto-named results_eval_.json)") p.add_argument("--no-save", action="store_true", help="do not write results to disk") p.add_argument("--no-peritem", dest="peritem", action="store_false", help="don't save per-item raw data (per-choice LLs/gold/subject) for re-aggregation") args = p.parse_args() ks = [int(x) for x in args.ks.split(",")] tasks = [t.strip() for t in args.tasks.split(",")] for t in tasks: if t not in DATASET_LOADERS: raise SystemExit(f"unknown task {t!r}; known: {sorted(DATASET_LOADERS)}") model, tokenizer = load_model_and_tokenizer( args.model, args.tokenizer, dtype=args.dtype, device=args.device, tiny=args.tiny, tiny_layers=args.tiny_layers, ) num_layers = model.config.num_hidden_layers configs = build_configs(num_layers, ks, args.width, args.center, controls=args.controls) window = next(c.window for c in configs.values() if c is not None) print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}") print(f"configs: {list(configs)}") # Always persist (incrementally) unless --no-save, so a crash never loses finished work. save_path = None if args.no_save else ( args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json" ) meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers, "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit} all_results: dict = {} # Per-item raw data (per-choice LLs + gold/subject/lengths) for later re-aggregation. peritem_path = str(Path(save_path).with_suffix("")) + "_peritem.jsonl" if (save_path and args.peritem) else None def save() -> None: if save_path: Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2)) if save_path: print(f"saving results to {save_path} (updated after every config)") if peritem_path: Path(peritem_path).write_text("") # truncate; run_matrix appends one line per task print(f"saving per-item data to {peritem_path}") for task in tasks: examples = DATASET_LOADERS[task](limit=args.limit) print(f"\n=== {task} ({len(examples)} items) ===") all_results[task] = {} # partial configs land here as they finish t0 = time.time() results = run_matrix( model, tokenizer, examples, configs, batch_size=args.batch_size, peritem_path=peritem_path, task_name=task, on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()), ) all_results[task] = results # full results, now with deltas + significance save() for line in format_results(results): print(line) print(f" ({time.time() - t0:.1f}s)") if save_path: print(f"\nresults saved to {save_path}") if __name__ == "__main__": main()