| """Tier-1 evaluation driver: baseline vs looped Laguna on knowledge-MC tasks. |
| |
| Runs a matrix of loop configs (baseline / layer / block / naive) over one or more |
| multiple-choice tasks and reports acc, acc_norm, and the delta vs baseline (pp). |
| |
| Real run (GPU): |
| uv run python scripts/tf_run_eval.py --model poolside/Laguna-XS.2 \ |
| --tasks arc_challenge,openbookqa --dtype bfloat16 --device cuda |
| |
| Local plumbing check (CPU, tiny random model + REAL tokenizer + REAL dataset): |
| uv run python scripts/tf_run_eval.py --tiny --tasks arc_challenge --limit 20 |
| |
| The --tiny path exercises the entire real code path (tokenizer, dataset loading, |
| scoring, config matrix) against a tiny random-weight model; metrics are meaningless |
| but it proves the GPU run is turn-key. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| import time |
| from datetime import datetime |
| from pathlib import Path |
|
|
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| import torch |
|
|
| from looped_laguna import LoopConfig, load_model_and_tokenizer |
| from looped_laguna.eval import DATASET_LOADERS, format_results, run_matrix |
|
|
# Default tokenizer location: the `laguna_src` directory that sits one level above
# the directory containing this script (resolved to an absolute path, then stringified
# so it can be used directly as an argparse default).
DEFAULT_TOKENIZER = str(Path(__file__).resolve().parent.parent / "laguna_src")
|
|
|
|
def build_configs(
    num_layers: int, ks: list[int], width: int, center: float, controls: bool = True
) -> dict[str, LoopConfig | None]:
    """Build the config matrix, named `{mode}-{strategy}-K{k}` (mode x strategy is a clean 2x2).

    strategy: rk    = damped K-stage Runge-Kutta (the proposed method)
              naive = undamped x<-g(x) repeated K times (control, expected to collapse)
    mode:     layer = iterate each window layer in place (required for MoE)
              block = iterate the whole window as a unit (control, routing thrash)

    We sweep K for the proposed config (layer-rk). With `controls=True` we also add the
    other three corners of the 2x2 at K_max (block-rk isolates mode; layer-naive isolates
    strategy; block-naive is the canonical "naive looped transformer"). Pass controls=False
    for a clean K-sweep (baseline + layer-rk only) that doesn't duplicate the controls a
    breadth run already covers.

    Args:
        num_layers: total layer count of the model; forwarded to
            `LoopConfig.from_depth_fraction` to place the loop window.
        ks: loop counts K to sweep for layer-rk. May be empty, in which case only the
            baseline entry is returned (there is no K_max to build controls at).
        width: loop window width in layers.
        center: loop window center as a depth fraction.
        controls: whether to add the three control configs at `max(ks)`.

    Returns:
        Insertion-ordered mapping of config name -> `LoopConfig`. The first entry is
        always `"baseline"`, mapped to `None` (no looping). All looped configs share
        the same window.
    """
    # One window for the whole matrix so every config differs only in K / mode / strategy.
    window = LoopConfig.from_depth_fraction(num_layers, center_frac=center, width=width).window

    configs: dict[str, LoopConfig | None] = {"baseline": None}
    for k in ks:
        configs[f"layer-rk-K{k}"] = LoopConfig(window=window, K=k, mode="layer")

    # Guard on `ks`: max() of an empty sequence raises an opaque ValueError, and without
    # any K there is nothing to compare the controls against anyway.
    if controls and ks:
        k_max = max(ks)
        configs[f"block-rk-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block")
        configs[f"layer-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="layer", naive=True)
        configs[f"block-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block", naive=True)
    return configs
|
|
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: evaluate baseline and looped configs on each requested task.

    Flow:
      1. Parse and validate the command line (loop counts, task names, output location)
         before anything expensive happens.
      2. Load the model and tokenizer, then build the config matrix via `build_configs`.
      3. For each task, load its examples, run the matrix with `run_matrix`, print the
         formatted results, and (unless --no-save) rewrite the results JSON.

    The results JSON has the shape `{"meta": {...}, "results": {task: {config: result}}}`.
    Unless --no-peritem is given, a sibling `<output-stem>_peritem.jsonl` path is passed
    to `run_matrix` for per-item data.

    Raises:
        SystemExit: on an unparsable/empty --ks, an empty --tasks, or an unknown task name.
    """
    p = argparse.ArgumentParser(description="Tier-1 eval: baseline vs looped Laguna on knowledge-MC tasks.")
    p.add_argument("--model", default="poolside/Laguna-XS.2",
                   help="HF model id or local path of the real model (ignored with --tiny)")
    p.add_argument("--tokenizer", default=DEFAULT_TOKENIZER,
                   help="tokenizer path/id (default: the vendored laguna_src)")
    p.add_argument("--tasks", default="arc_challenge",
                   help="comma-separated task names (see DATASET_LOADERS in eval.py)")
    p.add_argument("--limit", type=int, default=None, help="max examples per task (default: all)")
    p.add_argument("--ks", default="2,3", help="comma-separated loop counts K to sweep for layer-rk")
    p.add_argument("--no-controls", dest="controls", action="store_false",
                   help="only run baseline + layer-rk per K (skip block/naive controls), e.g. for a K-sweep")
    p.add_argument("--width", type=int, default=4, help="loop window width in layers")
    p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
    p.add_argument("--dtype", default="bfloat16", help="model dtype: bfloat16/float16/float32")
    p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
    p.add_argument("--batch-size", type=int, default=16,
                   help="padded-batch size for scoring (bump to 32-64 on big GPUs; lower for long tasks)")
    p.add_argument("--tiny", action="store_true",
                   help="use a tiny random-weight model instead of real weights (plumbing check)")
    p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
    p.add_argument("--output", default=None,
                   help="results JSON path (default: auto-named results_eval_<timestamp>.json)")
    p.add_argument("--no-save", action="store_true", help="do not write results to disk")
    p.add_argument("--no-peritem", dest="peritem", action="store_false",
                   help="don't save per-item raw data (per-choice LLs/gold/subject) for re-aggregation")
    args = p.parse_args()

    # --- cheap validation first: fail before the (slow) model load, with a readable message ---
    # Empty entries (e.g. a trailing comma) are skipped instead of crashing int("").
    try:
        ks = [int(x) for x in args.ks.split(",") if x.strip()]
    except ValueError:
        raise SystemExit(f"--ks must be a comma-separated list of integers, got {args.ks!r}") from None
    if not ks:
        raise SystemExit("--ks must name at least one loop count")

    # dict.fromkeys de-duplicates while keeping order: results are keyed by task name, so
    # evaluating the same task twice would only repeat work and overwrite the first run.
    tasks = list(dict.fromkeys(t.strip() for t in args.tasks.split(",") if t.strip()))
    if not tasks:
        raise SystemExit("--tasks must name at least one task")
    for t in tasks:
        if t not in DATASET_LOADERS:
            raise SystemExit(f"unknown task {t!r}; known: {sorted(DATASET_LOADERS)}")

    # Resolve output locations up front and make sure the directory exists, so a bad
    # --output path surfaces immediately rather than after the first config has been scored.
    save_path = None if args.no_save else (
        args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
    )
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    # Per-item file sits next to the results file: "<output minus suffix>_peritem.jsonl".
    peritem_path = (
        str(Path(save_path).with_suffix("")) + "_peritem.jsonl"
        if (save_path and args.peritem)
        else None
    )

    model, tokenizer = load_model_and_tokenizer(
        args.model, args.tokenizer, dtype=args.dtype, device=args.device,
        tiny=args.tiny, tiny_layers=args.tiny_layers,
    )
    num_layers = model.config.num_hidden_layers
    configs = build_configs(num_layers, ks, args.width, args.center, controls=args.controls)
    # Every looped config shares one window; `ks` is non-empty, so at least one exists.
    window = next(c.window for c in configs.values() if c is not None)
    print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
    print(f"configs: {list(configs)}")

    meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
            "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
    all_results: dict = {}

    def save() -> None:
        """Rewrite the results JSON with everything collected so far (no-op with --no-save)."""
        if not save_path:
            return
        payload = json.dumps({"meta": meta, "results": all_results}, indent=2)
        # Write to a sibling temp file and rename over the target, so an interrupt in the
        # middle of a save cannot truncate results that were already on disk.
        tmp_path = Path(f"{save_path}.tmp")
        tmp_path.write_text(payload)
        tmp_path.replace(save_path)

    if save_path:
        print(f"saving results to {save_path} (updated after every config)")
    if peritem_path:
        # Start from an empty per-item file for this run.
        Path(peritem_path).write_text("")
        print(f"saving per-item data to {peritem_path}")

    for task in tasks:
        examples = DATASET_LOADERS[task](limit=args.limit)
        print(f"\n=== {task} ({len(examples)} items) ===")
        all_results[task] = {}
        t0 = time.time()

        # Callback handed to run_matrix (presumably invoked once per finished config, per
        # the "updated after every config" message above — confirm against eval.py).
        # The task is bound as a default argument so the closure cannot late-bind to a
        # later loop iteration.
        def on_result(name, r, t=task) -> None:
            all_results[t][name] = r
            save()

        results = run_matrix(
            model, tokenizer, examples, configs, batch_size=args.batch_size,
            peritem_path=peritem_path, task_name=task,
            on_result=on_result,
        )
        # Replace the incrementally built entry with the final mapping returned by run_matrix.
        all_results[task] = results
        save()
        for line in format_results(results):
            print(line)
        print(f"  ({time.time() - t0:.1f}s)")

    if save_path:
        print(f"\nresults saved to {save_path}")
|
|
|
|
# Run the evaluation CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|