"""Build the frozen 200-incident hold-out evaluation set. Run:: python -m eval.make_holdout --out data/holdout.jsonl This file is committed to the repo so reviewers can verify reported numbers byte-for-byte without rerunning the generator. The seeds used here are *outside* the SFT and GRPO seed bands declared in `tasks/registry.py` (seed_offset 1k-4k for training, 90k-94k here) so there is zero overlap between train and eval. Each record:: { "alert": {...}, "events": [...], "ground_truth": "", "triggering_log_id": "", "stage": "", "seed": } `eval/eval.py` consumes this format directly. """ from __future__ import annotations import argparse import json import os import sys from collections import Counter _HERE = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.dirname(_HERE)) from generator import generate_incident, make_alert # noqa: E402 from verifier import compute_ground_truth # noqa: E402 # Seed bands — kept distinct from training seed bands. HOLDOUT_SEED_BAND = { "stage1_basic": 90_000, "stage2_multi": 91_000, "stage3_mixed": 92_000, "stage4_adversarial": 93_000, } def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--n-per-stage", type=int, default=50, help="Number of incidents per stage (default 50 → 200 total).") parser.add_argument("--out", default="data/holdout.jsonl") args = parser.parse_args() out_path = os.path.join(os.path.dirname(_HERE), args.out) os.makedirs(os.path.dirname(out_path), exist_ok=True) counts: Counter = Counter() written = 0 with open(out_path, "w", encoding="utf-8") as f: for stage_id, base in HOLDOUT_SEED_BAND.items(): for i in range(args.n_per_stage): seed = base + i params = generate_incident(stage_id, seed) alert = make_alert(params, alert_id=f"A-EVAL-{stage_id[-1]}-{seed}") gt, sig = compute_ground_truth(params) rec = { "alert": alert.model_dump(mode="json"), "events": [e.model_dump(mode="json") for e in params.events], "ground_truth": gt.value, "triggering_log_id": sig.triggering_log_id or params.events[0].log_id, "stage": stage_id, "seed": seed, } f.write(json.dumps(rec) + "\n") counts[gt.value] += 1 written += 1 print(f"Wrote {written} hold-out incidents to {out_path}") print("Label distribution:") for k, v in sorted(counts.items()): print(f" {k:<18} {v:4d} ({100 * v / written:5.1f}%)") if __name__ == "__main__": main()