| """Build the frozen 200-incident hold-out evaluation set. |
| |
| Run:: |
| |
| python -m eval.make_holdout --out data/holdout.jsonl |
| |
| This file is committed to the repo so reviewers can verify reported |
| numbers byte-for-byte without rerunning the generator. The seeds used |
| here are *outside* the SFT and GRPO seed bands declared in |
| `tasks/registry.py` (seed_offset 1k-4k for training, 90k-94k here) so |
| there is zero overlap between train and eval. |
| |
| Each record:: |
| |
| { "alert": {...}, "events": [...], "ground_truth": "<action>", |
| "triggering_log_id": "<id>", "stage": "<stage>", "seed": <int> } |
| |
| `eval/eval.py` consumes this format directly. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import sys |
| from collections import Counter |
|
|
| _HERE = os.path.dirname(os.path.abspath(__file__)) |
| sys.path.insert(0, os.path.dirname(_HERE)) |
|
|
| from generator import generate_incident, make_alert |
| from verifier import compute_ground_truth |
|
|
|
|
| |
| HOLDOUT_SEED_BAND = { |
| "stage1_basic": 90_000, |
| "stage2_multi": 91_000, |
| "stage3_mixed": 92_000, |
| "stage4_adversarial": 93_000, |
| } |
|
|
|
|
| def main() -> None: |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--n-per-stage", type=int, default=50, |
| help="Number of incidents per stage (default 50 → 200 total).") |
| parser.add_argument("--out", default="data/holdout.jsonl") |
| args = parser.parse_args() |
|
|
| out_path = os.path.join(os.path.dirname(_HERE), args.out) |
| os.makedirs(os.path.dirname(out_path), exist_ok=True) |
|
|
| counts: Counter = Counter() |
| written = 0 |
| with open(out_path, "w", encoding="utf-8") as f: |
| for stage_id, base in HOLDOUT_SEED_BAND.items(): |
| for i in range(args.n_per_stage): |
| seed = base + i |
| params = generate_incident(stage_id, seed) |
| alert = make_alert(params, alert_id=f"A-EVAL-{stage_id[-1]}-{seed}") |
| gt, sig = compute_ground_truth(params) |
| rec = { |
| "alert": alert.model_dump(mode="json"), |
| "events": [e.model_dump(mode="json") for e in params.events], |
| "ground_truth": gt.value, |
| "triggering_log_id": sig.triggering_log_id or params.events[0].log_id, |
| "stage": stage_id, |
| "seed": seed, |
| } |
| f.write(json.dumps(rec) + "\n") |
| counts[gt.value] += 1 |
| written += 1 |
| print(f"Wrote {written} hold-out incidents to {out_path}") |
| print("Label distribution:") |
| for k, v in sorted(counts.items()): |
| print(f" {k:<18} {v:4d} ({100 * v / written:5.1f}%)") |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|