# -*- coding: utf-8 -*-
r"""
make_property_glossary.py
Summarize properties from AttackPlan JSONL and merge semantics from
kb/specs/point_semantics.json.

Usage:
  python scripts/make_property_glossary.py \
    --src scripts/train_attackplan.filtered.jsonl \
    --sem kb/specs/point_semantics.json \
    [--out kb/cheatsheets/property_glossary.md]
"""
from __future__ import annotations

import argparse
import json
import math
import statistics
from collections import Counter, defaultdict
from pathlib import Path

# Default location of the generated Markdown glossary (overridable via --out).
DEFAULT_OUT = "kb/cheatsheets/property_glossary.md"

# Maps the semantics file's "confidence" value to the label shown in the
# glossary; any other value is rendered as "[Unverified]".
_CONFIDENCE_LABELS = {
    "high": "[Verified]",
    "medium": "[Inference]",
    "low": "[Unverified]",
}

# Number of raw example values listed per property.
_MAX_EXAMPLES = 6


def load_plans(src: Path):
    """Load AttackPlan records from a JSONL file.

    Args:
        src: Path to the JSONL file, read as UTF-8 (a leading BOM is
            tolerated).

    Returns:
        A list of the decoded JSON objects that are dicts whose ``"plan"``
        value is a list, in file order. Blank lines, lines that are not
        valid JSON, and records of any other shape are skipped silently.
    """
    plans = []
    # Split on "\n" only: str.splitlines() also breaks on U+2028, U+2029 and
    # U+0085, which may legally appear unescaped inside JSON strings, and
    # would cut such a record in two. Text-mode reading has already
    # normalised "\r\n" and "\r" to "\n".
    for raw_line in src.read_text(encoding="utf-8-sig").split("\n"):
        text = raw_line.strip()
        if not text:
            continue
        try:
            obj = json.loads(text)
        except (ValueError, RecursionError):
            # Best-effort loader: json.JSONDecodeError is a ValueError;
            # RecursionError covers pathologically nested input.
            continue
        if isinstance(obj, dict) and isinstance(obj.get("plan"), list):
            plans.append(obj)
    return plans


def coerce_num(x):
    """Return ``float(str(x))``, or ``None`` when that conversion fails.

    Going through ``str`` means booleans are not treated as numbers
    (``"True"`` is not a valid float literal), and neither is ``None``.
    """
    try:
        return float(str(x))
    except (TypeError, ValueError):
        return None


def _collect_usage(plans):
    """Tally point, op and scope.apply usage and gather attack values.

    Returns a ``(freq, vals, ops, applys)`` tuple: three Counters keyed by
    point, op and scope.apply, plus a dict mapping each point to the list
    of its non-None ``attack_value`` entries.
    """
    freq = Counter()
    vals = defaultdict(list)
    ops = Counter()
    applys = Counter()
    for record in plans:
        for item in record.get("plan", []):
            if not isinstance(item, dict):
                # A malformed entry has no fields to read; skip it rather
                # than abort the whole run.
                continue
            point = item.get("point", "")
            freq[point] += 1
            ops[item.get("op", "set")] += 1
            scope = item.get("scope")
            # A missing, empty or non-dict scope counts as the default.
            apply_to = scope.get("apply", "both") if isinstance(scope, dict) else "both"
            applys[apply_to] += 1
            value = item.get("attack_value", None)
            if value is not None:
                vals[point].append(value)
    return freq, vals, ops, applys


def _numeric_stats(values) -> str:
    """Format min / median / max of the numeric entries, or ``"[n/a]"``."""
    candidates = (coerce_num(v) for v in values)
    # NaN has no ordering, so it would make min/max/median depend on input
    # order; leave it out of the statistics.
    nums = [n for n in candidates if n is not None and not math.isnan(n)]
    if not nums:
        return "[n/a]"
    return (
        f"min={min(nums):.3f}, "
        f"p50={statistics.median(nums):.3f}, "
        f"max={max(nums):.3f}"
    )


def _render_glossary(plans, props_sem, src) -> str:
    """Build the Markdown glossary text for *plans*.

    Args:
        plans: Records as returned by ``load_plans``.
        props_sem: Mapping of property name to its semantics entry (the
            ``unit``, ``confidence`` and ``notes`` keys are read).
        src: Source path, echoed in the report header.
    """
    freq, vals, ops, applys = _collect_usage(plans)
    lines = [
        "# Property Glossary (auto-generated)\n",
        "**REALITY FILTER:** Items reflect your filtered dataset and semantics. "
        "If a property has low confidence in the semantics file, verify before relying on it.\n",
        f"- Source file: `{src}`\n",
        f"- Total plans: {len(plans)}\n",
        f"- Operation distribution: {dict(ops)}\n",
        f"- Scope.apply distribution: {dict(applys)}\n",
        "\n---\n## Properties\n",
    ]
    # Most frequently used properties come first.
    for prop, count in freq.most_common():
        semp = props_sem.get(prop)
        if not isinstance(semp, dict):
            semp = {}
        unit = semp.get("unit", "[n/a]")
        conf = semp.get("confidence", "unknown")
        notes_text = semp.get("notes", "") or "[n/a]"
        conf_lbl = (
            _CONFIDENCE_LABELS.get(conf, "[Unverified]")
            if isinstance(conf, str)
            else "[Unverified]"
        )
        prop_vals = vals.get(prop, [])
        ex_vals = prop_vals[:_MAX_EXAMPLES]
        stats = _numeric_stats(prop_vals)
        lines.append(
            f"### `{prop}` \n"
            f"- **count:** {count} \n"
            f"- **unit:** {unit} \n"
            f"- **confidence:** {conf_lbl} \n"
            f"- **notes:** {notes_text} \n"
            f"- **examples:** {ex_vals} \n"
            f"- **numeric stats:** {stats}\n"
        )
    return "\n".join(lines)


def main(argv: list[str] | None = None):
    """Command-line entry point: write the property glossary.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: If the source file yields no AttackPlan records (or on
            argparse usage errors).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="AttackPlan JSONL (filtered)")
    ap.add_argument("--sem", required=True, help="Semantics JSON (kb/specs/point_semantics.json)")
    ap.add_argument("--out", default=DEFAULT_OUT, help=f"Output Markdown path (default: {DEFAULT_OUT})")
    args = ap.parse_args(argv)

    src = Path(args.src)
    # utf-8-sig matches how the JSONL file is read, so a BOM-prefixed
    # semantics file parses too.
    sem = json.loads(Path(args.sem).read_text(encoding="utf-8-sig"))
    props_sem = sem.get("properties") if isinstance(sem, dict) else None
    if not isinstance(props_sem, dict):
        props_sem = {}

    plans = load_plans(src)
    if not plans:
        raise SystemExit(f"No AttackPlan rows in {src}")

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(_render_glossary(plans, props_sem, src), encoding="utf-8")
    print("[ok] wrote", out.resolve())


if __name__ == "__main__":
    main()