"""
make_property_glossary.py

Summarize properties from AttackPlan JSONL and merge semantics from
kb/specs/point_semantics.json.

Usage:
    python scripts/make_property_glossary.py \
        --src scripts/train_attackplan.filtered.jsonl \
        --sem kb/specs/point_semantics.json
"""
|
from __future__ import annotations

import argparse, json, statistics
from pathlib import Path
from collections import Counter, defaultdict
|
| |
|
def load_plans(src: Path):
    """Read AttackPlan rows from a JSONL file.

    Blank lines and lines that fail to parse as JSON are silently skipped
    (best-effort loading); only dict rows whose ``"plan"`` key holds a list
    are kept.
    """
    kept = []
    for line in src.read_text(encoding="utf-8-sig").splitlines():
        text = line.strip()
        if not text:
            continue
        try:
            row = json.loads(text)
        except Exception:
            # Malformed line: ignore rather than abort the whole load.
            continue
        if isinstance(row, dict) and isinstance(row.get("plan"), list):
            kept.append(row)
    return kept
|
| |
|
def coerce_num(x):
    """Best-effort conversion of *x* to ``float``; return ``None`` on failure.

    The value is stringified first, so any object whose string form parses
    as a number is accepted.
    """
    try:
        return float(str(x))
    except Exception:
        return None
|
| |
|
def main():
    """CLI entry point: build a markdown property glossary from plan data.

    Reads the filtered AttackPlan JSONL (``--src``) and the semantics JSON
    (``--sem``), aggregates per-property frequencies and numeric stats, and
    writes a markdown glossary to ``--out`` (the default preserves the
    historical hard-coded path ``kb/cheatsheets/property_glossary.md``).

    Raises:
        SystemExit: if the source file yields no AttackPlan rows.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="AttackPlan JSONL (filtered)")
    ap.add_argument("--sem", required=True, help="Semantics JSON (kb/specs/point_semantics.json)")
    # Backward-compatible generalization: the output path used to be hard-coded.
    ap.add_argument("--out", default="kb/cheatsheets/property_glossary.md",
                    help="Destination markdown path")
    args = ap.parse_args()

    src = Path(args.src)
    sem = json.loads(Path(args.sem).read_text(encoding="utf-8"))
    props_sem = sem.get("properties", {})

    plans = load_plans(src)
    if not plans:
        raise SystemExit(f"No AttackPlan rows in {src}")

    freq = Counter()          # property name -> number of plan items touching it
    vals = defaultdict(list)  # property name -> raw attack_value samples
    ops = Counter()           # op kind -> count (missing op defaults to "set")
    applys = Counter()        # scope.apply -> count (missing defaults to "both")

    for plan in plans:
        for it in plan.get("plan", []):
            p = it.get("point", "")
            freq[p] += 1
            ops[it.get("op", "set")] += 1
            applys[(it.get("scope") or {}).get("apply", "both")] += 1
            v = it.get("attack_value", None)
            if v is not None:
                vals[p].append(v)

    lines = []
    lines.append("# Property Glossary (auto-generated)\n")
    lines.append("**REALITY FILTER:** Items reflect your filtered dataset and semantics. If a property has low confidence in the semantics file, verify before relying on it.\n")
    lines.append(f"- Source file: `{src}`\n")
    lines.append(f"- Total plans: {len(plans)}\n")
    lines.append(f"- Operation distribution: {dict(ops)}\n")
    lines.append(f"- Scope.apply distribution: {dict(applys)}\n")
    lines.append("\n---\n## Properties\n")

    for prop, count in freq.most_common():
        semp = props_sem.get(prop, {})
        unit = semp.get("unit", "[n/a]")
        conf = semp.get("confidence", "unknown")
        notes = semp.get("notes", "")

        ex_vals = vals[prop][:6]  # up to six raw example values
        # Numeric summary over the values that actually coerce to float.
        nums = [n for n in (coerce_num(v) for v in vals[prop]) if n is not None]
        if nums:
            stats = f"min={min(nums):.3f}, p50={statistics.median(nums):.3f}, max={max(nums):.3f}"
        else:
            stats = "[n/a]"

        # Map semantics confidence onto the glossary's verification labels;
        # any unrecognized confidence is treated as unverified.
        conf_lbl = {"high": "[Verified]", "medium": "[Inference]", "low": "[Unverified]"}.get(conf, "[Unverified]")

        lines.append(f"### `{prop}` \n- **count:** {count} \n- **unit:** {unit} \n- **confidence:** {conf_lbl} \n- **notes:** {notes or '[n/a]'} \n- **examples:** {ex_vals} \n- **numeric stats:** {stats}\n")

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("\n".join(lines), encoding="utf-8")
    print("[ok] wrote", out.resolve())
|
| |
|
# Script entry point: run the glossary build only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
| |
|