# Source: ds6b-attackplan-qlora / scripts/make_property_glossary.py
# Uploaded by adetuire1 using huggingface_hub (commit fba140f, verified).
# -*- coding: utf-8 -*-
"""
make_property_glossary.py
Summarize properties from AttackPlan JSONL and merge semantics from kb/specs/point_semantics.json.
Usage:
python scripts/make_property_glossary.py \
--src scripts/train_attackplan.filtered.jsonl \
--sem kb/specs/point_semantics.json
"""
from __future__ import annotations

import argparse
import json
import math
import statistics
from collections import Counter, defaultdict
from pathlib import Path
def load_plans(src: Path) -> list[dict]:
    """Load AttackPlan records from a JSONL file.

    Each non-blank line is parsed as JSON. A record is kept only when it is a
    JSON object whose ``"plan"`` value is a list; blank lines, lines that are
    not valid JSON, and records of any other shape are skipped.

    Args:
        src: Path to the JSONL file. It is read as UTF-8 and a leading
            byte-order mark is ignored.

    Returns:
        The accepted records, in file order.

    Raises:
        OSError: If ``src`` cannot be opened or read.
        UnicodeDecodeError: If ``src`` is not valid UTF-8.
    """
    plans = []
    # Iterate the file instead of calling ``str.splitlines()``: splitlines()
    # also breaks on U+2028, U+2029 and U+0085, which may appear unescaped
    # inside a JSON string and would cut a valid record in two.
    with src.open(encoding="utf-8-sig") as fh:
        for line in fh:
            text = line.strip()
            if not text:
                continue
            try:
                obj = json.loads(text)
            except ValueError:
                # Best effort: a malformed line must not abort the whole load.
                # json.JSONDecodeError is a ValueError subclass.
                continue
            if isinstance(obj, dict) and isinstance(obj.get("plan"), list):
                plans.append(obj)
    return plans
def coerce_num(x):
    """Convert ``x`` to a float, or return ``None`` when it is not a number.

    The value is converted through its string form, so numeric strings such
    as ``"1.5"`` are accepted, while booleans (whose string forms are
    ``"True"`` and ``"False"``), ``None`` and containers are rejected.

    NaN is rejected as well. It would otherwise make any ``min``, ``max`` or
    median computed over the results depend on input order.

    Args:
        x: Any value, typically one decoded from JSON.

    Returns:
        The value as a ``float``, or ``None`` if it cannot be converted or
        is NaN.
    """
    try:
        value = float(str(x))
    except (TypeError, ValueError, OverflowError):
        return None
    return None if math.isnan(value) else value
# Maps the semantics file's confidence levels to the labels shown in the glossary.
_CONFIDENCE_LABELS = {
    "high": "[Verified]",
    "medium": "[Inference]",
    "low": "[Unverified]",
}

# Where the glossary is written when --out is not given.
_DEFAULT_OUT = "kb/cheatsheets/property_glossary.md"


def _tally_plans(plans):
    """Count points, ops and scope.apply values over every plan item.

    Returns ``(freq, vals, ops, applys, skipped)``: three Counters, a mapping
    from point to its non-null attack values, and the number of plan items
    skipped because they were not JSON objects.
    """
    freq = Counter()
    vals = defaultdict(list)
    ops = Counter()
    applys = Counter()
    skipped = 0
    for plan in plans:
        for item in plan.get("plan", []):
            if not isinstance(item, dict):
                # A stray scalar or list inside "plan" must not abort the run.
                skipped += 1
                continue
            point = item.get("point", "")
            freq[point] += 1
            ops[item.get("op", "set")] += 1
            scope = item.get("scope")
            if not isinstance(scope, dict):
                # Missing, null or malformed scope counts as the default.
                scope = {}
            applys[scope.get("apply", "both")] += 1
            value = item.get("attack_value")
            if value is not None:
                vals[point].append(value)
    return freq, vals, ops, applys, skipped


def _render_property(prop, count, semantics, values):
    """Render the Markdown section for one property."""
    unit = semantics.get("unit", "[n/a]")
    conf = semantics.get("confidence", "unknown")
    notes = semantics.get("notes", "")
    # Show only the first few raw values as examples.
    ex_vals = values[:6]
    nums = [n for n in map(coerce_num, values) if n is not None]
    if nums:
        stats = (
            f"min={min(nums):.3f}, "
            f"p50={statistics.median(nums):.3f}, "
            f"max={max(nums):.3f}"
        )
    else:
        stats = "[n/a]"
    # Unknown or missing confidence levels are reported as unverified.
    conf_lbl = _CONFIDENCE_LABELS.get(conf, "[Unverified]")
    return (
        f"### `{prop}` \n"
        f"- **count:** {count} \n"
        f"- **unit:** {unit} \n"
        f"- **confidence:** {conf_lbl} \n"
        f"- **notes:** {notes or '[n/a]'} \n"
        f"- **examples:** {ex_vals} \n"
        f"- **numeric stats:** {stats}\n"
    )


def main():
    """Build the property glossary from the command-line arguments.

    Reads the AttackPlan JSONL given by ``--src`` and the semantics JSON given
    by ``--sem``, then writes a Markdown glossary with one section per
    property, most frequent first. The output goes to ``--out``, which
    defaults to ``kb/cheatsheets/property_glossary.md``; parent directories
    are created as needed.

    Raises:
        SystemExit: If ``--src`` contains no usable AttackPlan rows, or if
            argument parsing fails.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", required=True, help="AttackPlan JSONL (filtered)")
    ap.add_argument("--sem", required=True, help="Semantics JSON (kb/specs/point_semantics.json)")
    ap.add_argument("--out", default=_DEFAULT_OUT, help="Output Markdown file (default: %(default)s)")
    args = ap.parse_args()

    src = Path(args.src)
    sem = json.loads(Path(args.sem).read_text(encoding="utf-8"))
    # Tolerate a semantics file whose root or "properties" entry is not an object.
    props_sem = sem.get("properties") if isinstance(sem, dict) else None
    if not isinstance(props_sem, dict):
        props_sem = {}

    plans = load_plans(src)
    if not plans:
        raise SystemExit(f"No AttackPlan rows in {src}")

    freq, vals, ops, applys, skipped = _tally_plans(plans)
    if skipped:
        print(f"[warn] skipped {skipped} plan item(s) that are not JSON objects")

    lines = [
        "# Property Glossary (auto-generated)\n",
        "**REALITY FILTER:** Items reflect your filtered dataset and semantics. If a property has low confidence in the semantics file, verify before relying on it.\n",
        f"- Source file: `{src}`\n",
        f"- Total plans: {len(plans)}\n",
        f"- Operation distribution: {dict(ops)}\n",
        f"- Scope.apply distribution: {dict(applys)}\n",
        "\n---\n## Properties\n",
    ]
    for prop, count in freq.most_common():
        semp = props_sem.get(prop, {})
        if not isinstance(semp, dict):
            semp = {}
        lines.append(_render_property(prop, count, semp, vals.get(prop, [])))

    out = Path(args.out)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text("\n".join(lines), encoding="utf-8")
    print("[ok] wrote", out.resolve())
# Run the glossary generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()