ds6b-attackplan-qlora / scripts /make_property_glossary.py

Upload folder using huggingface_hub

fba140f verified 7 months ago

3.36 kB

	# -*- coding: utf-8 -*-
	"""
	make_property_glossary.py
	Summarize properties from AttackPlan JSONL and merge semantics from kb/specs/point_semantics.json.

	Usage:
	    python scripts/make_property_glossary.py \
	        --src scripts/train_attackplan.filtered.jsonl \
	        --sem kb/specs/point_semantics.json
	"""
	from __future__ import annotations

	import argparse, json, statistics
	import math
	from collections import Counter, defaultdict
	from pathlib import Path

	def load_plans(src: Path):
	    """Load AttackPlan records from a JSONL file.

	    Each physical line of *src* is parsed as one JSON document. Blank lines,
	    lines that are not valid JSON, and documents that are not an object
	    holding a list under the ``"plan"`` key are skipped, so a partially
	    corrupt file still yields its usable rows.

	    Args:
	        src: Path to the JSONL file. It is decoded as UTF-8 and a leading
	            byte-order mark is tolerated.

	    Returns:
	        A list of the decoded dicts, in file order.
	    """
	    plans = []
	    # Iterate the file handle rather than calling ``str.splitlines()``:
	    # splitlines also breaks on U+2028, U+2029 and U+0085, which may legally
	    # appear unescaped inside JSON strings and would cut a valid record in
	    # two (both halves then fail to parse and the row is silently lost).
	    with src.open(encoding="utf-8-sig") as fh:
	        for line in fh:
	            record_text = line.strip()
	            if not record_text:
	                continue
	            try:
	                obj = json.loads(record_text)
	            except (json.JSONDecodeError, RecursionError):
	                # Best-effort loader: a malformed row is dropped, not fatal.
	                continue
	            if isinstance(obj, dict) and isinstance(obj.get("plan"), list):
	                plans.append(obj)
	    return plans

	def coerce_num(x):
	    """Convert *x* to a finite ``float``, or return ``None`` if it is not one.

	    The value is stringified before parsing, so numeric strings such as
	    ``"1.5"`` are accepted, while booleans (``"True"``), ``None`` and
	    containers are rejected.

	    NaN and +/-infinity are also rejected: they parse as floats but make
	    min/median/max summaries meaningless or order-dependent.

	    Args:
	        x: Any value, typically one decoded from JSON.

	    Returns:
	        The parsed finite float, or ``None``.
	    """
	    try:
	        number = float(str(x))
	    except (TypeError, ValueError):
	        return None
	    return number if math.isfinite(number) else None

	def _tally_plans(plans):
	    """Count properties, operations and scopes over every step of every plan.

	    Returns a 4-tuple ``(freq, vals, ops, applys)``:
	    ``freq`` counts steps per ``point``, ``vals`` maps each ``point`` to its
	    non-null ``attack_value`` entries in encounter order, ``ops`` counts the
	    ``op`` field (default ``"set"``) and ``applys`` counts ``scope.apply``
	    (default ``"both"``).
	    """
	    freq = Counter()
	    vals = defaultdict(list)
	    ops = Counter()
	    applys = Counter()
	    for plan in plans:
	        for step in plan.get("plan", []):
	            if not isinstance(step, dict):
	                # Malformed step (e.g. a bare string): nothing to count.
	                continue
	            point = step.get("point", "")
	            freq[point] += 1
	            ops[step.get("op", "set")] += 1
	            scope = step.get("scope")
	            if not isinstance(scope, dict):
	                # Missing, null or malformed scope counts as the default.
	                scope = {}
	            applys[scope.get("apply", "both")] += 1
	            value = step.get("attack_value")
	            if value is not None:
	                vals[point].append(value)
	    return freq, vals, ops, applys


	def _property_section(prop, count, semp, prop_vals):
	    """Render the Markdown section for one property."""
	    unit = semp.get("unit", "[n/a]")
	    conf = semp.get("confidence", "unknown")
	    notes = semp.get("notes", "")
	    # Show only the first few raw values as examples.
	    ex_vals = prop_vals[:6]
	    nums = [n for n in map(coerce_num, prop_vals) if n is not None]
	    if nums:
	        stats = f"min={min(nums):.3f}, p50={statistics.median(nums):.3f}, max={max(nums):.3f}"
	    else:
	        stats = "[n/a]"
	    labels = {"high": "[Verified]", "medium": "[Inference]", "low": "[Unverified]"}
	    # Anything that is not a known confidence string is treated as unverified
	    # (the isinstance check also avoids hashing an unhashable JSON value).
	    conf_lbl = labels.get(conf, "[Unverified]") if isinstance(conf, str) else "[Unverified]"
	    return (
	        f"### `{prop}` \n"
	        f"- count: {count} \n"
	        f"- unit: {unit} \n"
	        f"- confidence: {conf_lbl} \n"
	        f"- notes: {notes or '[n/a]'} \n"
	        f"- examples: {ex_vals} \n"
	        f"- numeric stats: {stats}\n"
	    )


	def main(argv=None):
	    """Build the property glossary Markdown file from the command line.

	    Reads the AttackPlan JSONL given by ``--src`` and the semantics JSON given
	    by ``--sem``, summarizes every property found in the plans (count, unit,
	    confidence label, notes, example values, numeric min/median/max) and
	    writes the result to ``--out``.

	    Args:
	        argv: Optional argument list. ``None`` (the default) makes argparse
	            read ``sys.argv``.

	    Raises:
	        SystemExit: If ``--src`` contains no usable AttackPlan rows, or on
	            argparse usage errors.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--src", required=True, help="AttackPlan JSONL (filtered)")
	    ap.add_argument("--sem", required=True, help="Semantics JSON (kb/specs/point_semantics.json)")
	    ap.add_argument(
	        "--out",
	        default="kb/cheatsheets/property_glossary.md",
	        help="Output Markdown path (default: %(default)s)",
	    )
	    args = ap.parse_args(argv)

	    src = Path(args.src)
	    # utf-8-sig accepts files with or without a BOM; json.loads rejects a
	    # string that still starts with one.
	    sem = json.loads(Path(args.sem).read_text(encoding="utf-8-sig"))
	    props_sem = sem.get("properties") if isinstance(sem, dict) else None
	    if not isinstance(props_sem, dict):
	        props_sem = {}

	    plans = load_plans(src)
	    if not plans:
	        raise SystemExit(f"No AttackPlan rows in {src}")

	    freq, vals, ops, applys = _tally_plans(plans)

	    lines = [
	        "# Property Glossary (auto-generated)\n",
	        "REALITY FILTER: Items reflect your filtered dataset and semantics. If a property has low confidence in the semantics file, verify before relying on it.\n",
	        f"- Source file: `{src}`\n",
	        f"- Total plans: {len(plans)}\n",
	        f"- Operation distribution: {dict(ops)}\n",
	        f"- Scope.apply distribution: {dict(applys)}\n",
	        "\n---\n## Properties\n",
	    ]

	    # Most frequent properties first.
	    for prop, count in freq.most_common():
	        semp = props_sem.get(prop, {})
	        if not isinstance(semp, dict):
	            semp = {}
	        lines.append(_property_section(prop, count, semp, vals[prop]))

	    out = Path(args.out)
	    out.parent.mkdir(parents=True, exist_ok=True)
	    out.write_text("\n".join(lines), encoding="utf-8")
	    print("[ok] wrote", out.resolve())

	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()