# -*- coding: utf-8 -*-
"""
make_property_glossary.py
Summarize properties from AttackPlan JSONL and merge semantics from kb/specs/point_semantics.json.

Usage:
  python scripts/make_property_glossary.py \
    --src scripts/train_attackplan.filtered.jsonl \
    --sem kb/specs/point_semantics.json
"""
from __future__ import annotations
import argparse, json, statistics
from pathlib import Path
from collections import Counter, defaultdict

def load_plans(src: Path) -> list:
  """Read AttackPlan rows from a JSONL file.

  Each physical line of ``src`` is parsed as one JSON document. A row is kept
  only when it is a JSON object whose ``"plan"`` value is a list. Blank lines,
  lines that are not valid JSON, and rows of any other shape are skipped so a
  partially damaged file still yields its usable rows.

  Args:
    src: Path of the JSONL file, decoded as UTF-8 (a leading BOM is accepted).

  Returns:
    The accepted row objects, in file order.
  """
  plans = []
  # Iterate the file's physical lines instead of using str.splitlines():
  # splitlines() also breaks on U+2028, U+2029 and U+0085, which may appear
  # unescaped inside JSON strings and would cut a valid record in two.
  with src.open(encoding="utf-8-sig") as fh:
    for ln in fh:
      s = ln.strip()
      if not s:
        continue
      try:
        obj = json.loads(s)
      except (ValueError, RecursionError):
        # Best effort: a malformed (or absurdly nested) line is ignored.
        continue
      if isinstance(obj, dict) and isinstance(obj.get("plan"), list):
        plans.append(obj)
  return plans

def coerce_num(x):
  """Convert ``x`` to a float via its string form, or return ``None``.

  The value is passed through ``str()`` first, so numeric strings such as
  ``"3.5"`` are accepted while booleans (``"True"``/``"False"``) and ``None``
  are rejected. NaN is also rejected: it is not a usable number and would
  make min/max/median results depend on input order.

  Args:
    x: Any value.

  Returns:
    The float value, or ``None`` when ``x`` is not numeric or is NaN.
  """
  try:
    n = float(str(x))
  except (TypeError, ValueError):
    return None
  # NaN is the only float that compares unequal to itself.
  return None if n != n else n

def main(argv=None):
  """Generate the Markdown property glossary from the command line.

  Loads the AttackPlan rows given by ``--src`` and the semantics JSON given by
  ``--sem``, counts how often each ``point`` occurs, tallies the ``op`` and
  ``scope.apply`` values, collects the ``attack_value`` samples per point, and
  writes one Markdown section per property (most frequent first) to ``--out``.

  Args:
    argv: Optional argument list; ``None`` makes argparse read ``sys.argv``.

  Raises:
    SystemExit: When the source file yields no AttackPlan rows, when the
      semantics file is not a JSON object, or on invalid command-line
      arguments.
  """
  ap = argparse.ArgumentParser()
  ap.add_argument("--src", required=True, help="AttackPlan JSONL (filtered)")
  ap.add_argument("--sem", required=True, help="Semantics JSON (kb/specs/point_semantics.json)")
  ap.add_argument("--out", default="kb/cheatsheets/property_glossary.md",
                  help="Output Markdown file (default: %(default)s)")
  args = ap.parse_args(argv)

  src = Path(args.src)
  # utf-8-sig matches how the source file is read and tolerates a BOM, which
  # json.loads would otherwise reject.
  sem = json.loads(Path(args.sem).read_text(encoding="utf-8-sig"))
  if not isinstance(sem, dict):
    raise SystemExit(f"Semantics file {args.sem} must contain a JSON object")
  props_sem = sem.get("properties", {})
  if not isinstance(props_sem, dict):
    props_sem = {}

  plans = load_plans(src)
  if not plans:
    raise SystemExit(f"No AttackPlan rows in {src}")

  freq = Counter()
  vals = defaultdict(list)
  ops  = Counter()
  applys = Counter()

  for plan in plans:
    for it in plan.get("plan", []):
      # load_plans only guarantees that "plan" is a list; skip entries that
      # are not objects instead of failing on .get().
      if not isinstance(it, dict):
        continue
      p = it.get("point","")
      freq[p]+=1
      ops[it.get("op","set")] += 1
      scope = it.get("scope")
      # A missing, empty, or non-object scope counts as the default "both".
      applys[scope.get("apply","both") if isinstance(scope, dict) else "both"] += 1
      v = it.get("attack_value", None)
      if v is not None: vals[p].append(v)

  lines = []
  lines.append("# Property Glossary (auto-generated)\n")
  lines.append("**REALITY FILTER:** Items reflect your filtered dataset and semantics. If a property has low confidence in the semantics file, verify before relying on it.\n")
  lines.append(f"- Source file: `{src}`\n")
  lines.append(f"- Total plans: {len(plans)}\n")
  lines.append(f"- Operation distribution: {dict(ops)}\n")
  lines.append(f"- Scope.apply distribution: {dict(applys)}\n")
  lines.append("\n---\n## Properties\n")

  # Semantics confidence -> label shown in the glossary; anything else is
  # reported as unverified.
  conf_labels = {"high":"[Verified]","medium":"[Inference]","low":"[Unverified]"}

  for prop, count in freq.most_common():
    semp = props_sem.get(prop, {})
    if not isinstance(semp, dict):
      semp = {}
    unit = semp.get("unit", "[n/a]")
    conf = semp.get("confidence", "unknown")
    notes = semp.get("notes", "")
    # example values: the first six samples, in encounter order
    samples = vals.get(prop, [])
    ex_vals = samples[:6]
    nums = [n for n in (coerce_num(v) for v in samples) if n is not None]
    if nums:
      stats = f"min={min(nums):.3f}, p50={statistics.median(nums):.3f}, max={max(nums):.3f}"
    else:
      stats = "[n/a]"
    conf_lbl = conf_labels.get(conf, "[Unverified]")
    # write section
    lines.append(f"### `{prop}`  \n- **count:** {count}  \n- **unit:** {unit}  \n- **confidence:** {conf_lbl}  \n- **notes:** {notes or '[n/a]'}  \n- **examples:** {ex_vals}  \n- **numeric stats:** {stats}\n")

  out = Path(args.out)
  out.parent.mkdir(parents=True, exist_ok=True)
  out.write_text("\n".join(lines), encoding="utf-8")
  print("[ok] wrote", out.resolve())

# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
  main()