argus-redact / build_prvl_cache.py
wan9yu's picture
update demo: 3-tab bilingual redesign + pseudonym-llm + cached LLM proof
1ccfc0d verified
"""Offline baker: project today's PRvL paper-data into demo/prvl_cache.json.
Run once after a fresh PRvL benchmark; the cache is committed alongside app.py.
"""
from __future__ import annotations
import json
from pathlib import Path
# Input: PRvL benchmark results JSON from a dated run directory.
SRC = Path.home() / "Desktop/argus-paper-data/2026-05-04/raw/prvl_multi.json"
# Output: the cache file is written into the same directory as this script.
DST = Path(__file__).resolve().parent / "prvl_cache.json"
# Only source rows whose case_id and profile equal these values are baked.
CASE_ID = "summarize_zh"
PROFILE = "pseudonym-llm"
# Models that must be present in the source; cache rows follow this order.
MODEL_ORDER = ["GPT-5", "Claude-Opus-4.5", "Gemini-2.5-Pro", "GLM-4.5"]
def main() -> None:
    """Bake the demo cache from the PRvL benchmark results.

    Reads the JSON at ``SRC``, keeps the rows whose ``case_id`` and
    ``profile`` equal ``CASE_ID`` and ``PROFILE``, projects one row per model
    (in ``MODEL_ORDER``) onto the cache schema and writes the result to
    ``DST`` as pretty-printed UTF-8 JSON.

    Raises:
        SystemExit: if the source file is missing, unreadable, not valid
            JSON, not a JSON array, or does not contain exactly one
            well-formed row for every model in ``MODEL_ORDER``.
    """
    if not SRC.exists():
        raise SystemExit(
            f"Source PRvL data not found at: {SRC}\n"
            f"This baker is a one-shot tool — update SRC to point to your "
            f"current PRvL run JSON, or rebake against the latest data."
        )

    # Read and parse separately so I/O problems and JSON problems each get
    # their own message instead of a raw traceback.
    try:
        raw_text = SRC.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        raise SystemExit(f"Could not read source file: {SRC}\n{e}") from e
    try:
        rows_raw = json.loads(raw_text)
    except json.JSONDecodeError as e:
        raise SystemExit(f"Source file is not valid JSON: {SRC}\n{e}") from e

    if not isinstance(rows_raw, list):
        raise SystemExit(
            f"Expected a JSON array of rows in {SRC}, "
            f"got {type(rows_raw).__name__}"
        )

    # Rows that are not objects, or that lack case_id/profile, simply do not
    # match; they must not crash the filter.
    matched = [
        r for r in rows_raw
        if isinstance(r, dict)
        and r.get("case_id") == CASE_ID
        and r.get("profile") == PROFILE
    ]
    if len(matched) != len(MODEL_ORDER):
        raise SystemExit(
            f"Expected {len(MODEL_ORDER)} rows for {CASE_ID}/{PROFILE}, got {len(matched)}"
        )

    # Index by model name; rows without a usable model name are left out and
    # surface below as a "Missing row" error for the affected model.
    by_model = {
        r["model"]: r for r in matched if isinstance(r.get("model"), str)
    }

    schema_hint = (
        "Expected source schema: case_id, profile, model, redacted, "
        "output, privacy.{leaked,total}, utility"
    )
    rows_out = []
    for model in MODEL_ORDER:
        if model not in by_model:
            raise SystemExit(f"Missing row for model {model}")
        r = by_model[model]
        try:
            privacy = r["privacy"]
            rows_out.append({
                "model": model,
                "downstream_text": r["redacted"],
                "llm_reply": r["output"],
                "leaked": privacy["leaked"],
                "total_pii": privacy["total"],
                "utility": r["utility"],
            })
        except KeyError as e:
            raise SystemExit(
                f"Malformed source row for model {model}: missing field {e}\n"
                f"{schema_hint}"
            ) from e
        except TypeError as e:
            # Raised when "privacy" is present but is not an object.
            raise SystemExit(
                f"Malformed source row for model {model}: "
                f"field 'privacy' is not an object\n"
                f"{schema_hint}"
            ) from e

    DST.write_text(
        json.dumps(
            {
                # NOTE: hard-coded label; keep in sync with the dated
                # directory in SRC when rebaking against a newer run.
                "source_run": "PRvL bench 2026-05-04",
                "case_id": CASE_ID,
                "profile": PROFILE,
                "rows": rows_out,
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
    )
    print(f"✓ wrote {DST} with {len(rows_out)} rows")
# Run the baker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()