Spaces:
Running
Running
| """Generate synthetic perturbations from a clean RAG-example JSONL. | |
| Outputs ``data/samples/perturbation_examples.jsonl`` (overwrites). Each | |
| perturbation is clearly marked with ``is_synthetic: true`` and a | |
| ``perturbation`` tag so downstream analysis can group results by type. | |
| Perturbations: | |
| - drop_citations: remove every [doc...] marker from the answer. | |
| - swap_citations: replace each citation with a different doc id from the example. | |
| - fact_flip: replace key tokens with a hard-coded antonym list (illustrative; | |
| *not* a substitute for hand-written contradictions). | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import random | |
| import re | |
| from pathlib import Path | |
# Matches one bracketed span with no nested brackets inside, e.g. "[doc1]".
CITATION_RE = re.compile(r"\[[^\]\[]+\]")

# Token -> replacement table used by the fact_flip perturbation.
# Insertion order is kept: copular verbs first, then the antonym pairs.
ANTONYMS = {
    # Copular verbs are flipped by plain negation.
    **{verb: f"{verb} not" for verb in ("is", "was", "are")},
    # Superlatives / ordinals and verbs with a direct opposite.
    "first": "last",
    "largest": "smallest",
    "highest": "lowest",
    "boils": "freezes",
    "supports": "contradicts",
}
def _drop_citations(answer: str) -> str:
    """Return *answer* with every bracketed citation marker removed.

    After the markers are stripped, whitespace left dangling in front of
    punctuation is deleted and every run of whitespace collapses to a single
    space; the result is also stripped at both ends.

    An answer that contains no marker at all is returned unchanged, so a
    caller comparing the result with its input sees a true no-op instead of
    a whitespace-only difference.
    """
    if not CITATION_RE.search(answer):
        # Nothing to drop: skip the whitespace clean-up as well, otherwise an
        # uncited answer with irregular spacing would look "perturbed".
        return answer
    out = CITATION_RE.sub("", answer)
    out = re.sub(r"\s+([.,;:!?])", r"\1", out)  # "Canberra ." -> "Canberra."
    return re.sub(r"\s+", " ", out).strip()
def _swap_citations(answer: str, doc_ids: list[str], rng: random.Random) -> str:
    """Replace each citation marker in *answer* with a different doc id.

    Args:
        answer: Text that may contain bracketed citation markers.
        doc_ids: Candidate ids to cite instead. ``None`` and empty ids are
            ignored so the output never contains an empty ``[]`` marker.
        rng: Source of randomness; pass a seeded instance for reproducible
            output.

    Returns:
        The answer with every marker rewritten to ``[<other id>]``. A marker
        is left untouched when no candidate differs from the id it already
        cites, and the answer is returned as-is when there are no usable ids.
    """
    pool = [doc_id for doc_id in doc_ids if doc_id is not None and doc_id != ""]
    if not pool:
        return answer

    def swap(match: re.Match) -> str:
        # Text between the brackets is the id currently being cited.
        current = match.group(0)[1:-1].strip()
        # Never draw the same id again: that would leave the citation intact
        # and make the "swap" a silent no-op.
        candidates = [doc_id for doc_id in pool if str(doc_id) != current]
        if not candidates:
            return match.group(0)
        return f"[{rng.choice(candidates)}]"

    return CITATION_RE.sub(swap, answer)
def _fact_flip(answer: str) -> str:
    """Flip whole-word tokens of *answer* using the ``ANTONYMS`` table.

    All keys are matched in a single left-to-right pass, so text produced by
    one replacement is never rewritten by another key. Matching is
    case-sensitive and limited to whole words.

    A token whose replacement is its own negation (``"is" -> "is not"``) and
    which is already followed by ``not`` has that negation removed instead,
    so ``"is not"`` becomes ``"is"`` rather than ``"is not not"``.

    Returns:
        The flipped text, or *answer* unchanged when nothing matches.
    """
    if not answer or not ANTONYMS:
        return answer

    # Longest keys first so a longer key wins over a key that is its prefix.
    keys = sorted(ANTONYMS, key=len, reverse=True)
    alternation = "|".join(re.escape(key) for key in keys)
    # Group 2 captures an optional trailing " not" to detect negated tokens.
    pattern = re.compile(rf"\b({alternation})\b(\s+not\b)?")

    def flip(match: re.Match) -> str:
        word, negation = match.group(1), match.group(2)
        repl = ANTONYMS[word]
        if negation is None:
            return repl
        if repl == f"{word} not":
            # Already negated: flipping means dropping the negation.
            return word
        # True antonym followed by "not": flip the word, keep the rest.
        return repl + negation

    return pattern.sub(flip, answer)
def perturb(example: dict, rng: random.Random) -> list[dict]:
    """Build the synthetic perturbed variants of one RAG example.

    Args:
        example: Parsed example record. The keys read here are ``answer``,
            ``contexts`` (a list of mappings carrying ``doc_id``) and
            ``example_id``; every other key is copied through unchanged.
        rng: Source of randomness for the citation swap.

    Returns:
        One shallow copy of *example* per perturbation that actually changed
        the answer. Each copy gets a new ``example_id`` of the form
        ``<id>__<tag>``, the new ``answer``, ``is_synthetic`` set to True,
        the ``perturbation`` tag and an empty ``gold_claim_labels`` list.
    """
    # ``or ""`` also covers an explicit null answer, which ``get(..., "")``
    # would hand to the regex helpers as None.
    answer = example.get("answer") or ""
    # Contexts without a usable id are left out so they cannot be "cited".
    doc_ids = [
        c["doc_id"]
        for c in example.get("contexts") or []
        if c.get("doc_id") not in (None, "")
    ]
    base_id = example.get("example_id", "ex")

    variants = [
        ("drop_citations", _drop_citations(answer)),
        ("swap_citations", _swap_citations(answer, doc_ids, rng)),
        ("fact_flip", _fact_flip(answer)),
    ]

    out: list[dict] = []
    for tag, new_answer in variants:
        if new_answer == answer:
            # Skip no-op perturbations.
            continue
        out.append(
            {
                **example,
                "example_id": f"{base_id}__{tag}",
                "answer": new_answer,
                "is_synthetic": True,
                "perturbation": tag,
                # We do NOT invent gold labels for the perturbation; drop them
                # so the downstream pipeline reports unsupervised diagnoses
                # for these rows.
                "gold_claim_labels": [],
            }
        )
    return out
def main() -> int:
    """Command-line entry point: read clean examples, write perturbations.

    Reads the ``--input`` JSONL file line by line (blank lines are skipped),
    runs ``perturb`` on each record with a ``random.Random`` seeded from
    ``--seed``, and writes every resulting variant as one JSON line to
    ``--output``, overwriting it.

    Returns:
        0 on success; 1 when the input is missing, when input and output are
        the same file, or when a line is not a valid JSON object. Error
        messages go to stderr.
    """
    import sys  # local import: only needed for stderr diagnostics

    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--input",
        default="data/samples/custom_rag_examples.jsonl",
        help="Source JSONL of clean RAG examples.",
    )
    ap.add_argument(
        "--output",
        default="data/samples/perturbation_examples.jsonl",
        help="Where to write the synthetic perturbations.",
    )
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    rng = random.Random(args.seed)
    src = Path(args.input)
    dst = Path(args.output)
    if not src.exists():
        print(f"ERROR: source not found: {src}", file=sys.stderr)
        return 1
    if src.resolve() == dst.resolve():
        # Opening the output in "w" mode would truncate the source before a
        # single line of it had been read.
        print(f"ERROR: --output must differ from --input: {src}", file=sys.stderr)
        return 1

    n_in = n_out = 0
    dst.parent.mkdir(parents=True, exist_ok=True)
    with open(src, "r", encoding="utf-8") as f_in, open(dst, "w", encoding="utf-8") as f_out:
        for lineno, raw in enumerate(f_in, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                ex = json.loads(line)
            except json.JSONDecodeError as err:
                # Rows already written stay in the output file.
                print(f"ERROR: {src}:{lineno}: invalid JSON ({err})", file=sys.stderr)
                return 1
            if not isinstance(ex, dict):
                print(f"ERROR: {src}:{lineno}: expected a JSON object", file=sys.stderr)
                return 1
            n_in += 1
            for v in perturb(ex, rng):
                f_out.write(json.dumps(v, ensure_ascii=False) + "\n")
                n_out += 1

    print(f"Read {n_in} examples; wrote {n_out} synthetic perturbations to {dst}")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)