"""Cross-distribution honesty check for the plagiarism-TYPE classifier.

The classifier was trained on synthetic gen_corpus variants; its reported
held-out accuracy used a split from the SAME generator (circular). Here we
build the same kind of (variant -> true type) cases from REAL PLOS academic
text the generator never produced, push them through the CURRENT evidence
pipeline (now the MiniLM+Siamese ensemble), and measure:

  1. TYPE accuracy per class (does the learned feature->type map generalise?)
  2. The thing the VERDICT actually depends on: does the evidence SCORE
     separate reuse (clone/find_replace/mosaic) from clean? (verdict = score +
     rules, the type label is secondary/cosmetic.)

If type accuracy collapses but score-separation holds, the honest takeaway is:
trust the verdict, treat the fine-grained type as a hint.
"""
import json
import os
import sys
from collections import Counter
from itertools import islice

import numpy as np

# The project root must be on sys.path before the plagdetect imports below.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from plagdetect.evidence import EvidenceWeights, compare_section  # noqa: E402
from plagdetect.forensics import CLASSES, rule_type  # noqa: E402
from plagdetect.ingestion import Document  # noqa: E402
from plagdetect.siamese import EnsembleEncoder  # noqa: E402
from plagdetect.textutils import mosaic_mix, sentences, synonymize  # noqa: E402
from plagdetect.understanding import build_idf  # noqa: E402

PLOS = os.path.join(ROOT, "data", "oa_corpus", "plos_complex_systems.jsonl")

# Score threshold used for the "above evidence-floor" statistics.
# NOTE(review): the original comment says this mirrors the pipeline's
# EVIDENCE_FLOOR=0.28 -- confirm against the pipeline and keep in sync.
EVIDENCE_FLOOR = 0.28

# True labels of the variants generated per source document, in report order.
VARIANT_LABELS = ("clean", "clone", "find_replace", "mosaic")
REUSE_LABELS = ("clone", "find_replace", "mosaic")


def plos_docs(limit=40, cap_sents=12):
    """Load real PLOS records from the JSONL corpus as ``Document`` objects.

    The worked sections are capped to a handful of sentences so the MiniLM
    ensemble stays fast (type signatures don't need the whole paper).

    Args:
        limit: Maximum number of records to read from the start of the file;
            ``None`` reads all of them. Must not be negative.
        cap_sents: Number of leading full-text sentences used as the
            "introduction"; the next ``cap_sents`` sentences become "results".

    Returns:
        A list of ``Document`` with ``abstract``, ``introduction`` and
        ``results`` sections.

    Raises:
        OSError: If the corpus file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"doi"`` or ``"title"``.
    """
    docs = []
    # Parse one JSON object per line (JSONL), skipping blank lines, and stop
    # after `limit` records instead of parsing the whole corpus up front.
    with open(PLOS, encoding="utf-8") as fh:
        records = (json.loads(line) for line in fh if line.strip())
        for r in islice(records, limit):
            ft = r.get("fulltext", "") or ""
            sents = sentences(ft)
            sec = {
                "abstract": r.get("abstract", ""),
                "introduction": " ".join(sents[:cap_sents]),
                "results": " ".join(sents[cap_sents:cap_sents * 2]),
            }
            # NOTE(review): the record's author list is stored in the
            # `references` field -- confirm this is intentional.
            docs.append(Document(doc_id=r["doi"],
                                 title=r["title"],
                                 sections=sec,
                                 references=list(r.get("authors") or [])))
    return docs


def _word_dropout(text, rng, p=0.15):
    """Return ``text`` with each whitespace-separated word dropped with chance ``p``.

    Falls back to the full word list when three or fewer words survive.
    Not called anywhere in this file.
    """
    w = text.split()
    kept = [x for x in w if rng.rand() > p]
    return " ".join(kept if len(kept) > 3 else w)


def _pick_other(docs, src, rng):
    """Draw a random document whose ``doc_id`` differs from ``src``'s.

    Rejection sampling: the caller must guarantee that ``docs`` holds at least
    two distinct ``doc_id`` values, otherwise this would never terminate.
    """
    other = docs[rng.randint(len(docs))]
    while other.doc_id == src.doc_id:
        other = docs[rng.randint(len(docs))]
    return other


def _report(n, y_true, y_pred, scores_by_type):
    """Print per-class type accuracy, binary reuse/clean rates and score gaps."""
    print(f"cross-distribution cases: {n} source docs x 4 variants = {len(y_true)}")
    print("\n1) TYPE-CLASSIFIER accuracy on REAL text (was overfit to gen_corpus):")
    for lab in VARIANT_LABELS:
        idxs = [i for i, t in enumerate(y_true) if t == lab]
        acc = np.mean([y_pred[i] == lab for i in idxs]) if idxs else 0.0
        # Two most common predictions for this true label.
        common = Counter(y_pred[i] for i in idxs).most_common(2)
        print(f" {lab:13s} acc={acc:4.0%} predicted-as={common}")
    overall = np.mean([p == t for p, t in zip(y_pred, y_true)])
    print(f" OVERALL 5-way accuracy = {overall:.0%} (noisy: find_replace/mosaic"
          " are near-verbatim on real text, all confused AMONG reuse types)")

    # The verdict-relevant axis: REUSE vs CLEAN (mislabelling among reuse types
    # is cosmetic; calling clean a reuse-type, or reuse 'clean', is what hurts).
    print("\n verdict-relevant BINARY (reuse vs clean):")
    clean_idx = [i for i, t in enumerate(y_true) if t == "clean"]
    reuse_idx = [i for i, t in enumerate(y_true) if t != "clean"]
    clean_ok = np.mean([y_pred[i] == "clean" for i in clean_idx])
    reuse_ok = np.mean([y_pred[i] != "clean" for i in reuse_idx])
    print(f" clean kept clean (specificity) = {clean_ok:.0%}")
    print(f" reuse flagged as some reuse-type (recall) = {reuse_ok:.0%}")

    print("\n2) EVIDENCE SCORE separation (what the VERDICT uses):")
    clean_mu = np.mean(scores_by_type["clean"])
    for lab in VARIANT_LABELS:
        mu = np.mean(scores_by_type[lab])
        print(f" {lab:13s} mean score = {mu:.3f}")
    reuse = np.concatenate([scores_by_type[lab] for lab in REUSE_LABELS])
    print(f"\n clean mean {clean_mu:.3f} vs reuse mean {reuse.mean():.3f} "
          f"-> gap {reuse.mean()-clean_mu:+.3f}")
    # How cleanly does the evidence floor separate reuse from clean?
    clean_above = np.mean(np.array(scores_by_type["clean"]) >= EVIDENCE_FLOOR)
    reuse_above = np.mean(reuse >= EVIDENCE_FLOOR)
    print(f" reuse above evidence-floor({EVIDENCE_FLOOR}): {reuse_above:.0%} "
          f"clean false-trigger: {clean_above:.0%}")


def main():
    """Build clean/clone/find_replace/mosaic cases from PLOS text and report.

    For every source document with a long enough introduction, four variants
    are compared against the source through ``compare_section``; the predicted
    type (``rule_type``) and evidence score (``EvidenceWeights.score``) are
    collected and summarised on stdout.

    Raises:
        SystemExit: If fewer than two distinct documents qualify, or no
            source document yields a case.
    """
    rng = np.random.RandomState(7)
    docs = [d for d in plos_docs()
            if len(d.sections.get("introduction", "")) > 400]
    # Picking a *different* document by rejection sampling needs at least two
    # distinct ids; without this guard the sampling loop would spin forever.
    if len({d.doc_id for d in docs}) < 2:
        raise SystemExit("need at least two distinct PLOS docs with an "
                         "introduction longer than 400 characters")

    idf, default_idf = build_idf(docs)
    enc = EnsembleEncoder.load(os.path.join(ROOT, "models", "siamese.npz"))
    weights = EvidenceWeights.load(os.path.join(ROOT, "models", "weights.json"))

    y_true, y_pred, scores_by_type = [], [], {c: [] for c in CLASSES}
    n = 0
    for src in docs:
        sec_text = src.sections["introduction"]
        src_sents = sentences(sec_text)
        if len(src_sents) < 4:
            continue
        other_text = _pick_other(docs, src, rng).sections["introduction"]
        # Dict order fixes the order in which the rng is consumed.
        variants = {
            "clean": other_text,
            "clone": sec_text,
            "find_replace": synonymize(sec_text, rng, p=0.9),
            "mosaic": mosaic_mix(src_sents, sentences(other_text), rng),
        }
        for label, text in variants.items():
            ev = compare_section(text, src.references, src, enc, idf, default_idf)
            features = ev["features"]
            cls, _prob, _dist = rule_type(features)
            y_true.append(label)
            y_pred.append(cls)
            scores_by_type[label].append(weights.score(features))
        n += 1
        print(f" ...{n} docs done", flush=True)

    if n == 0:
        # Every statistic below would be a NaN mean over an empty list.
        raise SystemExit("no source document had at least 4 introduction "
                         "sentences; nothing to report")

    _report(n, y_true, y_pred, scores_by_type)


if __name__ == "__main__":
    main()