Spaces:
Sleeping
Sleeping
| """Cross-distribution honesty check for the plagiarism-TYPE classifier. | |
| The classifier was trained on synthetic gen_corpus variants; its reported | |
| held-out accuracy used a split from the SAME generator (circular). Here we build | |
| the same kind of (variant -> true type) cases from REAL PLOS academic text the | |
| generator never produced, push them through the CURRENT evidence pipeline (now | |
| the MiniLM+Siamese ensemble), and measure: | |
| 1. TYPE accuracy per class (does the learned feature->type map generalise?) | |
| 2. The thing the VERDICT actually depends on: does the evidence SCORE separate | |
| reuse (clone/find_replace/mosaic) from clean? (verdict = score + rules, the | |
| type label is secondary/cosmetic.) | |
| If type accuracy collapses but score-separation holds, the honest takeaway is: | |
| trust the verdict, treat the fine-grained type as a hint. | |
| """ | |
| import os | |
| import sys | |
| import numpy as np | |
| ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| sys.path.insert(0, ROOT) | |
| from plagdetect.evidence import EvidenceWeights, compare_section # noqa: E402 | |
| from plagdetect.forensics import CLASSES, rule_type # noqa: E402 | |
| from plagdetect.ingestion import Document # noqa: E402 | |
| from plagdetect.siamese import EnsembleEncoder # noqa: E402 | |
| from plagdetect.textutils import mosaic_mix, sentences, synonymize # noqa: E402 | |
| from plagdetect.understanding import build_idf # noqa: E402 | |
| import json | |
| PLOS = os.path.join(ROOT, "data", "oa_corpus", "plos_complex_systems.jsonl") | |
def plos_docs(limit=40, cap_sents=12):
    """Load real PLOS records from the ``PLOS`` JSONL file as ``Document`` objects.

    Each document gets three sections: the record's ``abstract``, an
    ``introduction`` built from the first ``cap_sents`` sentences of
    ``fulltext``, and ``results`` built from the next ``cap_sents`` sentences.
    Capping the worked sections to a handful of sentences keeps the MiniLM
    ensemble fast (type signatures don't need the whole paper).

    Args:
        limit: Slice bound applied to the parsed records (``records[:limit]``);
            ``None`` keeps every record.
        cap_sents: Number of sentences taken for each derived section.

    Returns:
        A list of ``Document`` objects in file order.

    Raises:
        OSError: If the corpus file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``doi`` or ``title``.
    """
    # Parse one JSON object per line inside a context manager so the handle is
    # always closed. Blank lines (e.g. a trailing newline at EOF) are skipped
    # instead of corrupting the parse.
    with open(PLOS, encoding="utf-8") as fh:
        records = [json.loads(line) for line in fh if line.strip()]

    docs = []
    for rec in records[:limit]:
        fulltext = rec.get("fulltext", "") or ""
        sents = sentences(fulltext)  # split once, reuse for both sections
        sections = {
            # Same null-guard as fulltext: a JSON null becomes an empty string.
            "abstract": rec.get("abstract", "") or "",
            "introduction": " ".join(sents[:cap_sents]),
            "results": " ".join(sents[cap_sents:cap_sents * 2]),
        }
        docs.append(
            Document(
                doc_id=rec["doi"],
                title=rec["title"],
                sections=sections,
                # NOTE(review): the author list is what gets stored as
                # ``references`` here -- confirm this is intended.
                references=list(rec.get("authors") or []),
            )
        )
    return docs
def _word_dropout(text, rng, p=0.15):
    """Randomly drop whitespace-separated words from *text*.

    One ``rng.rand()`` draw is made per word, in order; a word survives when
    its draw is strictly greater than ``p``. If more than three words survive
    they are joined with single spaces; otherwise every original word is
    joined instead, so very short outputs fall back to the full word list.
    """
    tokens = text.split()
    survivors = []
    for tok in tokens:
        if rng.rand() > p:
            survivors.append(tok)
    chosen = survivors if len(survivors) > 3 else tokens
    return " ".join(chosen)
def main():
    """Run the cross-distribution check and print the report to stdout.

    For every PLOS document whose introduction is longer than 400 characters
    and has at least four sentences, four variants are built (an unrelated
    document's introduction as ``clean``, the verbatim text as ``clone``, a
    ``synonymize``d copy as ``find_replace`` and a ``mosaic_mix`` with the
    unrelated document as ``mosaic``). Each variant is compared against its
    source with ``compare_section``; the resulting features are typed with
    ``rule_type`` and scored with the loaded ``EvidenceWeights``.

    The report prints per-label type accuracy, the binary reuse-vs-clean
    rates, per-label mean evidence scores, and the share of reuse / clean
    cases at or above the evidence floor.

    Raises:
        SystemExit: If fewer than two distinct documents are usable (no
            unrelated document can be drawn) or no document yields a case.
    """
    # Local import, done once instead of once per reported label.
    from collections import Counter

    evidence_floor = 0.28  # EVIDENCE_FLOOR used by the pipeline's verdict
    labels_used = ["clean", "clone", "find_replace", "mosaic"]
    reuse_labels = ("clone", "find_replace", "mosaic")

    rng = np.random.RandomState(7)
    docs = [d for d in plos_docs() if len(d.sections.get("introduction", "")) > 400]
    # The resampling loop below needs a document with a different doc_id;
    # without one it would spin forever.
    if len({d.doc_id for d in docs}) < 2:
        raise SystemExit(
            "need at least two distinct PLOS docs with a usable introduction; "
            f"found {len(docs)}"
        )

    idf, default_idf = build_idf(docs)
    enc = EnsembleEncoder.load(os.path.join(ROOT, "models", "siamese.npz"))
    weights = EvidenceWeights.load(os.path.join(ROOT, "models", "weights.json"))

    y_true, y_pred, scores_by_type = [], [], {c: [] for c in CLASSES}
    n = 0
    for src in docs:
        sec_text = src.sections["introduction"]
        src_sents = sentences(sec_text)
        if len(src_sents) < 4:
            continue
        # Draw an unrelated document to serve as the "clean" text and as the
        # second ingredient of the mosaic.
        other = docs[rng.randint(len(docs))]
        while other.doc_id == src.doc_id:
            other = docs[rng.randint(len(docs))]
        other_intro = other.sections["introduction"]
        # Keep this construction order: synonymize and mosaic_mix both consume
        # rng, so reordering would change the seeded results.
        variants = {
            "clean": other_intro,
            "clone": sec_text,
            "find_replace": synonymize(sec_text, rng, p=0.9),
            "mosaic": mosaic_mix(src_sents, sentences(other_intro), rng),
        }
        for label, text in variants.items():
            ev = compare_section(text, src.references, src, enc, idf, default_idf)
            features = ev["features"]
            # Only the predicted class is used; the other two return values
            # are ignored.
            cls, _prob, _dist = rule_type(features)
            score = weights.score(features)
            y_true.append(label)
            y_pred.append(cls)
            scores_by_type[label].append(score)
        n += 1
        print(f" ...{n} docs done", flush=True)

    # Without any case every mean below would be NaN (with numpy warnings).
    if not y_true:
        raise SystemExit("no source doc had at least 4 introduction sentences")

    print(f"cross-distribution cases: {n} source docs x 4 variants = {len(y_true)}")
    print("\n1) TYPE-CLASSIFIER accuracy on REAL text (was overfit to gen_corpus):")
    for lab in labels_used:
        idxs = [i for i, t in enumerate(y_true) if t == lab]
        acc = np.mean([y_pred[i] == lab for i in idxs]) if idxs else 0.0
        # most common predictions for this true label
        common = Counter(y_pred[i] for i in idxs).most_common(2)
        print(f" {lab:13s} acc={acc:4.0%} predicted-as={common}")
    overall = np.mean([p == t for p, t in zip(y_pred, y_true)])
    print(f" OVERALL 5-way accuracy = {overall:.0%} (noisy: find_replace/mosaic"
          " are near-verbatim on real text, all confused AMONG reuse types)")

    # The verdict-relevant axis: REUSE vs CLEAN (mislabelling among reuse types is
    # cosmetic; calling clean a reuse-type, or reuse 'clean', is what hurts).
    print("\n verdict-relevant BINARY (reuse vs clean):")
    clean_idx = [i for i, t in enumerate(y_true) if t == "clean"]
    reuse_idx = [i for i, t in enumerate(y_true) if t != "clean"]
    clean_ok = np.mean([y_pred[i] == "clean" for i in clean_idx])
    reuse_ok = np.mean([y_pred[i] != "clean" for i in reuse_idx])
    print(f" clean kept clean (specificity) = {clean_ok:.0%}")
    print(f" reuse flagged as some reuse-type (recall) = {reuse_ok:.0%}")

    print("\n2) EVIDENCE SCORE separation (what the VERDICT uses):")
    clean_mu = np.mean(scores_by_type["clean"])
    for lab in labels_used:
        mu = np.mean(scores_by_type[lab])
        print(f" {lab:13s} mean score = {mu:.3f}")
    reuse = np.concatenate([scores_by_type[lab] for lab in reuse_labels])
    print(f"\n clean mean {clean_mu:.3f} vs reuse mean {reuse.mean():.3f} "
          f"-> gap {reuse.mean()-clean_mu:+.3f}")
    # How cleanly does the pipeline's evidence floor separate reuse from clean?
    clean_above = np.mean(np.array(scores_by_type["clean"]) >= evidence_floor)
    reuse_above = np.mean(reuse >= evidence_floor)
    print(f" reuse above evidence-floor({evidence_floor:.2f}): {reuse_above:.0%} "
          f"clean false-trigger: {clean_above:.0%}")
# Script entry point: run the check only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()