# Source: Plaiglab / scripts/eval_typeclf.py
# Uploaded by SanidhyaDhangar — "PlaigLab — Hugging Face Space (Docker) clean deploy"
# (commit ebebfe8, 6.14 kB)
"""Cross-distribution honesty check for the plagiarism-TYPE classifier.
The classifier was trained on synthetic gen_corpus variants; its reported
held-out accuracy used a split from the SAME generator (circular). Here we build
the same kind of (variant -> true type) cases from REAL PLOS academic text the
generator never produced, push them through the CURRENT evidence pipeline (now
the MiniLM+Siamese ensemble), and measure:
1. TYPE accuracy per class (does the learned feature->type map generalise?)
2. The thing the VERDICT actually depends on: does the evidence SCORE separate
reuse (clone/find_replace/mosaic) from clean? (verdict = score + rules, the
type label is secondary/cosmetic.)
If type accuracy collapses but score-separation holds, the honest takeaway is:
trust the verdict, treat the fine-grained type as a hint.
"""
import os
import sys
import numpy as np
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)
from plagdetect.evidence import EvidenceWeights, compare_section # noqa: E402
from plagdetect.forensics import CLASSES, rule_type # noqa: E402
from plagdetect.ingestion import Document # noqa: E402
from plagdetect.siamese import EnsembleEncoder # noqa: E402
from plagdetect.textutils import mosaic_mix, sentences, synonymize # noqa: E402
from plagdetect.understanding import build_idf # noqa: E402
import json
PLOS = os.path.join(ROOT, "data", "oa_corpus", "plos_complex_systems.jsonl")
def plos_docs(limit=40, cap_sents=12):
    """Load up to ``limit`` real PLOS records from ``PLOS`` as ``Document`` objects.

    The worked sections are capped to a handful of sentences so the MiniLM
    ensemble stays fast (type signatures don't need the whole paper).

    Args:
        limit: Maximum number of records to read; ``None`` reads every record.
        cap_sents: Number of sentences taken for each of the two derived
            sections.

    Returns:
        A list of ``Document`` objects with three sections:
        ``abstract`` (the record's abstract), ``introduction`` (the first
        ``cap_sents`` sentences of ``fulltext``) and ``results`` (the next
        ``cap_sents`` sentences).

    Raises:
        OSError: If the corpus file cannot be opened.
        json.JSONDecodeError: If a non-blank line read is not valid JSON.
        KeyError: If a record lacks ``doi`` or ``title``.
    """
    docs = []
    # Stream the JSONL file line by line: the handle is closed deterministically,
    # blank lines are tolerated, and nothing past `limit` records is parsed.
    with open(PLOS, encoding="utf-8") as fh:
        for raw in fh:
            if limit is not None and len(docs) >= limit:
                break
            raw = raw.strip()
            if not raw:
                continue
            rec = json.loads(raw)

            # Missing or null text fields are treated as empty.
            fulltext = rec.get("fulltext", "") or ""
            sents = sentences(fulltext)  # split once, slice twice
            sections = {
                "abstract": rec.get("abstract", "") or "",
                "introduction": " ".join(sents[:cap_sents]),
                "results": " ".join(sents[cap_sents:cap_sents * 2]),
            }
            # NOTE(review): the record's author list is what gets stored in
            # `references` here — confirm this is intended by the pipeline.
            docs.append(Document(doc_id=rec["doi"], title=rec["title"],
                                 sections=sections,
                                 references=list(rec.get("authors") or [])))
    return docs
def _word_dropout(text, rng, p=0.15):
    """Return ``text`` with some whitespace-separated words removed.

    One ``rng.rand()`` draw is made per word, in order; a word is kept when
    its draw is strictly greater than ``p``. If three or fewer words survive,
    all words are returned instead (whitespace-normalised).

    Args:
        text: Input string; split on arbitrary whitespace.
        rng: Object exposing ``rand()`` returning a float (``main`` uses
            ``np.random.RandomState``).
        p: Drop threshold compared against each draw.

    Returns:
        The surviving words joined by single spaces.
    """
    tokens = text.split()
    survivors = []
    for tok in tokens:
        # Exactly one draw per token keeps the RNG stream reproducible.
        if rng.rand() > p:
            survivors.append(tok)
    if len(survivors) <= 3:
        # Too little left to be useful: fall back to the full word list.
        survivors = tokens
    return " ".join(survivors)
def main():
    """Run the cross-distribution check and print the report to stdout.

    Steps:
      1. Load PLOS docs whose capped introduction is longer than 400 chars.
      2. For each source doc with at least 4 introduction sentences, build
         four variants (clean / clone / find_replace / mosaic) and push each
         through ``compare_section``; record the ``rule_type`` label and the
         ``EvidenceWeights`` score.
      3. Print per-class type accuracy, the binary reuse-vs-clean view, and
         how the evidence score separates reuse from clean.

    Raises:
        SystemExit: If fewer than two distinct docs are available (a "clean"
            counterpart cannot be drawn) or no source doc qualified.
    """
    # Imported once here rather than on every pass of the per-label loop.
    from collections import Counter

    # verdict floor is EVIDENCE_FLOOR=0.28 (pipeline); kept in one place here.
    evidence_floor = 0.28
    labels_used = ["clean", "clone", "find_replace", "mosaic"]

    rng = np.random.RandomState(7)
    docs = [d for d in plos_docs() if len(d.sections.get("introduction", "")) > 400]
    # The rejection sampling below would spin forever with fewer than two
    # distinct documents, so fail fast before loading any model.
    if len({d.doc_id for d in docs}) < 2:
        raise SystemExit("eval_typeclf: need at least two distinct PLOS docs "
                         "with a long enough introduction")

    idf, default_idf = build_idf(docs)
    enc = EnsembleEncoder.load(os.path.join(ROOT, "models", "siamese.npz"))
    weights = EvidenceWeights.load(os.path.join(ROOT, "models", "weights.json"))

    y_true, y_pred, scores_by_type = [], [], {c: [] for c in CLASSES}
    n = 0
    for src in docs:
        sec_text = src.sections["introduction"]
        src_sents = sentences(sec_text)
        if len(src_sents) < 4:
            continue
        # Draw an unrelated document (rejection sampling keeps the RNG stream
        # identical to earlier runs of this script).
        other = docs[rng.randint(len(docs))]
        while other.doc_id == src.doc_id:
            other = docs[rng.randint(len(docs))]
        # Order matters: synonymize and mosaic_mix consume the shared RNG.
        variants = {
            "clean": other.sections["introduction"],
            "clone": sec_text,
            "find_replace": synonymize(sec_text, rng, p=0.9),
            "mosaic": mosaic_mix(src_sents, sentences(other.sections["introduction"]), rng),
        }
        for label, text in variants.items():
            ev = compare_section(text, src.references, src, enc, idf, default_idf)
            cls, _prob, _dist = rule_type(ev["features"])
            score = weights.score(ev["features"])
            y_true.append(label)
            y_pred.append(cls)
            scores_by_type[label].append(score)
        n += 1
        print(f" ...{n} docs done", flush=True)

    if n == 0:
        # Every statistic below would be a mean over an empty list (NaN).
        raise SystemExit("eval_typeclf: no source doc had at least 4 "
                         "introduction sentences; nothing to evaluate")

    print(f"cross-distribution cases: {n} source docs x 4 variants = {len(y_true)}")
    print("\n1) TYPE-CLASSIFIER accuracy on REAL text (was overfit to gen_corpus):")
    for lab in labels_used:
        idxs = [i for i, t in enumerate(y_true) if t == lab]
        acc = np.mean([y_pred[i] == lab for i in idxs]) if idxs else 0.0
        # most common predictions for this true label
        common = Counter(y_pred[i] for i in idxs).most_common(2)
        print(f" {lab:13s} acc={acc:4.0%} predicted-as={common}")
    overall = np.mean([p == t for p, t in zip(y_pred, y_true)])
    print(f" OVERALL 5-way accuracy = {overall:.0%} (noisy: find_replace/mosaic"
          " are near-verbatim on real text, all confused AMONG reuse types)")

    # The verdict-relevant axis: REUSE vs CLEAN (mislabelling among reuse types is
    # cosmetic; calling clean a reuse-type, or reuse 'clean', is what hurts).
    print("\n verdict-relevant BINARY (reuse vs clean):")
    clean_idx = [i for i, t in enumerate(y_true) if t == "clean"]
    reuse_idx = [i for i, t in enumerate(y_true) if t != "clean"]
    clean_ok = np.mean([y_pred[i] == "clean" for i in clean_idx])
    reuse_ok = np.mean([y_pred[i] != "clean" for i in reuse_idx])
    print(f" clean kept clean (specificity) = {clean_ok:.0%}")
    print(f" reuse flagged as some reuse-type (recall) = {reuse_ok:.0%}")

    print("\n2) EVIDENCE SCORE separation (what the VERDICT uses):")
    clean_mu = np.mean(scores_by_type["clean"])
    for lab in labels_used:
        mu = np.mean(scores_by_type[lab])
        print(f" {lab:13s} mean score = {mu:.3f}")
    reuse = np.concatenate([scores_by_type[l] for l in ("clone", "find_replace", "mosaic")])
    print(f"\n clean mean {clean_mu:.3f} vs reuse mean {reuse.mean():.3f} "
          f"-> gap {reuse.mean()-clean_mu:+.3f}")
    # How cleanly does the evidence floor separate reuse from clean?
    clean_above = np.mean(np.array(scores_by_type["clean"]) >= evidence_floor)
    reuse_above = np.mean(reuse >= evidence_floor)
    print(f" reuse above evidence-floor({evidence_floor:.2f}): {reuse_above:.0%} "
          f"clean false-trigger: {clean_above:.0%}")
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()