"""Cross-distribution honesty check for the plagiarism-TYPE classifier.

The classifier was trained on synthetic gen_corpus variants; its reported
held-out accuracy used a split from the SAME generator (circular). Here we build
the same kind of (variant -> true type) cases from REAL PLOS academic text the
generator never produced, push them through the CURRENT evidence pipeline (now
the MiniLM+Siamese ensemble), and measure:

  1. TYPE accuracy per class (does the learned feature->type map generalise?)
  2. The thing the VERDICT actually depends on: does the evidence SCORE separate
     reuse (clone/find_replace/mosaic) from clean? (verdict = score + rules, the
     type label is secondary/cosmetic.)

If type accuracy collapses but score-separation holds, the honest takeaway is:
trust the verdict, treat the fine-grained type as a hint.
"""
import os
import sys

import numpy as np

# ROOT is the directory two levels above this file. It is pushed to the front of
# sys.path so the `plagdetect` imports below resolve when this script is run
# directly; this must happen before those imports are executed.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)
from plagdetect.evidence import EvidenceWeights, compare_section  # noqa: E402
from plagdetect.forensics import CLASSES, rule_type               # noqa: E402
from plagdetect.ingestion import Document                          # noqa: E402
from plagdetect.siamese import EnsembleEncoder                     # noqa: E402
from plagdetect.textutils import mosaic_mix, sentences, synonymize  # noqa: E402
from plagdetect.understanding import build_idf                     # noqa: E402

import json

# Path of the JSON-lines corpus consumed by plos_docs(); each record is read for
# its "doi", "title", "abstract", "fulltext" and "authors" fields.
PLOS = os.path.join(ROOT, "data", "oa_corpus", "plos_complex_systems.jsonl")


def plos_docs(limit=40, cap_sents=12):
    """Load real PLOS records from the JSONL corpus as ``Document`` objects.

    The worked sections are capped to a handful of sentences so the MiniLM
    ensemble stays fast (type signatures don't need the whole paper): the first
    ``cap_sents`` sentences of the full text become "introduction" and the next
    ``cap_sents`` become "results".

    Args:
        limit: slice bound applied to the records in file order (``None`` keeps
            every record). A non-negative limit also stops reading early.
        cap_sents: number of sentences kept for each derived section.

    Returns:
        list of ``Document`` built from the selected records, in file order.

    Raises:
        OSError: if the corpus file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks "doi" or "title".
    """
    records = []
    # Parse one JSON object per line inside a context manager so the handle is
    # always closed, and so blank/trailing lines cannot corrupt the parse (the
    # previous "join lines with commas into one array" trick failed on them).
    with open(PLOS, encoding="utf-8") as fh:
        for line in fh:
            # With a non-negative limit there is no reason to parse the rest.
            if limit is not None and 0 <= limit <= len(records):
                break
            if not line.strip():
                continue
            records.append(json.loads(line))

    docs = []
    # The slice is a no-op for the early-stopped case and keeps the historical
    # semantics for ``None`` / negative limits.
    for r in records[:limit]:
        ft = r.get("fulltext", "") or ""
        sents = sentences(ft)  # split once, reuse for both derived sections
        intro = " ".join(sents[:cap_sents])
        results = " ".join(sents[cap_sents:cap_sents * 2])
        sec = {"abstract": r.get("abstract", ""),
               "introduction": intro, "results": results}
        # NOTE(review): the author list is passed as `references`; kept as-is
        # because downstream code receives it via `src.references`.
        docs.append(Document(doc_id=r["doi"], title=r["title"], sections=sec,
                             references=list(r.get("authors", []))))
    return docs


def _word_dropout(text, rng, p=0.15):
    """Return ``text`` with some whitespace-separated words randomly removed.

    One ``rng.rand()`` draw is made per word, in order; a word is kept only when
    its draw is strictly greater than ``p``. If 3 or fewer words survive, the
    full original word sequence is used instead. Words are re-joined with
    single spaces either way.
    """
    tokens = text.split()
    survivors = []
    for tok in tokens:
        if rng.rand() > p:
            survivors.append(tok)
    # Too few survivors: fall back to the untouched word list.
    chosen = tokens if len(survivors) <= 3 else survivors
    return " ".join(chosen)


def main(evidence_floor=0.28):
    """Run the cross-distribution check and print the report to stdout.

    For every PLOS document whose introduction is longer than 400 characters
    and has at least 4 sentences, four variants are built (clean / clone /
    find_replace / mosaic), each is compared against the source document with
    ``compare_section``, typed with ``rule_type`` and scored with the loaded
    ``EvidenceWeights``. The report then prints:

      1. per-label and overall type accuracy, plus the binary reuse-vs-clean
         view of the same predictions;
      2. mean evidence score per label, the clean-vs-reuse gap, and the share
         of cases scoring at or above ``evidence_floor``.

    Args:
        evidence_floor: score threshold used for the "above floor" statistics.
            Defaults to 0.28, which the original comment attributes to the
            pipeline's EVIDENCE_FLOOR (not verifiable from this file).

    Returns:
        None. Exits early with a message when there is nothing to evaluate.
    """
    # Function-scope import, done once instead of on every report iteration.
    from collections import Counter

    rng = np.random.RandomState(7)
    docs = [d for d in plos_docs() if len(d.sections.get("introduction", "")) > 400]

    # The clean/mosaic variants need a *different* document. With fewer than two
    # distinct ids the rejection-sampling loop below would spin forever.
    if not any(d.doc_id != docs[0].doc_id for d in docs[1:]):
        print(f"need at least 2 distinct docs with a usable introduction "
              f"(found {len(docs)}); nothing to evaluate")
        return

    idf, default_idf = build_idf(docs)
    enc = EnsembleEncoder.load(os.path.join(ROOT, "models", "siamese.npz"))
    weights = EvidenceWeights.load(os.path.join(ROOT, "models", "weights.json"))

    labels_used = ["clean", "clone", "find_replace", "mosaic"]
    reuse_labels = ("clone", "find_replace", "mosaic")

    y_true, y_pred, scores_by_type = [], [], {c: [] for c in CLASSES}
    n = 0
    for src in docs:
        sec_text = src.sections["introduction"]
        src_sents = sentences(sec_text)
        if len(src_sents) < 4:
            continue
        # Rejection-sample a partner document; the draw order is kept exactly as
        # before so the seeded run stays reproducible.
        other = docs[rng.randint(len(docs))]
        while other.doc_id == src.doc_id:
            other = docs[rng.randint(len(docs))]
        other_intro = other.sections["introduction"]
        variants = {
            "clean": other_intro,
            "clone": sec_text,
            "find_replace": synonymize(sec_text, rng, p=0.9),
            "mosaic": mosaic_mix(src_sents, sentences(other_intro), rng),
        }
        for label, text in variants.items():
            ev = compare_section(text, src.references, src, enc, idf, default_idf)
            features = ev["features"]
            # rule_type returns a 3-tuple; only the predicted class is used here.
            cls, _prob, _dist = rule_type(features)
            y_true.append(label)
            y_pred.append(cls)
            scores_by_type[label].append(weights.score(features))
        n += 1
        print(f"  ...{n} docs done", flush=True)

    # Every doc may have been skipped by the sentence-count filter; averaging
    # empty lists would only yield nan plus RuntimeWarnings.
    if not y_true:
        print("no source doc had at least 4 introduction sentences; nothing to evaluate")
        return

    print(f"cross-distribution cases: {n} source docs x 4 variants = {len(y_true)}")
    print("\n1) TYPE-CLASSIFIER accuracy on REAL text (was overfit to gen_corpus):")
    for lab in labels_used:
        idxs = [i for i, t in enumerate(y_true) if t == lab]
        acc = np.mean([y_pred[i] == lab for i in idxs]) if idxs else 0.0
        # two most common predictions for this true label
        common = Counter(y_pred[i] for i in idxs).most_common(2)
        print(f"   {lab:13s} acc={acc:4.0%}   predicted-as={common}")
    overall = np.mean([p == t for p, t in zip(y_pred, y_true)])
    # NOTE(review): only 4 true labels are generated here; "5-way" presumably
    # refers to the size of CLASSES -- confirm against plagdetect.forensics.
    print(f"   OVERALL 5-way accuracy = {overall:.0%}  (noisy: find_replace/mosaic"
          " are near-verbatim on real text, all confused AMONG reuse types)")

    # The verdict-relevant axis: REUSE vs CLEAN (mislabelling among reuse types is
    # cosmetic; calling clean a reuse-type, or reuse 'clean', is what hurts).
    print("\n   verdict-relevant BINARY (reuse vs clean):")
    clean_idx = [i for i, t in enumerate(y_true) if t == "clean"]
    reuse_idx = [i for i, t in enumerate(y_true) if t != "clean"]
    clean_ok = np.mean([y_pred[i] == "clean" for i in clean_idx])
    reuse_ok = np.mean([y_pred[i] != "clean" for i in reuse_idx])
    print(f"     clean kept clean (specificity) = {clean_ok:.0%}")
    print(f"     reuse flagged as some reuse-type (recall) = {reuse_ok:.0%}")

    print("\n2) EVIDENCE SCORE separation (what the VERDICT uses):")
    clean_scores = np.array(scores_by_type["clean"])
    clean_mu = np.mean(clean_scores)
    for lab in labels_used:
        mu = np.mean(scores_by_type[lab])
        print(f"   {lab:13s} mean score = {mu:.3f}")
    reuse = np.concatenate([scores_by_type[lab] for lab in reuse_labels])
    print(f"\n   clean mean {clean_mu:.3f}  vs reuse mean {reuse.mean():.3f}  "
          f"-> gap {reuse.mean()-clean_mu:+.3f}")
    # How cleanly does the floor separate reuse from clean?
    clean_above = np.mean(clean_scores >= evidence_floor)
    reuse_above = np.mean(reuse >= evidence_floor)
    print(f"   reuse above evidence-floor({evidence_floor:g}): {reuse_above:.0%}   "
          f"clean false-trigger: {clean_above:.0%}")


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()