"""Train all learned components and persist them to models/:

  1. Siamese semantic encoder (DL, contrastive loss, pure-numpy backprop)
  2. DQN investigation planner (RL, replay buffer + target network)
  3. Plagiarism-type classifier (DL, softmax MLP over evidence features)
  4. Causal query bandit + Bayesian evidence weights (initial priors)
"""
import json
import os
import sys
import time

import numpy as np

# Repository root: the parent of the directory holding this script. It is
# computed once and put on sys.path before the project imports below
# (presumably so `plagdetect` is importable when the script is run directly).
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from plagdetect.causal import CausalQueryBandit  # noqa: E402
from plagdetect.evidence import EvidenceWeights, compare_section  # noqa: E402
from plagdetect.forensics import CLASSES, TypeClassifier  # noqa: E402
from plagdetect.ingestion import load_corpus  # noqa: E402
from plagdetect.rl_planner import train_dqn  # noqa: E402
from plagdetect.siamese import SiameseEncoder  # noqa: E402
from plagdetect.textutils import mosaic_mix, sentences, synonymize  # noqa: E402
from plagdetect.understanding import build_idf  # noqa: E402
from gen_corpus import TOPICS, render_sections  # noqa: E402

CORPUS_DIR = os.path.join(ROOT, "data", "corpus")
MODELS_DIR = os.path.join(ROOT, "models")


def word_dropout(text, rng, p=0.15):
    """Return *text* with words randomly removed.

    The text is split on whitespace and one ``rng.rand()`` draw is made per
    word; a word is kept when its draw is greater than *p*. If three or fewer
    words survive, all original words are kept instead. The result is joined
    with single spaces.
    """
    words = text.split()
    kept = [w for w in words if rng.rand() > p]
    return " ".join(kept if len(kept) > 3 else words)


def build_siamese_pairs(corpus, rng, n_pos=1500, n_neg=1500):
    """Build ``(sentence_a, sentence_b, label)`` training pairs for the encoder.

    Sentences are pooled from the introduction, literature_review,
    methodology and results sections of every document (missing sections are
    treated as empty text).

    * Positive pairs (label 1): a random sentence paired with a perturbed
      copy -- ``synonymize(s, rng, p=0.7)`` or ``word_dropout(s, rng)``,
      chosen by a coin flip. Exactly *n_pos* are produced.
    * Negative pairs (label 0): two independently drawn sentences, kept only
      when their first 40 characters differ. *n_neg* draws are attempted and
      rejected draws are not retried, so fewer than *n_neg* negatives may be
      returned.

    Raises:
        ValueError: if pairs are requested but the corpus yields no sentences.
    """
    all_sents = []
    for d in corpus:
        for sec in ("introduction", "literature_review", "methodology", "results"):
            all_sents.extend(sentences(d.sections.get(sec, "")))

    # Without this guard rng.randint(0) fails with an opaque "high <= 0".
    if not all_sents and (n_pos > 0 or n_neg > 0):
        raise ValueError(
            "cannot build siamese pairs: corpus contains no sentences"
        )
    n_sents = len(all_sents)

    pairs = []
    for _ in range(n_pos):
        s = all_sents[rng.randint(n_sents)]
        t = synonymize(s, rng, p=0.7) if rng.rand() < 0.5 else word_dropout(s, rng)
        pairs.append((s, t, 1))
    for _ in range(n_neg):
        a = all_sents[rng.randint(n_sents)]
        b = all_sents[rng.randint(n_sents)]
        # Skip identical / near-identical draws (same 40-character prefix).
        if a[:40] != b[:40]:
            pairs.append((a, b, 0))
    return pairs


def build_classifier_data(corpus, encoder, idf, default_idf, rng, n_per=110):
    """Self-consistent training: transforms -> real evidence features -> label.

    For each of *n_per* iterations a random source document and section are
    picked, five variants of that section are produced (clean, clone,
    find_replace, mosaic, idea) and each variant is scored against the source
    with ``compare_section``; the resulting ``ev["features"]`` becomes one
    sample labelled with the variant's index in ``CLASSES``.

    Per-document ``topic`` and ``vals`` are read from the raw ``*.json`` files
    in ``CORPUS_DIR``, keyed by file name without the extension (which must
    match ``doc_id``).

    Returns:
        ``(X, y)`` as numpy arrays built from the collected features / labels.

    Raises:
        ValueError: if samples are requested but the corpus is empty, or no
            topic other than the source document's topic exists (the "clean"
            and "mosaic" variants need a document from a different topic).
    """
    raws = {}
    for fn in os.listdir(CORPUS_DIR):
        if fn.endswith(".json"):
            with open(os.path.join(CORPUS_DIR, fn), "r", encoding="utf-8") as f:
                raws[fn[:-5]] = json.load(f)

    by_topic = {}
    for d in corpus:
        by_topic.setdefault(raws[d.doc_id]["topic"], []).append(d)

    X, y = [], []
    secs = ["literature_review", "methodology", "results"]
    docs = list(corpus)
    if n_per > 0 and not docs:
        raise ValueError("cannot build classifier data: corpus is empty")

    for it in range(n_per):
        src = docs[rng.randint(len(docs))]
        topic = raws[src.doc_id]["topic"]
        sec = secs[rng.randint(len(secs))]
        src_text = src.sections[sec]
        src_sents = sentences(src_text)

        other_topics = [k for k in by_topic if k != topic]
        if not other_topics:
            # Previously surfaced as an opaque rng.randint(0) error.
            raise ValueError(
                "cannot build classifier data: need documents from at least "
                "two topics"
            )
        other_topic = other_topics[rng.randint(len(other_topics))]
        other = by_topic[other_topic][rng.randint(len(by_topic[other_topic]))]

        # NOTE: entries are evaluated in order and several consume `rng`;
        # reordering them changes the generated data set.
        variants = {
            "clean": (other.sections[sec], other.references),
            "clone": (src_text, list(src.references)),
            "find_replace": (
                synonymize(src_text, rng, p=0.9),
                list(src.references[:6]),
            ),
            "mosaic": (
                mosaic_mix(src_sents, sentences(other.sections[sec]), rng),
                list(src.references[:5]) + list(other.references[:3]),
            ),
            "idea": (
                render_sections(
                    TOPICS[topic],
                    raws[src.doc_id]["vals"],
                    np.random.RandomState(rng.randint(10_000)),
                    alt=True,
                )[sec],
                list(src.references[:6]),
            ),
        }
        for label, (text, refs) in variants.items():
            ev = compare_section(text, refs, src, encoder, idf, default_idf)
            X.append(ev["features"])
            y.append(CLASSES.index(label))

        if (it + 1) % 25 == 0:
            # Derive the per-iteration sample count from the variants table
            # instead of hard-coding it.
            n_variants = len(variants)
            print(f" classifier data {n_variants * (it + 1)}/{n_variants * n_per} samples")
    return np.array(X), np.array(y)


def main():
    """Train every component in sequence and write the artifacts to MODELS_DIR.

    Steps: (1) Siamese encoder -> siamese.npz, (2) DQN planner -> dqn.npz,
    (3) plagiarism-type classifier -> classifier.npz with an 85/15 shuffled
    train/held-out split, (4) default-constructed causal bandit and evidence
    weights -> bandit.json / weights.json. Progress is printed to stdout.
    A single ``RandomState(5)`` drives data generation and the split.
    """
    os.makedirs(MODELS_DIR, exist_ok=True)
    rng = np.random.RandomState(5)
    corpus = load_corpus(CORPUS_DIR)
    idf, default_idf = build_idf(corpus)

    print(f"[1/4] Siamese encoder -- corpus={len(corpus)} docs")
    t0 = time.time()
    enc = SiameseEncoder()
    enc.train(build_siamese_pairs(corpus, rng), epochs=5)
    enc.save(os.path.join(MODELS_DIR, "siamese.npz"))
    print(f" saved siamese.npz ({time.time() - t0:.1f}s)")

    print("[2/4] DQN investigation planner")
    t0 = time.time()
    qnet, returns = train_dqn(episodes=900)
    qnet.save(os.path.join(MODELS_DIR, "dqn.npz"))
    print(f" saved dqn.npz, final avg return "
          f"{np.mean(returns[-100:]):+.2f} ({time.time() - t0:.1f}s)")

    print("[3/4] Plagiarism-type classifier (features via real evidence pipeline)")
    t0 = time.time()
    X, y = build_classifier_data(corpus, enc, idf, default_idf, rng)
    # Shuffle, then use the first 85% for training and the rest for held-out
    # evaluation.
    split = int(0.85 * len(X))
    perm = rng.permutation(len(X))
    Xtr, ytr = X[perm[:split]], y[perm[:split]]
    Xte, yte = X[perm[split:]], y[perm[split:]]
    clf = TypeClassifier()
    clf.train(Xtr, ytr, epochs=60)
    # The first element of predict()'s result is compared to the true label
    # through its position in CLASSES.
    preds = [clf.predict(x)[0] for x in Xte]
    acc = float(np.mean([CLASSES.index(p) == t for p, t in zip(preds, yte)]))
    clf.save(os.path.join(MODELS_DIR, "classifier.npz"))
    print(f" saved classifier.npz, held-out acc={acc:.3f} "
          f"({time.time() - t0:.1f}s)")

    print("[4/4] Causal bandit priors + Bayesian evidence weights")
    CausalQueryBandit().save(os.path.join(MODELS_DIR, "bandit.json"))
    EvidenceWeights().save(os.path.join(MODELS_DIR, "weights.json"))
    print(" saved bandit.json, weights.json")
    print("Training complete.")


if __name__ == "__main__":
    main()