"""Fast, NETWORK-FREE AI evaluation: extract each draft's text and run the AI ensemble only (no scraping), dumping every detector score + the fused score next to the Turnitin AI ground truth. This is the cheap signal for calibrating the AI head (detect_ai is deterministic and text-only). Writes data/ai_features.json and prints a table. """ import json, os, sys ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, ROOT) from plagdetect.webpipeline import extract_text, split_body_references # noqa from plagdetect.aidetect import detect_ai # noqa from plagdetect import semantic # noqa from plagdetect.textutils import sentences, tokenize # noqa DSET = os.path.join(ROOT, "DATASET FOR training of turnitin") GT = os.path.join(ROOT, "data", "turnitin_groundtruth.json") OUT = os.path.join(ROOT, "data", "ai_features.json") def ai_label(rec): a = (rec.get("ai") or {}).get("ai_pct") return a def main(): gt = json.load(open(GT, encoding="utf-8")) rows = [] for rec in gt: draft = rec.get("draft") ai_t = ai_label(rec) if not draft or ai_t is None: continue path = os.path.join(DSET, draft) try: _title, text = extract_text(path) body, _ = split_body_references(text) sents = [s for s in sentences(body) if len(tokenize(s)) >= 6] embs = semantic.embed(sents) if semantic.available() else None r = detect_ai(body, embs=embs) except Exception as e: print(f"{draft[:30]:31s} ERROR {e}") continue dets = {d["name"]: d["score"] for d in r["detectors"]} row = {"draft": draft, "ai_truth": ai_t, "ai_truth_num": (10 if ai_t == "*" else ai_t), # '*' = <20, proxy 10 "suppressed": ai_t == "*", "score": r["score"], "band": r["band"], "fusion": r["fusion"], "p_ai": (r.get("conformal") or {}).get("p_ai"), "detectors": dets} rows.append(row) json.dump(rows, open(OUT, "w", encoding="utf-8"), indent=2) det_str = " ".join(f"{k[:4]}={v:.0f}" for k, v in dets.items()) print(f"{draft[:28]:29s} aiT={str(ai_t):>3s} score={r['score']:>5.1f} " f"p={row['p_ai']} [{det_str}]") print(f"\nwrote {OUT} ({len(rows)} papers)") if __name__ == "__main__": main()