Spaces:
Sleeping
Sleeping
| """Fast, NETWORK-FREE AI evaluation: extract each draft's text and run the AI | |
| ensemble only (no scraping), dumping every detector score + the fused score | |
| next to the Turnitin AI ground truth. This is the cheap signal for calibrating | |
| the AI head (detect_ai is deterministic and text-only). | |
| Writes data/ai_features.json and prints a table. | |
| """ | |
| import json, os, sys | |
| ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| sys.path.insert(0, ROOT) | |
| from plagdetect.webpipeline import extract_text, split_body_references # noqa | |
| from plagdetect.aidetect import detect_ai # noqa | |
| from plagdetect import semantic # noqa | |
| from plagdetect.textutils import sentences, tokenize # noqa | |
# Filesystem locations used by this script, all built from ROOT.
# DSET: directory that each record's "draft" filename is joined onto.
DSET = os.path.join(ROOT, "DATASET FOR training of turnitin")
# GT: JSON file loaded by main() as the list of ground-truth records.
GT = os.path.join(ROOT, "data", "turnitin_groundtruth.json")
# OUT: JSON file main() writes the per-paper rows to.
OUT = os.path.join(ROOT, "data", "ai_features.json")
def ai_label(rec):
    """Return the ``"ai_pct"`` value stored under the record's ``"ai"`` entry.

    Returns None when the record has no ``"ai"`` entry, when that entry is
    falsy (e.g. ``None`` or an empty dict), or when it has no ``"ai_pct"`` key.
    """
    ai_info = rec.get("ai")
    if not ai_info:
        return None
    return ai_info.get("ai_pct")
def main():
    """Score every labelled draft with the AI ensemble and dump the results.

    Loads the ground-truth records from ``GT`` and skips any record that has
    no ``"draft"`` name or no AI label. For each remaining draft it extracts
    the text, drops the references section, embeds the sentences that have at
    least 6 tokens (only when ``semantic.available()``), and runs
    ``detect_ai`` on the body. A draft whose processing raises is reported on
    stdout and skipped. Each successful result is appended to the row list,
    the whole list is written to ``OUT``, and one table line is printed.
    """
    # Close the ground-truth file deterministically instead of leaking it.
    with open(GT, encoding="utf-8") as gt_file:
        gt = json.load(gt_file)

    rows = []
    for rec in gt:
        draft = rec.get("draft")
        ai_t = ai_label(rec)
        if not draft or ai_t is None:
            continue
        path = os.path.join(DSET, draft)
        try:
            _title, text = extract_text(path)
            body, _ = split_body_references(text)
            sents = [s for s in sentences(body) if len(tokenize(s)) >= 6]
            embs = semantic.embed(sents) if semantic.available() else None
            r = detect_ai(body, embs=embs)
        except Exception as e:
            # Per-draft boundary: report the failure and keep going so one
            # bad file does not abort the whole batch.
            print(f"{draft[:30]:31s} ERROR {e}")
            continue
        dets = {d["name"]: d["score"] for d in r["detectors"]}
        row = {"draft": draft, "ai_truth": ai_t,
               "ai_truth_num": (10 if ai_t == "*" else ai_t),  # '*' = <20, proxy 10
               "suppressed": ai_t == "*",
               "score": r["score"], "band": r["band"], "fusion": r["fusion"],
               "p_ai": (r.get("conformal") or {}).get("p_ai"),
               "detectors": dets}
        rows.append(row)
        # NOTE(review): the original layout was flattened; the dump and the
        # table line are kept inside the loop because they use per-paper
        # values (presumably so partial results survive an interruption) --
        # confirm. The `with` block guarantees the handle is flushed and
        # closed on every iteration rather than left to garbage collection.
        with open(OUT, "w", encoding="utf-8") as out_file:
            json.dump(rows, out_file, indent=2)
        det_str = " ".join(f"{k[:4]}={v:.0f}" for k, v in dets.items())
        print(f"{draft[:28]:29s} aiT={str(ai_t):>3s} score={r['score']:>5.1f} "
              f"p={row['p_ai']} [{det_str}]")
    print(f"\nwrote {OUT} ({len(rows)} papers)")
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()