Plaiglab / scripts /eval_ai_only.py
SanidhyaDhangar's picture
PlaigLab — Hugging Face Space (Docker) clean deploy
ebebfe8
Raw
History Blame Contribute Delete
2.4 kB
"""Fast, NETWORK-FREE AI evaluation: extract each draft's text and run the AI
ensemble only (no scraping), dumping every detector score + the fused score
next to the Turnitin AI ground truth. This is the cheap signal for calibrating
the AI head (detect_ai is deterministic and text-only).
Writes data/ai_features.json and prints a table.
"""
import json, os, sys
# Directory two levels above this script's absolute path (the parent of the
# directory that contains this file).
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put that directory first on sys.path before the plagdetect imports below run.
sys.path.insert(0, ROOT)
from plagdetect.webpipeline import extract_text, split_body_references # noqa
from plagdetect.aidetect import detect_ai # noqa
from plagdetect import semantic # noqa
from plagdetect.textutils import sentences, tokenize # noqa
# Directory that main() joins each record's draft file name onto.
DSET = os.path.join(ROOT, "DATASET FOR training of turnitin")
# JSON file of ground-truth records that main() loads.
GT = os.path.join(ROOT, "data", "turnitin_groundtruth.json")
# JSON file that main() writes the per-paper result rows to.
OUT = os.path.join(ROOT, "data", "ai_features.json")
def ai_label(rec):
    """Return the value stored at ``rec["ai"]["ai_pct"]``, or None.

    None is returned when the record has no "ai" entry, when that entry is
    falsy (e.g. None or an empty dict), or when it has no "ai_pct" key.
    """
    ai_section = rec.get("ai")
    if not ai_section:
        return None
    return ai_section.get("ai_pct")
def main():
    """Run the AI ensemble on every ground-truth draft and dump the scores.

    Loads the ground-truth records from GT. For each record that has both a
    draft file name and an AI label, the draft text is extracted, the body
    part returned by ``split_body_references`` is kept, sentence embeddings
    are computed when ``semantic.available()`` is true, and ``detect_ai`` is
    run on the body. One table line is printed per scored paper.

    OUT is rewritten after every scored paper, so the rows collected so far
    are on disk if the run is interrupted. A draft that raises at any stage
    is reported on stdout and skipped.
    """
    # Context managers close the handles deterministically. OUT is reopened
    # once per paper, so a bare open() would leak one write handle each time.
    with open(GT, encoding="utf-8") as gt_file:
        gt = json.load(gt_file)
    rows = []
    for rec in gt:
        draft = rec.get("draft")
        ai_t = ai_label(rec)
        if not draft or ai_t is None:
            continue
        path = os.path.join(DSET, draft)
        try:
            _title, text = extract_text(path)
            body, _ = split_body_references(text)
            # Only sentences with at least 6 tokens are embedded.
            sents = [s for s in sentences(body) if len(tokenize(s)) >= 6]
            embs = semantic.embed(sents) if semantic.available() else None
            r = detect_ai(body, embs=embs)
        except Exception as e:  # deliberate best-effort: report and move on
            print(f"{draft[:30]:31s} ERROR {e}")
            continue
        dets = {d["name"]: d["score"] for d in r["detectors"]}
        row = {"draft": draft, "ai_truth": ai_t,
               "ai_truth_num": (10 if ai_t == "*" else ai_t),  # '*' = <20, proxy 10
               "suppressed": ai_t == "*",
               "score": r["score"], "band": r["band"], "fusion": r["fusion"],
               "p_ai": (r.get("conformal") or {}).get("p_ai"),
               "detectors": dets}
        rows.append(row)
        # Checkpoint: rewrite the whole result file with the rows so far.
        with open(OUT, "w", encoding="utf-8") as out_file:
            json.dump(rows, out_file, indent=2)
        det_str = " ".join(f"{k[:4]}={v:.0f}" for k, v in dets.items())
        print(f"{draft[:28]:29s} aiT={str(ai_t):>3s} score={r['score']:>5.1f} "
              f"p={row['p_ai']} [{det_str}]")
    print(f"\nwrote {OUT} ({len(rows)} papers)")
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()