Plaiglab / scripts /test_web.py
SanidhyaDhangar's picture
PlaigLab — Hugging Face Space (Docker) clean deploy
ebebfe8
Raw
History Blame Contribute Delete
1.63 kB
"""Smoke test for the real-world web pipeline against live academic APIs."""
import os
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from plagdetect.webpipeline import analyze_document # noqa: E402
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def print_report(r):
    """Print a human-readable summary of an ``analyze_document`` result.

    ``r`` is indexed like a mapping. The keys read here are: ``verdict``,
    ``verdict_reasons``, ``similarity_index``, ``plagiarism_risk``,
    ``ai_score``, ``ai_band``, ``ai_detectors``, ``semantic_enabled``,
    ``obfuscation``, ``coverage``, ``case_id``, ``near_misses``,
    ``candidates_screened``, ``elapsed_seconds`` and ``sources``.
    A missing key raises ``KeyError`` so that schema drift in the pipeline
    makes the smoke test fail loudly instead of printing a partial report.
    """
    print()
    print("VERDICT:", r["verdict"], "|", "; ".join(r["verdict_reasons"]))
    print("similarity_index:", r["similarity_index"], "% risk:", r["plagiarism_risk"])
    print("ai_score:", r["ai_score"], r["ai_band"],
          "| detectors:", {d["name"]: d["score"] for d in r["ai_detectors"]})
    print("semantic:", r["semantic_enabled"],
          "| obfuscation spoof:", r["obfuscation"]["spoof_suspected"])
    print("coverage providers:", r["coverage"]["providers"])
    print("deep-read PDFs:", len(r["coverage"]["deep_read_pdfs"]),
          "| manifest case:", r["case_id"])
    print("near_misses:", [(n["title"][:40], n["match_percent"]) for n in r["near_misses"]])
    print("candidates:", r["candidates_screened"], "| elapsed:", r["elapsed_seconds"], "s")

    sources = r["sources"]
    # Only the first six sources are listed to keep the console output short.
    for s in sources[:6]:
        print(f" {s['match_percent']:>5}% exact={s['exact_fingerprint_overlap']:>4}% "
              f"deep={s['deep_scraped']} {s['title'][:64]} "
              f"({s['provider']}, {s['year']})")

    # Show the first sentence pair of the first source, when there is one.
    if sources and sources[0]["top_pairs"]:
        p = sources[0]["top_pairs"][0]
        print("\nstrongest pair sim", p["similarity"])
        print(" SUB:", p["submitted"][:120])
        print(" SRC:", p["source"][:120])


def main(argv=None):
    """Run the web pipeline on one document and print the report.

    ``argv`` defaults to ``sys.argv``. ``argv[1]``, when present, is the
    path of the document to analyze; otherwise ``data/test_real.txt``
    under ``ROOT`` is used.
    """
    argv = sys.argv if argv is None else argv
    path = argv[1] if len(argv) > 1 else os.path.join(ROOT, "data", "test_real.txt")
    r = analyze_document(path, progress=lambda m: print(" >", m))
    print_report(r)


# Guard the entry point: this file's name matches pytest's default
# ``test_*.py`` collection pattern, so running at import time would make a
# plain ``pytest`` invocation execute the pipeline with pytest's own
# command-line arguments as the document path.
if __name__ == "__main__":
    main()