"""Smoke test for the real-world web pipeline against live academic APIs.

Usage: run this script with an optional document path as the first
command-line argument.  Without one, ``data/test_real.txt`` under the
repository root is analysed.  The script prints a human-readable summary
of the dictionary returned by ``analyze_document``.
"""
import os
import sys

# Repository root is the parent of this script's directory.  It goes first on
# sys.path so the in-tree ``plagdetect`` package is importable.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from plagdetect.webpipeline import analyze_document  # noqa: E402


def _show_progress(m):
    """Echo one pipeline progress message to stdout."""
    print(" >", m)


# Pick the document: explicit CLI argument wins, otherwise the bundled sample.
if len(sys.argv) > 1:
    path = sys.argv[1]
else:
    path = os.path.join(ROOT, "data", "test_real.txt")

r = analyze_document(path, progress=_show_progress)

# --- headline verdict -------------------------------------------------------
print()
verdict = r["verdict"]
reasons_text = "; ".join(r["verdict_reasons"])
print("VERDICT:", verdict, "|", reasons_text)

similarity = r["similarity_index"]
risk = r["plagiarism_risk"]
print("similarity_index:", similarity, "% risk:", risk)

# --- AI-detection summary: one score per detector, keyed by detector name ---
ai_score = r["ai_score"]
ai_band = r["ai_band"]
detector_scores = {det["name"]: det["score"] for det in r["ai_detectors"]}
print("ai_score:", ai_score, ai_band, "| detectors:", detector_scores)

semantic_on = r["semantic_enabled"]
spoof_flag = r["obfuscation"]["spoof_suspected"]
print("semantic:", semantic_on, "| obfuscation spoof:", spoof_flag)

# --- coverage ---------------------------------------------------------------
print("coverage providers:", r["coverage"]["providers"])

deep_pdf_count = len(r["coverage"]["deep_read_pdfs"])
case_id = r["case_id"]
print("deep-read PDFs:", deep_pdf_count, "| manifest case:", case_id)

# Titles are truncated to 40 characters to keep the list on one line.
near_miss_rows = [
    (miss["title"][:40], miss["match_percent"]) for miss in r["near_misses"]
]
print("near_misses:", near_miss_rows)

screened = r["candidates_screened"]
elapsed = r["elapsed_seconds"]
print("candidates:", screened, "| elapsed:", elapsed, "s")

# --- per-source table: only the first six entries of r["sources"] -----------
for src in r["sources"][:6]:
    row = (
        f" {src['match_percent']:>5}% exact={src['exact_fingerprint_overlap']:>4}% "
        f"deep={src['deep_scraped']} {src['title'][:64]} "
        f"({src['provider']}, {src['year']})"
    )
    print(row)

# --- first sentence pair of the first source, if there is one ---------------
sources = r["sources"]
if sources and sources[0]["top_pairs"]:
    best_pair = sources[0]["top_pairs"][0]
    print("\nstrongest pair sim", best_pair["similarity"])
    print(" SUB:", best_pair["submitted"][:120])
    print(" SRC:", best_pair["source"][:120])