#!/usr/bin/env python3
"""
PubGuard gate for pipeline integration.

Reads extracted PDF text from stdin or a file, screens it, and:
  - Prints verdict JSON to STDERR (for debugging)
  - Prints PASS/FAIL to STDERR
  - Exits 0 (pass) or 1 (fail)

Usage:
    echo "$PDF_TEXT" | python3 pub_check/scripts/pubguard_gate.py
    python3 pub_check/scripts/pubguard_gate.py extracted_text.txt

    With no argument, or with "-" as the argument, the text is read from
    stdin; any other first argument is treated as a path to a text file.

Environment variables:
    PUBGUARD_MODELS_DIR  – Override models directory (not read by this
                           script; presumably consumed by PubGuardConfig —
                           verify against the pubguard package)
    PUBGUARD_STRICT      – Set to "0" to warn instead of gate: a failing
                           verdict is still reported but the exit code is 0.
                           Empty input always exits 1, strict or not.
"""

import json
import sys
import os
import logging

# Configured at import time so it is already in place when the pubguard
# package is imported (inside main()) and starts emitting log records.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)


def _read_input(argv):
    """Return the text to screen, from the file named in argv[1] or stdin."""
    if len(argv) > 1 and argv[1] != "-":
        # Explicit UTF-8 keeps decoding independent of the host locale;
        # errors="replace" means undecodable bytes never abort the gate.
        with open(argv[1], encoding="utf-8", errors="replace") as f:
            return f.read()
    return sys.stdin.read()


def _failure_reasons(verdict):
    """Return human-readable reasons explaining a failing verdict."""
    reasons = []
    doc_label = verdict["doc_type"]["label"]
    if doc_label != "scientific_paper":
        reasons.append(f"doc_type={doc_label}")
    if verdict["ai_generated"]["label"] == "ai_generated":
        reasons.append(
            f"ai_generated (score={verdict['ai_generated']['score']:.2f})"
        )
    if verdict["toxicity"]["label"] == "toxic":
        reasons.append(f"toxic (score={verdict['toxicity']['score']:.2f})")
    # A verdict can fail without tripping any of the checks above; make sure
    # the FAIL line never ends in a dangling dash.
    return reasons or ["no specific reason reported"]


def main():
    """Screen the input text with PubGuard and exit with the gate status.

    Writes the verdict as JSON plus a PASS/FAIL line to stderr, then
    terminates the process via sys.exit():

      * 0 – the verdict passed, or it failed while PUBGUARD_STRICT is "0"
      * 1 – the input was empty/whitespace-only, or the verdict failed in
            strict mode (the default)
    """
    text = _read_input(sys.argv)
    if not text.strip():
        print("PUBGUARD: Empty input", file=sys.stderr)
        sys.exit(1)

    strict = os.environ.get("PUBGUARD_STRICT", "1") != "0"

    # Imported here rather than at module level so that empty input is
    # rejected without first importing the screening library, and so the
    # import happens after logging has been configured.
    from pubguard import PubGuard, PubGuardConfig

    guard = PubGuard(config=PubGuardConfig())
    guard.initialize()
    verdict = guard.screen(text)

    print(json.dumps(verdict), file=sys.stderr)

    if verdict["pass"]:
        print("PUBGUARD: PASS", file=sys.stderr)
        sys.exit(0)

    print(
        f"PUBGUARD: FAIL — {', '.join(_failure_reasons(verdict))}",
        file=sys.stderr,
    )
    if strict:
        sys.exit(1)

    print("PUBGUARD: Running in non-strict mode, continuing...", file=sys.stderr)
    sys.exit(0)


if __name__ == "__main__":
    main()