pubguard-classifier / scripts /pubguard_gate.py
jimnoneill's picture
Clean gate script — no errors module dependency
b91194e verified
#!/usr/bin/env python3
"""
PubGuard gate for pipeline integration.
Reads extracted PDF text from stdin or a file, screens it, and:
- Prints verdict JSON to STDERR (for debugging)
- Prints PASS/FAIL to STDERR
- Exits 0 (pass) or 1 (fail)
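Usage:
    echo "$PDF_TEXT" | python3 pub_check/scripts/pubguard_gate.py
    python3 pub_check/scripts/pubguard_gate.py extracted.txt
(Pass "-" or no argument to read from stdin.)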
Environment variables:
PUBGUARD_MODELS_DIR – Override models directory
PUBGUARD_STRICT – Set to "0" to warn instead of gate (exit 0 always)
"""
import json
import sys
import os
import logging
# Root logger setup: only WARNING and above are emitted, each record prefixed
# with a short HH:MM:SS timestamp and its level name.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.WARNING,
)
from pubguard import PubGuard, PubGuardConfig
def _read_input(argv):
    """Return the text to screen.

    Reads the file named by ``argv[1]`` when it is present and not ``"-"``;
    otherwise reads all of stdin.
    """
    if len(argv) > 1 and argv[1] != "-":
        # No explicit encoding is given, so the platform default applies;
        # undecodable bytes are replaced instead of raising.
        with open(argv[1], errors="replace") as f:
            return f.read()
    return sys.stdin.read()


def _failure_reasons(verdict):
    """Return short human-readable reasons explaining a failing verdict."""
    reasons = []
    doc_label = verdict["doc_type"]["label"]
    if doc_label != "scientific_paper":
        reasons.append(f"doc_type={doc_label}")
    if verdict["ai_generated"]["label"] == "ai_generated":
        reasons.append(
            f"ai_generated (score={verdict['ai_generated']['score']:.2f})"
        )
    if verdict["toxicity"]["label"] == "toxic":
        reasons.append(f"toxic (score={verdict['toxicity']['score']:.2f})")
    # A verdict can fail without matching any of the checks above; never
    # emit a FAIL line with nothing after the dash.
    return reasons or ["no specific reason reported"]


def _exit_after_failure(strict):
    """Exit 1 in strict mode; otherwise note non-strict mode and exit 0."""
    if strict:
        sys.exit(1)
    print("PUBGUARD: Running in non-strict mode, continuing...", file=sys.stderr)
    sys.exit(0)


def main():
    """Screen the input text with PubGuard and exit with the gate status.

    Input comes from the file given as the first command-line argument, or
    from stdin when no argument (or ``-``) is given. The verdict is printed
    as JSON to stderr, followed by a PASS or FAIL line.

    Exit status:
        0 -- the verdict passed, or PUBGUARD_STRICT is "0" (warn-only mode).
        1 -- the input was empty or the verdict failed, in strict mode.

    This function always terminates the process via ``sys.exit``.
    """
    # Read the flag first so that every failure path, including empty input,
    # honours warn-only mode as the module docstring promises.
    strict = os.environ.get("PUBGUARD_STRICT", "1") != "0"

    text = _read_input(sys.argv)
    if not text.strip():
        print("PUBGUARD: Empty input", file=sys.stderr)
        _exit_after_failure(strict)

    config = PubGuardConfig()
    guard = PubGuard(config=config)
    guard.initialize()
    verdict = guard.screen(text)

    # The full verdict goes to stderr for debugging.
    print(json.dumps(verdict), file=sys.stderr)

    if verdict["pass"]:
        print("PUBGUARD: PASS", file=sys.stderr)
        sys.exit(0)

    reasons = _failure_reasons(verdict)
    print(f"PUBGUARD: FAIL — {', '.join(reasons)}", file=sys.stderr)
    _exit_after_failure(strict)
# Run the gate only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()