jimnoneill committed on
Commit
5dbd484
·
verified ·
1 Parent(s): 4c8eee0

Upload scripts/pubguard_gate.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/pubguard_gate.py +97 -0
scripts/pubguard_gate.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PubGuard gate for run_pubverse_pipeline.sh integration.
4
+
5
+ Reads extracted PDF text from stdin or a file, screens it, and:
6
+ - Prints the error code string to STDOUT (always, for pipeline capture)
7
+ - Prints verdict JSON and diagnostics to STDERR
8
+ - Exits 0 (pass, non-fatal flag, or failure bypassed via PUBGUARD_STRICT=0) → pipeline continues
9
+ - Exits 1 (empty input, or fatal failure in strict mode) → pipeline halts with error code
10
+
11
+ Error code format:
12
+ PV-0[doc_type][ai_detect][toxicity] | NAME | snarky message
13
+ PV-0000 = scientific_paper + human + clean = PASS
14
+
15
+ Usage in run_pubverse_pipeline.sh:
16
+ PUBGUARD_RESULT=$(echo "$PDF_TEXT" | python3 pub_check/scripts/pubguard_gate.py)
17
+ PUBGUARD_EXIT=$?
18
+ echo "$PUBGUARD_RESULT" # Error code line on stdout
19
+
20
+ Environment variables:
21
+ PUBGUARD_MODELS_DIR – Override models directory
22
+ PUBGUARD_STRICT – Set to "0" to warn instead of gate (exit 0 always)
23
+ """
24
+
25
+ import json
26
+ import sys
27
+ import os
28
+ import logging
29
+
30
+ logging.basicConfig(
31
+ level=logging.WARNING,
32
+ format="%(asctime)s | %(levelname)s | %(message)s",
33
+ datefmt="%H:%M:%S",
34
+ )
35
+
36
+ from pubguard import PubGuard, PubGuardConfig
37
+ from pubguard.errors import (
38
+ build_pubguard_error,
39
+ empty_input_error,
40
+ gate_bypassed,
41
+ )
42
+
43
+
44
def _read_input_text(argv):
    """Return the text to screen, read from a file argument or from stdin.

    Args:
        argv: Command-line argument list (``sys.argv`` style). When
            ``argv[1]`` is present and is not ``"-"``, it is opened as a
            file path; otherwise stdin is read.

    Returns:
        The full input text as a ``str``.

    Raises:
        OSError: If the file named in ``argv[1]`` cannot be opened.
    """
    if len(argv) > 1 and argv[1] != "-":
        with open(argv[1], errors="replace") as f:
            return f.read()

    # The file branch tolerates undecodable bytes via errors="replace"; give
    # stdin the same tolerance so a stray byte in piped text does not abort
    # with a traceback before any error-code line reaches stdout.
    # getattr guard: sys.stdin may have been replaced by an object that has
    # no reconfigure() method.
    reconfigure = getattr(sys.stdin, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="replace")
    return sys.stdin.read()


def main():
    """Screen input text with PubGuard and exit with the gate status.

    Output contract:
        * stdout: the error-code line (``str(err)``), plus the bypass code
          line when a failure is bypassed in non-strict mode.
        * stderr: JSON diagnostics and a ``PUBGUARD: ...`` summary line.

    Exit status:
        * 1 when the input is empty or whitespace-only.
        * 1 when the verdict fails, strict mode is on, and ``err.fatal`` is
          truthy.
        * 0 in every other case (pass, non-strict bypass, non-fatal flag).

    Strict mode is on unless the ``PUBGUARD_STRICT`` environment variable is
    exactly ``"0"``.
    """
    text = _read_input_text(sys.argv)

    if not text.strip():
        # NOTE(review): this exits 1 regardless of PUBGUARD_STRICT, while the
        # module docstring says "exit 0 always" for PUBGUARD_STRICT=0 --
        # confirm that empty input is meant to stay fatal in non-strict mode.
        err = empty_input_error()
        print(str(err))  # stdout: error code line
        print(json.dumps(err.to_dict()), file=sys.stderr)
        sys.exit(1)

    # Configure
    config = PubGuardConfig()
    strict = os.environ.get("PUBGUARD_STRICT", "1") != "0"

    # Screen
    guard = PubGuard(config=config)
    guard.initialize()
    verdict = guard.screen(text)

    # Build structured error code from verdict
    err = build_pubguard_error(verdict)

    # STDOUT: always print the error code line (pipeline captures this)
    print(str(err))

    # STDERR: full verdict JSON for debugging
    print(json.dumps(verdict), file=sys.stderr)

    # Gate decision
    if verdict["pass"]:
        print(f"PUBGUARD: PASS ({err.code})", file=sys.stderr)
        sys.exit(0)

    print(f"PUBGUARD: FAIL ({err.code})", file=sys.stderr)

    if strict and err.fatal:
        sys.exit(1)

    if not strict:
        bypass = gate_bypassed()
        print(str(bypass))  # Also print bypass code to stdout
        print(f"PUBGUARD: {bypass.message}", file=sys.stderr)
        sys.exit(0)

    # Strict mode with a non-fatal flag (e.g. AI detection, toxicity):
    # warn but let the pipeline proceed.
    print("PUBGUARD: WARNING (non-fatal flag, proceeding)", file=sys.stderr)
    sys.exit(0)
94
+
95
+
96
+ if __name__ == "__main__":
97
+ main()