File size: 2,063 Bytes
5dbd484
 
b91194e
5dbd484
 
b91194e
 
 
5dbd484
b91194e
 
5dbd484
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b91194e
5dbd484
 
 
 
 
 
 
 
 
 
 
 
b91194e
5dbd484
 
b91194e
 
 
 
 
 
 
 
 
5dbd484
b91194e
5dbd484
 
b91194e
5dbd484
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python3
"""
PubGuard gate for pipeline integration.

Reads extracted PDF text from stdin or a file, screens it, and:
  - Prints verdict JSON to STDERR (for debugging)
  - Prints PASS/FAIL to STDERR
  - Exits 0 (pass) or 1 (fail)

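Usage:
    echo "$PDF_TEXT" | python3 pub_check/scripts/pubguard_gate.py
    python3 pub_check/scripts/pubguard_gate.py extracted.txt   # a file path; "-" or no argument reads stdin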

Environment variables:
    PUBGUARD_MODELS_DIR  – Override models directory
    PUBGUARD_STRICT      – Set to "0" to warn instead of gate (a FAIL verdict then exits 0; empty input still exits 1)
"""

import json
import sys
import os
import logging

# Configure the root logger (WARNING and above, timestamped) before the
# pubguard import below.
# NOTE(review): presumably placed ahead of the import so that anything logged
# while pubguard loads already uses this format — confirm before reordering.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)

from pubguard import PubGuard, PubGuardConfig


def _read_input(argv):
    """Return the text to screen, read from the file in ``argv[1]`` or stdin.

    A missing argument, or a literal ``"-"``, selects stdin.
    """
    if len(argv) > 1 and argv[1] != "-":
        # Decode explicitly as UTF-8 so the result does not depend on the host
        # locale; undecodable bytes are replaced rather than raising.
        with open(argv[1], encoding="utf-8", errors="replace") as f:
            return f.read()
    return sys.stdin.read()


def _failure_reasons(verdict):
    """Return human-readable reasons explaining why *verdict* failed."""
    reasons = []
    if verdict["doc_type"]["label"] != "scientific_paper":
        reasons.append(f"doc_type={verdict['doc_type']['label']}")
    if verdict["ai_generated"]["label"] == "ai_generated":
        reasons.append(f"ai_generated (score={verdict['ai_generated']['score']:.2f})")
    if verdict["toxicity"]["label"] == "toxic":
        reasons.append(f"toxic (score={verdict['toxicity']['score']:.2f})")
    return reasons


def main():
    """Screen the input text with PubGuard and exit with the gate status.

    Input comes from the file named by the first command-line argument, or
    from stdin when there is no argument or the argument is ``"-"``.

    All output goes to stderr: the verdict as JSON, followed by a
    ``PUBGUARD: PASS`` or ``PUBGUARD: FAIL — <reasons>`` line.

    Exit status (always via ``sys.exit``):
        0 – the verdict passed, or it failed while ``PUBGUARD_STRICT`` is "0".
        1 – the input was empty/whitespace-only, or the verdict failed in
            strict mode (the default).
    """
    text = _read_input(sys.argv)

    # Empty input is rejected before the strict flag is consulted, so it exits
    # 1 even in non-strict mode.
    if not text.strip():
        print("PUBGUARD: Empty input", file=sys.stderr)
        sys.exit(1)

    config = PubGuardConfig()
    # Any value other than the literal "0" keeps the gate strict.
    strict = os.environ.get("PUBGUARD_STRICT", "1") != "0"

    guard = PubGuard(config=config)
    guard.initialize()
    verdict = guard.screen(text)

    print(json.dumps(verdict), file=sys.stderr)

    if verdict["pass"]:
        print("PUBGUARD: PASS", file=sys.stderr)
        sys.exit(0)

    # A verdict can fail without matching any of the three checks above; give
    # the message a placeholder instead of ending it with a dangling dash.
    reasons = _failure_reasons(verdict) or ["no specific reason reported"]
    print(f"PUBGUARD: FAIL — {', '.join(reasons)}", file=sys.stderr)

    if strict:
        sys.exit(1)

    print("PUBGUARD: Running in non-strict mode, continuing...", file=sys.stderr)
    sys.exit(0)


# Run the gate only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()