|
|
""" |
|
|
Command-line interface for PubGuard. |
|
|
|
|
|
Usage: |
|
|
# Download datasets and train |
|
|
pubguard train --data-dir ./data |
|
|
|
|
|
# Download datasets only |
|
|
pubguard prepare --data-dir ./data |
|
|
|
|
|
# Screen a text file |
|
|
pubguard screen input.txt |
|
|
|
|
|
# Screen extracted PDF text from stdin |
|
|
cat extracted_text.txt | pubguard screen - |
|
|
|
|
|
# Batch screen NDJSON |
|
|
pubguard batch input.ndjson output.ndjson |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import logging |
|
|
import sys |
|
|
import time |
|
|
from pathlib import Path |
|
|
|
|
|
from .classifier import PubGuard |
|
|
from .config import PubGuardConfig |
|
|
|
|
|
|
|
|
def cmd_prepare(args):
    """Handle the ``prepare`` subcommand.

    Forwards ``args.data_dir`` (converted to a ``Path``) and
    ``args.n_per_class`` to ``prepare_all`` from the sibling ``data`` module.

    Args:
        args: Parsed argparse namespace providing ``data_dir`` and
            ``n_per_class``.
    """
    # The import is deferred to call time, so the data module is only
    # pulled in when this subcommand actually runs.
    from .data import prepare_all as build_datasets

    target_dir = Path(args.data_dir)
    build_datasets(target_dir, n_per_class=args.n_per_class)
|
|
|
|
|
|
|
|
def cmd_train(args):
    """Handle the ``train`` subcommand.

    When ``args.download`` is truthy the datasets are prepared first via
    ``prepare_all``. Training then runs through ``train_all`` with a fresh
    ``PubGuardConfig`` whose ``models_dir`` is replaced when
    ``args.models_dir`` is set.

    Args:
        args: Parsed argparse namespace providing ``data_dir``, ``download``,
            ``n_per_class``, ``models_dir`` and ``test_size``.
    """
    # Both imports are deferred to call time.
    from .data import prepare_all
    from .train import train_all

    dataset_root = Path(args.data_dir)
    if args.download:
        prepare_all(dataset_root, n_per_class=args.n_per_class)

    cfg = PubGuardConfig()
    models_override = args.models_dir
    if models_override:
        cfg.models_dir = Path(models_override)

    train_all(dataset_root, config=cfg, test_size=args.test_size)
|
|
|
|
|
|
|
|
def cmd_screen(args):
    """Handle the ``screen`` subcommand: screen one document and print the verdict.

    The document is read from standard input when ``args.input`` is ``"-"``,
    otherwise from the file at that path. The verdict returned by
    ``PubGuard.screen`` is printed as indented JSON when ``args.json`` is
    set, or through ``_print_verdict`` otherwise.

    Args:
        args: Parsed argparse namespace providing ``input``, ``json`` and
            ``models_dir``.

    Raises:
        OSError: If the input file cannot be opened or read.
    """
    # Read the input before loading any model so that a wrong path fails
    # immediately instead of after initialisation.
    if args.input == "-":
        text = sys.stdin.read()
    else:
        # Decode explicitly as UTF-8 instead of the platform default so the
        # same file yields the same text on every OS; bytes that are not
        # valid UTF-8 are replaced rather than raising.
        text = Path(args.input).read_text(encoding="utf-8", errors="replace")

    config = PubGuardConfig()
    if args.models_dir:
        config.models_dir = Path(args.models_dir)

    guard = PubGuard(config=config)
    guard.initialize()

    verdict = guard.screen(text)

    if args.json:
        print(json.dumps(verdict, indent=2))
    else:
        _print_verdict(verdict)
|
|
|
|
|
|
|
|
def _flush_batch(guard, records, texts, fout):
    """Screen one batch and write each annotated record to ``fout`` as NDJSON.

    Each record gets its verdict stored under the ``"pubguard"`` key before
    being serialised on its own line.

    Returns:
        The number of texts in the batch.
    """
    verdicts = guard.screen_batch(texts)
    for record, verdict in zip(records, verdicts):
        record["pubguard"] = verdict
        fout.write(json.dumps(record) + "\n")
    return len(texts)


def cmd_batch(args):
    """Handle the ``batch`` subcommand: screen every record of an NDJSON file.

    Each non-blank input line is parsed as a JSON object. The text to screen
    is the record's ``"text"`` field, falling back to ``"abstract"`` and then
    to the empty string. Records are screened in groups of
    ``config.batch_size`` and written to the output file with the verdict
    added under ``"pubguard"``. A throughput summary is printed at the end.

    Args:
        args: Parsed argparse namespace providing ``input``, ``output`` and
            ``models_dir``.

    Raises:
        OSError: If the input or output file cannot be opened.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    config = PubGuardConfig()
    if args.models_dir:
        config.models_dir = Path(args.models_dir)

    guard = PubGuard(config=config)
    guard.initialize()

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    processed = 0

    # NDJSON is UTF-8; open both files explicitly as UTF-8 instead of relying
    # on the platform default encoding.
    with open(args.input, encoding="utf-8") as fin, \
            open(args.output, "w", encoding="utf-8") as fout:
        batch_texts = []
        batch_records = []

        for line in fin:
            if not line.strip():
                continue
            record = json.loads(line)
            text = record.get("text", "") or record.get("abstract", "") or ""
            batch_texts.append(text)
            batch_records.append(record)

            if len(batch_texts) >= config.batch_size:
                processed += _flush_batch(guard, batch_records, batch_texts, fout)
                batch_texts, batch_records = [], []

        # Screen whatever is left over after the last full batch.
        if batch_texts:
            processed += _flush_batch(guard, batch_records, batch_texts, fout)

    elapsed = time.perf_counter() - start
    rate = processed / elapsed if elapsed > 0 else 0
    print(f"Screened {processed:,} records in {elapsed:.1f}s ({rate:,.0f} rec/s)")
    print(f"Output: {args.output}")
|
|
|
|
|
|
|
|
def _print_verdict(v: dict):
    """Print a human-readable summary of a verdict to standard output.

    Args:
        v: Verdict mapping with a ``"pass"`` entry and ``"doc_type"``,
            ``"ai_generated"`` and ``"toxicity"`` entries, each of which
            carries a ``"label"`` and a ``"score"``.
    """
    if v["pass"]:
        icon, outcome = "✅", "PASS"
    else:
        icon, outcome = "❌", "FAIL"
    print(f"\n{icon} PubGuard Verdict: {outcome}")

    # (heading, verdict key) for each classification head, in print order.
    sections = (
        ("Document type:", "doc_type"),
        ("AI detection:", "ai_generated"),
        ("Toxicity:", "toxicity"),
    )
    for heading, key in sections:
        head = v[key]
        print(f" {heading} {head['label']:20s} (score: {head['score']:.3f})")

    print()
|
|
|
|
|
|
|
|
def build_parser():
    """Build the argparse parser for the ``pubguard`` command line.

    Global options are ``--verbose``/``-v`` and ``--models-dir``; the
    subcommands are ``prepare``, ``train``, ``screen`` and ``batch``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="PubGuard — Scientific Publication Gatekeeper",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true",
        help="Enable verbose logging",
    )
    parser.add_argument(
        "--models-dir", type=str, default=None,
        help="Override models directory",
    )

    subparsers = parser.add_subparsers(dest="command")

    p_prepare = subparsers.add_parser("prepare", help="Download and prepare datasets")
    p_prepare.add_argument("--data-dir", default="./pubguard_data")
    p_prepare.add_argument("--n-per-class", type=int, default=15000)

    p_train = subparsers.add_parser("train", help="Train classification heads")
    p_train.add_argument("--data-dir", default="./pubguard_data")
    p_train.add_argument("--download", action="store_true", default=True,
                         help="Download datasets before training")
    p_train.add_argument("--no-download", action="store_false", dest="download")
    p_train.add_argument("--n-per-class", type=int, default=15000)
    p_train.add_argument("--test-size", type=float, default=0.15)

    p_screen = subparsers.add_parser("screen", help="Screen a single document")
    p_screen.add_argument("input", help="Text file to screen (or - for stdin)")
    p_screen.add_argument("--json", action="store_true", help="JSON output")

    p_batch = subparsers.add_parser("batch", help="Batch screen NDJSON")
    p_batch.add_argument("input", help="Input NDJSON file")
    p_batch.add_argument("output", help="Output NDJSON file")

    # argparse copies every attribute of a subparser's namespace over the
    # parent namespace, so a subparser-level default of None would erase a
    # --models-dir given before the subcommand. SUPPRESS keeps the attribute
    # out of the sub-namespace unless the flag is actually passed there.
    for sub in (p_train, p_screen, p_batch):
        sub.add_argument(
            "--models-dir", default=argparse.SUPPRESS,
            help="Override models directory",
        )

    return parser


def main(argv=None):
    """Entry point of the ``pubguard`` command-line interface.

    Parses the command line, configures logging (DEBUG with ``--verbose``,
    INFO otherwise) and dispatches to the handler of the chosen subcommand.
    When no subcommand is given, the help text is printed.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    if args.command == "prepare":
        cmd_prepare(args)
    elif args.command == "train":
        cmd_train(args)
    elif args.command == "screen":
        cmd_screen(args)
    elif args.command == "batch":
        cmd_batch(args)
    else:
        parser.print_help()
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|