pub-guard / src /pubguard /__init__.py
jimnoneill's picture
Updated with real poster data support + poster pass-through gate
2cab1ae verified
"""
PubGuard — Scientific Publication Gatekeeper
=============================================
Multi-head document classifier for the PubVerse pipeline.
Determines whether extracted PDF text represents a genuine
scientific publication vs. junk, and flags AI-generated or
offensive content.
Classification heads:
1. doc_type – scientific_paper | poster | abstract_only | junk
2. ai_detect – human | ai_generated
3. toxicity – clean | toxic
Architecture mirrors openalex-topic-classifier:
model2vec (StaticModel) → L2-normalised embeddings → per-head
linear classifiers (sklearn / small torch heads) stored as
numpy weight matrices for zero-dependency inference.
Usage:
    from pubguard import PubGuard

    guard = PubGuard()
    guard.initialize()
    verdict = guard.screen(text)
    # verdict = {
    #     'doc_type': {'label': 'scientific_paper', 'score': 0.94},
    #     'ai_generated': {'label': 'human', 'score': 0.87},
    #     'toxicity': {'label': 'clean', 'score': 0.99},
    #     'pass': True
    # }
"""
from .classifier import PubGuard
from .config import PubGuardConfig
from .errors import (
PubVerseError,
build_pubguard_error,
empty_input_error,
unreadable_pdf_error,
models_missing_error,
gate_bypassed,
format_error_line,
PIPELINE_ERRORS,
)
# Package version string.
__version__ = "0.1.0"

# Public API of the package: the names exported by ``from pubguard import *``.
# Every entry corresponds to a name imported at the top of this module.
__all__ = [
    # from .classifier
    "PubGuard",
    # from .config
    "PubGuardConfig",
    # from .errors
    "PubVerseError",
    "build_pubguard_error",
    "empty_input_error",
    "unreadable_pdf_error",
    "models_missing_error",
    "gate_bypassed",
    "format_error_line",
    "PIPELINE_ERRORS",
]