jimnoneill committed on
Commit
e1fe580
·
verified ·
1 Parent(s): 0bf5290

Upload src/pubguard/__init__.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/pubguard/__init__.py +59 -0
src/pubguard/__init__.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PubGuard — Scientific Publication Gatekeeper
3
+ =============================================
4
+
5
+ Multi-head document classifier for the PubVerse pipeline.
6
+ Determines whether extracted PDF text represents a genuine
7
+ scientific publication or junk, and flags AI-generated or
8
+ offensive content.
9
+
10
+ Classification heads:
11
+ 1. doc_type – scientific_paper | poster | abstract_only | junk
12
+ 2. ai_detect – human | ai_generated (reported under the 'ai_generated' key)
13
+ 3. toxicity – clean | toxic
14
+
15
+ Architecture mirrors openalex-topic-classifier:
16
+ model2vec (StaticModel) → L2-normalised embeddings → per-head
17
+ linear classifiers (sklearn / small torch heads) stored as
18
+ numpy weight matrices for zero-dependency inference.
19
+
20
+ Usage:
21
+ from pubguard import PubGuard
22
+
23
+ guard = PubGuard()
24
+ guard.initialize()
25
+ verdict = guard.screen(text)
26
+ # verdict = {
27
+ # 'doc_type': {'label': 'scientific_paper', 'score': 0.94},
28
+ # 'ai_generated': {'label': 'human', 'score': 0.87},
29
+ # 'toxicity': {'label': 'clean', 'score': 0.99},
30
+ # 'pass': True
31
+ # }
32
+ """
33
+
34
+ from .classifier import PubGuard
35
+ from .config import PubGuardConfig
36
+ from .errors import (
37
+ PubVerseError,
38
+ build_pubguard_error,
39
+ empty_input_error,
40
+ unreadable_pdf_error,
41
+ models_missing_error,
42
+ gate_bypassed,
43
+ format_error_line,
44
+ PIPELINE_ERRORS,
45
+ )
46
+
47
+ __version__ = "0.1.0"
48
+ __all__ = [
49
+ "PubGuard",
50
+ "PubGuardConfig",
51
+ "PubVerseError",
52
+ "build_pubguard_error",
53
+ "empty_input_error",
54
+ "unreadable_pdf_error",
55
+ "models_missing_error",
56
+ "gate_bypassed",
57
+ "format_error_line",
58
+ "PIPELINE_ERRORS",
59
+ ]