jimnoneill committed on
Commit
0b39aef
·
verified ·
1 Parent(s): 23c2fec

Upload src/pubguard/config.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/pubguard/config.py +135 -0
src/pubguard/config.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration for PubGuard classifier.
3
+
4
+ Mirrors openalex_classifier.config with multi-head additions.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional
10
+ import os
11
+
12
+
13
def _find_models_dir() -> Path:
    """Locate the PubGuard models directory.

    Candidates are tried in order and the first match wins:

    1. ``$PUBGUARD_MODELS_DIR`` -- accepted whenever it names an existing
       directory (a leading ``~`` is expanded); no marker file is required.
    2. ``models/`` next to this module (package data).
    3. ``pubguard_models/`` under the current working directory.
    4. ``models/`` two directories above this module's directory
       (repo dev path).
    5. ``~/.pubguard/models`` (default install location).

    Candidates 2-5 only match when they contain ``head_doc_type.npz``,
    which distinguishes PubGuard models from other model directories
    (e.g. OpenAlex) that may exist nearby.

    Returns:
        The first matching directory. If nothing matches,
        ``~/.pubguard/models`` is created (parents included) and returned
        even though it is empty, so that training can populate it.
    """
    marker = "head_doc_type.npz"

    if env_dir := os.environ.get("PUBGUARD_MODELS_DIR"):
        override = Path(env_dir).expanduser()
        # is_dir() rather than exists(): a regular file is not a usable
        # models directory.  An unusable override is ignored and the
        # normal search below continues.
        if override.is_dir():
            return override

    module_dir = Path(__file__).parent

    # Each candidate is built only when it is reached, so Path.cwd() and
    # Path.home() are never called if an earlier location already matched.

    # Package data
    pkg = module_dir / "models"
    if (pkg / marker).exists():
        return pkg

    # CWD
    cwd = Path.cwd() / "pubguard_models"
    if (cwd / marker).exists():
        return cwd

    # Repo dev path (pub_check/models)
    repo = module_dir.parent.parent / "models"
    if (repo / marker).exists():
        return repo

    # User home (default install location)
    home = Path.home() / ".pubguard" / "models"
    if (home / marker).exists():
        return home

    # Fallback — use home dir even if empty (training will populate it)
    home.mkdir(parents=True, exist_ok=True)
    return home
49
+
50
+
51
+ # ── Label schemas ────────────────────────────────────────────────
52
+
53
# Class names for each classification head.
# NOTE(review): list order presumably matches the class indices of the
# trained heads -- confirm before reordering or inserting entries.

DOC_TYPE_LABELS: List[str] = [
    # Full research article / journal paper
    "scientific_paper",
    # Conference poster (often single-page, visual)
    "poster",
    # Standalone abstract without full paper body
    "abstract_only",
    # Flyers, advertisements, non-scholarly PDFs
    "junk",
]

AI_DETECT_LABELS: List[str] = ["human", "ai_generated"]

TOXICITY_LABELS: List[str] = ["clean", "toxic"]
69
+
70
+
71
@dataclass
class PubGuardConfig:
    """Runtime configuration for PubGuard.

    Groups the embedding-backbone settings, per-head probability
    thresholds, pipeline gate switches, batching limits and the models
    directory.  After construction ``models_dir`` is always a ``Path``:
    when left as ``None`` it is resolved via ``_find_models_dir()``;
    otherwise the given value is wrapped in ``Path`` with a leading ``~``
    expanded.
    """

    # ── Embedding backbone ──────────────────────────────────────
    # Re-use the same distilled model you already cache for OpenAlex
    # to avoid downloading a second 50 MB blob.  Any model2vec-
    # compatible StaticModel works here.
    model_name: str = "minishlab/potion-base-32M"
    embedding_dim: int = 512  # potion-base-32M output dim

    # ── Per-head thresholds ─────────────────────────────────────
    # These are posterior-probability thresholds from the softmax
    # head; anything below is "uncertain" and falls back to the
    # majority class.  Calibrate on held-out data.
    doc_type_threshold: float = 0.50
    ai_detect_threshold: float = 0.55
    toxicity_threshold: float = 0.50

    # ── Pipeline gate logic ─────────────────────────────────────
    # The overall `.screen()` returns pass=True only when the
    # doc_type is 'scientific_paper'.  AI detection and toxicity
    # are reported but only block when explicitly enabled, since
    # their accuracy (~84%) produces too many false positives for
    # hard-gating on real scientific text.
    require_scientific: bool = True
    block_ai_generated: bool = False  # informational by default
    block_toxic: bool = False  # informational by default

    # ── Batch / performance ─────────────────────────────────────
    batch_size: int = 256
    max_text_chars: int = 4000  # Truncate long texts for embedding

    # ── Paths ───────────────────────────────────────────────────
    # None means "auto-discover" (resolved in __post_init__).
    models_dir: Optional[Path] = None

    def __post_init__(self) -> None:
        """Resolve ``models_dir`` to a concrete ``Path``."""
        if self.models_dir is None:
            self.models_dir = _find_models_dir()
        # Callers may pass a plain string; normalise it and expand a
        # leading "~" so the derived paths below do not point at a
        # literal "~" directory.
        self.models_dir = Path(self.models_dir).expanduser()

    # Derived paths
    @property
    def distilled_model_path(self) -> Path:
        """Path of the ``pubguard-embedding`` entry inside ``models_dir``."""
        return self.models_dir / "pubguard-embedding"

    @property
    def doc_type_head_path(self) -> Path:
        """Path of ``head_doc_type.npz`` inside ``models_dir``."""
        return self.models_dir / "head_doc_type.npz"

    @property
    def ai_detect_head_path(self) -> Path:
        """Path of ``head_ai_detect.npz`` inside ``models_dir``."""
        return self.models_dir / "head_ai_detect.npz"

    @property
    def toxicity_head_path(self) -> Path:
        """Path of ``head_toxicity.npz`` inside ``models_dir``."""
        return self.models_dir / "head_toxicity.npz"

    @property
    def label_schemas(self) -> Dict[str, List[str]]:
        """Map each head name to its list of class labels.

        Fresh list copies are returned so that a caller mutating the
        result cannot alter the module-level label constants.
        """
        return {
            "doc_type": list(DOC_TYPE_LABELS),
            "ai_detect": list(AI_DETECT_LABELS),
            "toxicity": list(TOXICITY_LABELS),
        }