jimnoneill committed on
Commit
3e68bea
Β·
verified Β·
1 Parent(s): 7b17f5e

Upload src/pubguard/errors.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/pubguard/errors.py +274 -0
src/pubguard/errors.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PubVerse Error Code System
3
+ ==========================
4
+
5
+ Structured error codes for the entire PubVerse pipeline.
6
+ PubGuard codes (PV-0XXX) encode classifier predictions directly
7
+ into the code digits.
8
+
9
+ Error code format: PV-SXNN
10
+ S = Step number (0-8)
11
+ X = Sub-category
12
+ NN = Detail
13
+
14
+ PubGuard composite encoding (Step 0):
15
+ PV-0 [doc_type] [ai_detect] [toxicity]
16
+ 0=paper 0=human 0=clean
17
+ 1=poster 1=ai 1=toxic
18
+ 2=abstract
19
+ 3=junk
20
+ """
21
+
22
+ from dataclasses import dataclass
23
+ from typing import Dict, Any, Optional
24
+
25
# ── PubGuard (Step 0) error messages ─────────────────────────────

# Snarky messages keyed by doc_type classification.
# A value of None elsewhere means "no message needed"; here every
# doc_type has a message (the scientific_paper one is the pass case).
DOC_TYPE_MESSAGES = {
    "scientific_paper": "Welcome to the lab.",
    "poster": (
        "That's a poster, not a paper. We appreciate the aesthetic effort, "
        "but we need Methods, not bullet points on a corkboard."
    ),
    "abstract_only": (
        "We got the trailer but not the movie. "
        "Where's the rest of the paper?"
    ),
    "junk": (
        "That's not a paper, that's a cry for help. Pool party invitations, "
        "invoices, and fantasy football drafts do not constitute peer-reviewed research."
    ),
}

# Messages keyed by the AI-detection label; None means nothing to report.
AI_DETECT_MESSAGES = {
    "human": None,  # No message needed
    "ai_generated": (
        "Our classifier thinks a robot wrote this. "
        "The Turing test starts at the Introduction."
    ),
}

# Messages keyed by the toxicity label; None means nothing to report.
TOXICITY_MESSAGES = {
    "clean": None,
    "toxic": (
        "Content flagged as potentially toxic. "
        "Science should be provocative, not offensive."
    ),
}

# Special composite messages for particularly entertaining combos.
# Keys are (doc_type_idx, ai_detect_idx, toxicity_idx) tuples using the
# *_INDEX mappings defined just below.
COMBO_MESSAGES = {
    (3, 1, 0): "AI-generated junk. Congratulations, you've automated mediocrity.",
    (3, 0, 1): "Toxic junk. This is somehow worse than a pool party flyer.",
    (3, 1, 1): "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
    (1, 1, 0): "An AI-generated poster. The future is here and it's making conference posters.",
    (2, 1, 0): "An AI-generated abstract with no paper attached. Peak efficiency.",
}

# Class label → index mapping (matches config.py label order).
# These indices become the literal digits of the PV-0XXX error code.
DOC_TYPE_INDEX = {"scientific_paper": 0, "poster": 1, "abstract_only": 2, "junk": 3}
AI_DETECT_INDEX = {"human": 0, "ai_generated": 1}
TOXICITY_INDEX = {"clean": 0, "toxic": 1}
73
+
74
+
75
@dataclass
class PubVerseError:
    """Structured pipeline error code.

    Carries the machine-readable code (e.g. "PV-0300"), a symbolic name
    (e.g. "JUNK_DETECTED"), a human-readable (snarky) message, the
    pipeline step that raised it, and whether the pipeline should halt.
    """
    code: str                                 # e.g. "PV-0300"
    name: str                                 # e.g. "JUNK_DETECTED"
    message: str                              # Human-readable (snarky) description
    step: int                                 # Pipeline step number
    fatal: bool                               # Whether this should halt the pipeline
    details: Optional[Dict[str, Any]] = None  # Optional scores, labels, etc.

    def __str__(self) -> str:
        # Pipe-delimited single-line rendering: CODE | NAME | MESSAGE.
        return " | ".join((self.code, self.name, self.message))

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; `details` is included only when set."""
        payload: Dict[str, Any] = {
            "code": self.code,
            "name": self.name,
            "message": self.message,
            "step": self.step,
            "fatal": self.fatal,
        }
        if self.details:
            payload["details"] = self.details
        return payload
99
+
100
+
101
def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard error code from a screening verdict.

    The code encodes the classifier predictions:
        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]

    Returns PV-0000 (ALL_CLEAR) if the paper passes.
    """
    doc_label = verdict["doc_type"]["label"]
    gen_label = verdict["ai_generated"]["label"]
    tox_label = verdict["toxicity"]["label"]

    # Unknown labels encode as digit 9 so the code stays well-formed.
    doc_i = DOC_TYPE_INDEX.get(doc_label, 9)
    gen_i = AI_DETECT_INDEX.get(gen_label, 9)
    tox_i = TOXICITY_INDEX.get(tox_label, 9)

    code = f"PV-0{doc_i}{gen_i}{tox_i}"

    # Symbolic name: ALL_CLEAR on pass, otherwise the offending
    # classifications joined with _AND_.
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        flagged = [
            label
            for idx, label in (
                (doc_i, doc_label.upper()),
                (gen_i, "AI_GENERATED"),
                (tox_i, "TOXIC"),
            )
            if idx > 0
        ]
        name = "_AND_".join(flagged) if flagged else "REJECTED"

    # Message: a special combo message wins; otherwise the first
    # offending axis (doc_type, then AI, then toxicity) supplies it.
    message = COMBO_MESSAGES.get((doc_i, gen_i, tox_i))
    if message is None:
        if doc_i > 0:
            message = DOC_TYPE_MESSAGES.get(doc_label, "Unknown document type.")
        elif gen_i > 0:
            message = AI_DETECT_MESSAGES.get(gen_label, "AI content detected.")
        elif tox_i > 0:
            message = TOXICITY_MESSAGES.get(tox_label, "Toxic content detected.")
        else:
            message = "Welcome to the lab."

    # Append the classifier scores for every offending axis.
    score_bits = []
    if doc_i > 0:
        score_bits.append(f"doc_type={doc_label}:{verdict['doc_type']['score']:.3f}")
    if gen_i > 0:
        score_bits.append(f"ai={verdict['ai_generated']['score']:.3f}")
    if tox_i > 0:
        score_bits.append(f"toxicity={verdict['toxicity']['score']:.3f}")
    if score_bits:
        message += f" ({', '.join(score_bits)})"

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        # Fatal = doc_type is not scientific_paper (hard gate); AI or
        # toxicity flags alone do not halt the pipeline.
        fatal=doc_i > 0,
        details={
            "doc_type": verdict["doc_type"],
            "ai_generated": verdict["ai_generated"],
            "toxicity": verdict["toxicity"],
        },
    )
175
+
176
+
177
# ── Special PubGuard errors ──────────────────────────────────────

def empty_input_error() -> PubVerseError:
    """Fatal error for a submission that contained no input at all."""
    msg = (
        "You sent us nothing. Literally nothing. "
        "The void does not require peer review."
    )
    return PubVerseError(code="PV-0900", name="EMPTY_INPUT",
                         message=msg, step=0, fatal=True)
190
+
191
+
192
def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """Fatal error for a PDF that could not be parsed.

    Args:
        filename: Optional name of the offending file; when non-empty it
            is included in the message in parentheses.
    """
    # Bug fix: the original checked `filename` for truthiness but emitted a
    # hard-coded "((unknown))" literal instead of interpolating the value.
    where = f" ({filename})" if filename else ""
    return PubVerseError(
        code="PV-0901",
        name="UNREADABLE_PDF",
        message=(
            f"We can't read this PDF{where}. "
            "If your PDF parser can't parse it, maybe it's not a PDF."
        ),
        step=0,
        fatal=True,
    )
203
+
204
+
205
def models_missing_error() -> PubVerseError:
    """Non-fatal error: the PubGuard classifier models are absent."""
    msg = (
        "PubGuard models not found. "
        "Run: cd pub_check && python scripts/train_pubguard.py"
    )
    # Non-fatal: the pipeline can continue without PubGuard screening.
    return PubVerseError(code="PV-0902", name="MODELS_MISSING",
                         message=msg, step=0, fatal=False)
216
+
217
+
218
def gate_bypassed() -> PubVerseError:
    """Advisory (non-fatal) error recorded when screening is disabled."""
    return PubVerseError(
        code="PV-0999",
        name="GATE_BYPASSED",
        message=("PubGuard screening bypassed (PUBGUARD_STRICT=0). "
                 "Proceeding on faith. Good luck."),
        step=0,
        fatal=False,
    )
226
+
227
+
228
# ── Pipeline step errors (Steps 1-8) ────────────────────────────

def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Create a pipeline error for steps 1-8.

    The code is assembled as PV-<step><sub><detail:02d>, e.g.
    pipeline_error(1, 1, 0, ...) yields code "PV-1100".
    """
    return PubVerseError(
        code=f"PV-{step}{sub}{detail:02d}",
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )
235
+
236
+
237
# Pre-built pipeline errors for bash scripts to reference by name.
# Schema: code -> (NAME, message, fatal). The third element mirrors
# PubVerseError.fatal: True halts the pipeline, False lets it continue.
PIPELINE_ERRORS = {
    # Step 1 — Feature Extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2 — PubVerse Analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3 — Artifact Verification
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4 — Graph Construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5 — 42DeepThought Scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6 — Cluster Analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7 — Enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8 — Visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}
264
+
265
+
266
def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single error line ("CODE | NAME | MESSAGE") for stdout.

    Missing fields are filled from PIPELINE_ERRORS when the code is known,
    falling back to generic placeholders otherwise. Unlike the previous
    version, a caller-supplied name or message is never overwritten when
    only the other one is missing.
    """
    if name is None or message is None:
        default_name, default_message = "UNKNOWN", "An error occurred."
        if code in PIPELINE_ERRORS:
            default_name, default_message, _ = PIPELINE_ERRORS[code]
        # Fill only the fields the caller did not supply.
        if name is None:
            name = default_name
        if message is None:
            message = default_message
    return f"{code} | {name} | {message}"