jimnoneill committed on
Commit
d8edf55
·
verified ·
1 Parent(s): ec0dccd

Remove errors.py (internal only)

Browse files
Files changed (1) hide show
  1. src/pubguard/errors.py +0 -274
src/pubguard/errors.py DELETED
@@ -1,274 +0,0 @@
1
- """
2
- PubVerse Error Code System
3
- ==========================
4
-
5
- Structured error codes for the entire PubVerse pipeline.
6
- PubGuard codes (PV-0XXX) encode classifier predictions directly
7
- into the code digits.
8
-
9
- Error code format: PV-SXNN
10
- S = Step number (0-8)
11
- X = Sub-category
12
- NN = Detail
13
-
14
- PubGuard composite encoding (Step 0):
15
- PV-0 [doc_type] [ai_detect] [toxicity]
16
- 0=paper 0=human 0=clean
17
- 1=poster 1=ai 1=toxic
18
- 2=abstract
19
- 3=junk
20
- """
21
-
22
- from dataclasses import dataclass
23
- from typing import Dict, Any, Optional
24
-
25
- # ── PubGuard (Step 0) error messages ─────────────────────────────
26
-
# Human-readable (deliberately snarky) text for each doc_type label.
DOC_TYPE_MESSAGES = dict(
    scientific_paper="Welcome to the lab.",
    poster=(
        "That's a poster, not a paper. "
        "We appreciate the aesthetic effort, "
        "but we need Methods, not bullet points on a corkboard."
    ),
    abstract_only="We got the trailer but not the movie. Where's the rest of the paper?",
    junk=(
        "That's not a paper, that's a cry for help. "
        "Pool party invitations, invoices, and fantasy football drafts "
        "do not constitute peer-reviewed research."
    ),
)

# Text for each ai_detect label; None means there is nothing to report.
AI_DETECT_MESSAGES = dict(
    human=None,
    ai_generated=(
        "Our classifier thinks a robot wrote this. The Turing test starts at the Introduction."
    ),
)

# Text for each toxicity label; None means there is nothing to report.
TOXICITY_MESSAGES = dict(
    clean=None,
    toxic="Content flagged as potentially toxic. Science should be provocative, not offensive.",
)

# Dedicated text for selected (doc_type_idx, ai_detect_idx, toxicity_idx)
# triples; build_pubguard_error looks these up before the per-label tables.
COMBO_MESSAGES = dict(
    [
        ((3, 1, 0), "AI-generated junk. Congratulations, you've automated mediocrity."),
        ((3, 0, 1), "Toxic junk. This is somehow worse than a pool party flyer."),
        (
            (3, 1, 1),
            "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
        ),
        (
            (1, 1, 0),
            "An AI-generated poster. The future is here and it's making conference posters.",
        ),
        ((2, 1, 0), "An AI-generated abstract with no paper attached. Peak efficiency."),
    ]
)
68
-
# Class label -> digit used in the PV-0XXX code. The position of each label in
# the tuple is its digit (stated to match the label order in config.py).
DOC_TYPE_INDEX = {
    label: digit
    for digit, label in enumerate(("scientific_paper", "poster", "abstract_only", "junk"))
}
AI_DETECT_INDEX = {label: digit for digit, label in enumerate(("human", "ai_generated"))}
TOXICITY_INDEX = {label: digit for digit, label in enumerate(("clean", "toxic"))}
73
-
74
-
@dataclass
class PubVerseError:
    """One structured error record emitted by the pipeline."""

    # Error code string, e.g. "PV-0300".
    code: str
    # Short symbolic name, e.g. "JUNK_DETECTED".
    name: str
    # Human-readable (snarky) description.
    message: str
    # Pipeline step number the error belongs to.
    step: int
    # Whether this error should halt the pipeline.
    fatal: bool
    # Optional extra payload (scores, labels, ...).
    details: Optional[Dict[str, Any]] = None

    def __str__(self) -> str:
        """Render the error as ``code | name | message``."""
        return "{} | {} | {}".format(self.code, self.name, self.message)

    def to_dict(self) -> Dict[str, Any]:
        """Return the error as a plain dict.

        The ``details`` key is included only when ``details`` is truthy,
        so ``None`` and an empty dict are both left out.
        """
        payload: Dict[str, Any] = {
            attr: getattr(self, attr)
            for attr in ("code", "name", "message", "step", "fatal")
        }
        if not self.details:
            return payload
        payload["details"] = self.details
        return payload
99
-
100
-
def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard (Step 0) error code from a screening verdict.

    The code encodes the classifier predictions as
    ``PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]``; a label that is not
    in the corresponding index table is encoded as ``9``.

    Args:
        verdict: Mapping with a ``"pass"`` entry and ``"doc_type"``,
            ``"ai_generated"`` and ``"toxicity"`` entries. Each of the three
            classifier entries must provide ``"label"``; ``"score"`` is read
            (and formatted with ``.3f``) only for heads whose index is
            non-zero.

    Returns:
        A ``PubVerseError`` with ``step=0``. ``fatal`` is True whenever the
        doc_type index is non-zero. The name is ``ALL_CLEAR`` when
        ``verdict["pass"]`` is truthy, so a clean, passing paper yields
        ``PV-0000`` / ``ALL_CLEAR``.

    Raises:
        KeyError: If a required key is missing from ``verdict``.
    """
    dt_label = verdict["doc_type"]["label"]
    ai_label = verdict["ai_generated"]["label"]
    tx_label = verdict["toxicity"]["label"]

    # Unknown labels fall back to 9 so they are still flagged (index > 0).
    dt_idx = DOC_TYPE_INDEX.get(dt_label, 9)
    ai_idx = AI_DETECT_INDEX.get(ai_label, 9)
    tx_idx = TOXICITY_INDEX.get(tx_label, 9)

    code = f"PV-0{dt_idx}{ai_idx}{tx_idx}"

    # Build name
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        parts = []
        if dt_idx > 0:
            # str() keeps an unexpected non-string label from raising here.
            parts.append(str(dt_label).upper())
        if ai_idx > 0:
            parts.append("AI_GENERATED")
        if tx_idx > 0:
            parts.append("TOXIC")
        name = "_AND_".join(parts) if parts else "REJECTED"

    # Build message: a dedicated combo message wins; otherwise every flagged
    # head contributes its own message, so the text covers the same heads as
    # the name and the score suffix (previously only the first flagged head
    # was described).
    combo_key = (dt_idx, ai_idx, tx_idx)
    if combo_key in COMBO_MESSAGES:
        message = COMBO_MESSAGES[combo_key]
    else:
        flagged = []
        if dt_idx > 0:
            flagged.append(DOC_TYPE_MESSAGES.get(dt_label, "Unknown document type."))
        if ai_idx > 0:
            flagged.append(AI_DETECT_MESSAGES.get(ai_label, "AI content detected."))
        if tx_idx > 0:
            flagged.append(TOXICITY_MESSAGES.get(tx_label, "Toxic content detected."))
        # Skip None placeholders; with nothing flagged, greet the paper.
        message = " ".join(text for text in flagged if text) or "Welcome to the lab."

    # Add scores to message
    score_parts = []
    if dt_idx > 0:
        score_parts.append(f"doc_type={dt_label}:{verdict['doc_type']['score']:.3f}")
    if ai_idx > 0:
        score_parts.append(f"ai={verdict['ai_generated']['score']:.3f}")
    if tx_idx > 0:
        score_parts.append(f"toxicity={verdict['toxicity']['score']:.3f}")

    if score_parts:
        message += f" ({', '.join(score_parts)})"

    # Fatal = doc_type is not scientific_paper (hard gate)
    fatal = dt_idx > 0

    details = {
        "doc_type": verdict["doc_type"],
        "ai_generated": verdict["ai_generated"],
        "toxicity": verdict["toxicity"],
    }

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        fatal=fatal,
        details=details,
    )
175
-
176
-
177
- # ── Special PubGuard errors ──────────────────────────────────────
178
-
def empty_input_error() -> PubVerseError:
    """Return the fatal Step 0 error PV-0900 (EMPTY_INPUT)."""
    text = "You sent us nothing. Literally nothing. The void does not require peer review."
    return PubVerseError(code="PV-0900", name="EMPTY_INPUT", message=text, step=0, fatal=True)
190
-
191
-
def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """Return the fatal Step 0 error PV-0901 (UNREADABLE_PDF).

    When ``filename`` is truthy it is quoted in parentheses inside the
    message; otherwise the message carries no file name.
    """
    file_note = f" ({filename})" if filename else ""
    text = (
        f"We can't read this PDF{file_note}. "
        "If your PDF parser can't parse it, maybe it's not a PDF."
    )
    return PubVerseError(code="PV-0901", name="UNREADABLE_PDF", message=text, step=0, fatal=True)
203
-
204
-
def models_missing_error() -> PubVerseError:
    """Return the non-fatal Step 0 error PV-0902 (MODELS_MISSING)."""
    text = "PubGuard models not found. Run: cd pub_check && python scripts/train_pubguard.py"
    # Non-fatal: the pipeline can continue without PubGuard.
    return PubVerseError(code="PV-0902", name="MODELS_MISSING", message=text, step=0, fatal=False)
216
-
217
-
def gate_bypassed() -> PubVerseError:
    """Return the non-fatal Step 0 notice PV-0999 (GATE_BYPASSED)."""
    text = "PubGuard screening bypassed (PUBGUARD_STRICT=0). Proceeding on faith. Good luck."
    return PubVerseError(code="PV-0999", name="GATE_BYPASSED", message=text, step=0, fatal=False)
226
-
227
-
228
- # ── Pipeline step errors (Steps 1-8) ────────────────────────────
229
-
def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Build a ``PubVerseError`` for pipeline steps 1-8.

    The code is ``PV-`` followed by ``step``, ``sub`` and ``detail``, with
    ``detail`` zero-padded to two digits.
    """
    return PubVerseError(
        code="PV-{}{}{:02d}".format(step, sub, detail),
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )
235
-
236
-
# Pre-built pipeline errors for bash scripts to reference by name.
# Maps error code -> (name, message, flag); the boolean presumably mirrors
# PubVerseError.fatal (confirm against the callers).
PIPELINE_ERRORS = {
    # Step 1 — Feature Extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2 — PubVerse Analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3 — Artifact Verification
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4 — Graph Construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5 — 42DeepThought Scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6 — Cluster Analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7 — Enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8 — Visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}
264
-
265
-
def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single error line for stdout output.

    Args:
        code: Error code, e.g. ``"PV-1100"``.
        name: Symbolic error name. When missing it is taken from
            ``PIPELINE_ERRORS[code]`` if the code is registered there,
            otherwise ``"UNKNOWN"`` is used.
        message: Error description. When missing it is taken from
            ``PIPELINE_ERRORS[code]`` if the code is registered there,
            otherwise ``"An error occurred."`` is used.

    Returns:
        The string ``"<code> | <name> | <message>"``.
    """
    if name is None or message is None:
        fallback_name, fallback_message = "UNKNOWN", "An error occurred."
        if code in PIPELINE_ERRORS:
            fallback_name, fallback_message, _ = PIPELINE_ERRORS[code]
        # Fill in only what the caller left out; a supplied name or message
        # is no longer overwritten by the table entry.
        name = name or fallback_name
        message = message or fallback_message
    return f"{code} | {name} | {message}"