Remove errors.py (internal only)
Browse files- src/pubguard/errors.py +0 -274
src/pubguard/errors.py
DELETED
|
@@ -1,274 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
PubVerse Error Code System
|
| 3 |
-
==========================
|
| 4 |
-
|
| 5 |
-
Structured error codes for the entire PubVerse pipeline.
|
| 6 |
-
PubGuard codes (PV-0XXX) encode classifier predictions directly
|
| 7 |
-
into the code digits.
|
| 8 |
-
|
| 9 |
-
Error code format: PV-SXNN
|
| 10 |
-
S = Step number (0-8)
|
| 11 |
-
X = Sub-category
|
| 12 |
-
NN = Detail
|
| 13 |
-
|
| 14 |
-
PubGuard composite encoding (Step 0):
|
| 15 |
-
PV-0 [doc_type] [ai_detect] [toxicity]
|
| 16 |
-
0=paper 0=human 0=clean
|
| 17 |
-
1=poster 1=ai 1=toxic
|
| 18 |
-
2=abstract
|
| 19 |
-
3=junk
|
| 20 |
-
"""
|
| 21 |
-
|
| 22 |
-
from dataclasses import dataclass
|
| 23 |
-
from typing import Dict, Any, Optional
|
| 24 |
-
|
| 25 |
-
# ── PubGuard (Step 0) error messages ─────────────────────────────

# Snarky rejection copy, keyed by the doc_type classifier label.
DOC_TYPE_MESSAGES = {
    "scientific_paper": "Welcome to the lab.",
    "poster": (
        "That's a poster, not a paper. We appreciate the aesthetic effort, "
        "but we need Methods, not bullet points on a corkboard."
    ),
    "abstract_only": (
        "We got the trailer but not the movie. "
        "Where's the rest of the paper?"
    ),
    "junk": (
        "That's not a paper, that's a cry for help. Pool party invitations, "
        "invoices, and fantasy football drafts do not constitute peer-reviewed research."
    ),
}

# AI-detector messages; None means "no message needed" for the clean label.
AI_DETECT_MESSAGES = {
    "human": None,
    "ai_generated": (
        "Our classifier thinks a robot wrote this. "
        "The Turing test starts at the Introduction."
    ),
}

# Toxicity messages; None for the clean label, mirroring AI_DETECT_MESSAGES.
TOXICITY_MESSAGES = {
    "clean": None,
    "toxic": (
        "Content flagged as potentially toxic. "
        "Science should be provocative, not offensive."
    ),
}

# Composite messages for particularly entertaining label combinations,
# keyed by (doc_type_idx, ai_detect_idx, toxicity_idx). These take
# priority over the individual message tables.
COMBO_MESSAGES = {
    (3, 1, 0): "AI-generated junk. Congratulations, you've automated mediocrity.",
    (3, 0, 1): "Toxic junk. This is somehow worse than a pool party flyer.",
    (3, 1, 1): "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
    (1, 1, 0): "An AI-generated poster. The future is here and it's making conference posters.",
    (2, 1, 0): "An AI-generated abstract with no paper attached. Peak efficiency.",
}

# Classifier label -> code-digit mapping (matches config.py label order).
# These digits are embedded directly into the PV-0XXX error code.
DOC_TYPE_INDEX = {"scientific_paper": 0, "poster": 1, "abstract_only": 2, "junk": 3}
AI_DETECT_INDEX = {"human": 0, "ai_generated": 1}
TOXICITY_INDEX = {"clean": 0, "toxic": 1}
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
@dataclass
class PubVerseError:
    """Structured pipeline error code.

    Renders as "CODE | NAME | MESSAGE" via str(); serializes to a plain
    dict via to_dict(), which omits ``details`` when it is empty or None.
    """
    code: str      # e.g. "PV-0300"
    name: str      # e.g. "JUNK_DETECTED"
    message: str   # human-readable (snarky) description
    step: int      # pipeline step number
    fatal: bool    # whether this should halt the pipeline
    details: Optional[Dict[str, Any]] = None  # optional scores, labels, etc.

    def __str__(self) -> str:
        # Canonical single-line form used in stdout/log output.
        return f"{self.code} | {self.name} | {self.message}"

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form; ``details`` is included only when truthy."""
        out: Dict[str, Any] = {
            "code": self.code,
            "name": self.name,
            "message": self.message,
            "step": self.step,
            "fatal": self.fatal,
        }
        if self.details:
            out["details"] = self.details
        return out
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard error code from a screening verdict.

    The code digits encode the classifier predictions directly:
        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]

    Returns PV-0000 (ALL_CLEAR) when the paper passes screening.
    """
    dt_label = verdict["doc_type"]["label"]
    ai_label = verdict["ai_generated"]["label"]
    tx_label = verdict["toxicity"]["label"]

    # Unknown labels collapse to digit 9 so the code stays well-formed.
    dt_idx = DOC_TYPE_INDEX.get(dt_label, 9)
    ai_idx = AI_DETECT_INDEX.get(ai_label, 9)
    tx_idx = TOXICITY_INDEX.get(tx_label, 9)

    code = f"PV-0{dt_idx}{ai_idx}{tx_idx}"

    # Name: ALL_CLEAR on pass; otherwise join every offending category.
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        flagged = []
        if dt_idx > 0:
            flagged.append(dt_label.upper())
        if ai_idx > 0:
            flagged.append("AI_GENERATED")
        if tx_idx > 0:
            flagged.append("TOXIC")
        name = "_AND_".join(flagged) if flagged else "REJECTED"

    # Message priority: combo table first, then doc_type, AI, toxicity.
    # (COMBO_MESSAGES values are all non-None, so .get(None) means "miss".)
    message = COMBO_MESSAGES.get((dt_idx, ai_idx, tx_idx))
    if message is None:
        if dt_idx > 0:
            message = DOC_TYPE_MESSAGES.get(dt_label, "Unknown document type.")
        elif ai_idx > 0:
            message = AI_DETECT_MESSAGES.get(ai_label, "AI content detected.")
        elif tx_idx > 0:
            message = TOXICITY_MESSAGES.get(tx_label, "Toxic content detected.")
        else:
            message = "Welcome to the lab."

    # Append classifier scores for every flagged category.
    score_bits = []
    if dt_idx > 0:
        score_bits.append(f"doc_type={dt_label}:{verdict['doc_type']['score']:.3f}")
    if ai_idx > 0:
        score_bits.append(f"ai={verdict['ai_generated']['score']:.3f}")
    if tx_idx > 0:
        score_bits.append(f"toxicity={verdict['toxicity']['score']:.3f}")
    if score_bits:
        message += f" ({', '.join(score_bits)})"

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        # Only a non-paper doc_type is a hard gate; AI/toxicity alone are not.
        fatal=dt_idx > 0,
        details={
            "doc_type": verdict["doc_type"],
            "ai_generated": verdict["ai_generated"],
            "toxicity": verdict["toxicity"],
        },
    )
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
# ββ Special PubGuard errors ββββββββββββββββββββββββββββββββββββββ
|
| 178 |
-
|
| 179 |
-
def empty_input_error() -> PubVerseError:
    """Return the fatal PV-0900 error for a submission with no content."""
    msg = (
        "You sent us nothing. Literally nothing. "
        "The void does not require peer review."
    )
    return PubVerseError(code="PV-0900", name="EMPTY_INPUT", message=msg, step=0, fatal=True)
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """Return the fatal PV-0901 error for a PDF that could not be parsed.

    Args:
        filename: Optional name of the offending file; when provided it is
            included in the message, otherwise the message omits it.
    """
    # Bug fix: the filename argument was previously never interpolated —
    # the message printed a literal placeholder instead of the file name.
    where = f" ({filename})" if filename else ""
    return PubVerseError(
        code="PV-0901",
        name="UNREADABLE_PDF",
        message=(
            f"We can't read this PDF{where}. "
            "If your PDF parser can't parse it, maybe it's not a PDF."
        ),
        step=0,
        fatal=True,
    )
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
def models_missing_error() -> PubVerseError:
    """Return the non-fatal PV-0902 error for absent PubGuard model files.

    Non-fatal: the pipeline can continue without PubGuard screening.
    """
    msg = (
        "PubGuard models not found. "
        "Run: cd pub_check && python scripts/train_pubguard.py"
    )
    return PubVerseError(code="PV-0902", name="MODELS_MISSING", message=msg, step=0, fatal=False)
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
def gate_bypassed() -> PubVerseError:
    """Return the non-fatal PV-0999 notice that screening was skipped
    (PUBGUARD_STRICT=0)."""
    msg = (
        "PubGuard screening bypassed (PUBGUARD_STRICT=0). "
        "Proceeding on faith. Good luck."
    )
    return PubVerseError(code="PV-0999", name="GATE_BYPASSED", message=msg, step=0, fatal=False)
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
# ββ Pipeline step errors (Steps 1-8) ββββββββββββββββββββββββββββ
|
| 229 |
-
|
| 230 |
-
def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Create a pipeline error for steps 1-8.

    The code is assembled as PV-{step}{sub}{detail:02d}; detail is
    zero-padded to two digits.
    """
    return PubVerseError(
        code=f"PV-{step}{sub}{detail:02d}",
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
# Pre-built pipeline errors for bash scripts to reference by name.
# Each entry maps code -> (NAME, message, fatal).
# Fix: repaired mojibake ("β") in the PV-2100 and PV-6100 messages and in
# the section comments — these were em-dashes before encoding damage.
PIPELINE_ERRORS = {
    # Step 1 — Feature Extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2 — PubVerse Analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3 — Artifact Verification
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4 — Graph Construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5 — 42DeepThought Scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6 — Cluster Analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7 — Enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8 — Visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single "CODE | NAME | MESSAGE" error line for stdout.

    When name or message is omitted, both are looked up from
    PIPELINE_ERRORS if the code is known; otherwise generic placeholders
    fill whichever piece is missing.

    Fix: the parameters defaulted to None but were annotated as plain
    ``str`` — now correctly ``Optional[str]`` (PEP 484 forbids implicit
    Optional). Runtime behavior is unchanged.
    """
    if name is None or message is None:
        if code in PIPELINE_ERRORS:
            name, message, _ = PIPELINE_ERRORS[code]
        else:
            name = name or "UNKNOWN"
            message = message or "An error occurred."
    return f"{code} | {name} | {message}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|