jimnoneill's picture
Upload src/pubguard/errors.py with huggingface_hub
3e68bea verified
raw
history blame
10.1 kB
"""
PubVerse Error Code System
==========================
Structured error codes for the entire PubVerse pipeline.
PubGuard codes (PV-0XXX) encode classifier predictions directly
into the code digits.
Error code format: PV-SXNN
S = Step number (0-8)
X = Sub-category
NN = Detail
PubGuard composite encoding (Step 0):
PV-0 [doc_type] [ai_detect] [toxicity]
0=paper 0=human 0=clean
1=poster 1=ai 1=toxic
2=abstract
3=junk
"""
from dataclasses import dataclass
from typing import Dict, Any, Optional
# ── PubGuard (Step 0) error messages ─────────────────────────────
# Snarky messages keyed by doc_type classification
DOC_TYPE_MESSAGES = {
    "scientific_paper": "Welcome to the lab.",
    "poster": (
        "That's a poster, not a paper. We appreciate the aesthetic effort, "
        "but we need Methods, not bullet points on a corkboard."
    ),
    "abstract_only": (
        "We got the trailer but not the movie. "
        "Where's the rest of the paper?"
    ),
    "junk": (
        "That's not a paper, that's a cry for help. Pool party invitations, "
        "invoices, and fantasy football drafts do not constitute peer-reviewed research."
    ),
}
# Messages keyed by AI-detection label; None means no message is emitted.
AI_DETECT_MESSAGES = {
    "human": None,  # No message needed
    "ai_generated": (
        "Our classifier thinks a robot wrote this. "
        "The Turing test starts at the Introduction."
    ),
}
# Messages keyed by toxicity label; None means no message is emitted.
TOXICITY_MESSAGES = {
    "clean": None,
    "toxic": (
        "Content flagged as potentially toxic. "
        "Science should be provocative, not offensive."
    ),
}
# Special composite messages for particularly entertaining combos.
# Keys are (doc_type_idx, ai_detect_idx, toxicity_idx) — the same digits
# that appear in the PV-0XXX code (see module docstring).
COMBO_MESSAGES = {
    (3, 1, 0): "AI-generated junk. Congratulations, you've automated mediocrity.",
    (3, 0, 1): "Toxic junk. This is somehow worse than a pool party flyer.",
    (3, 1, 1): "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
    (1, 1, 0): "An AI-generated poster. The future is here and it's making conference posters.",
    (2, 1, 0): "An AI-generated abstract with no paper attached. Peak efficiency.",
}
# Class label → index mapping (matches config.py label order).
# These indices become the code digits in PV-0[doc][ai][tox].
DOC_TYPE_INDEX = {"scientific_paper": 0, "poster": 1, "abstract_only": 2, "junk": 3}
AI_DETECT_INDEX = {"human": 0, "ai_generated": 1}
TOXICITY_INDEX = {"clean": 0, "toxic": 1}
@dataclass
class PubVerseError:
    """Structured pipeline error code.

    Carries a PV-SXNN code plus a symbolic name, a human-readable (snarky)
    message, the originating pipeline step, and whether the pipeline should
    halt. Optional classifier scores/labels ride along in ``details``.
    """
    code: str                                # e.g. "PV-0300"
    name: str                                # e.g. "JUNK_DETECTED"
    message: str                             # human-readable (snarky) description
    step: int                                # pipeline step number
    fatal: bool                              # whether this should halt the pipeline
    details: Optional[Dict[str, Any]] = None  # optional scores, labels, etc.

    def __str__(self) -> str:
        # One-line "CODE | NAME | message" form used for stdout/log output.
        return " | ".join((self.code, self.name, self.message))

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; 'details' is included only when truthy."""
        payload: Dict[str, Any] = {
            "code": self.code,
            "name": self.name,
            "message": self.message,
            "step": self.step,
            "fatal": self.fatal,
        }
        if self.details:
            payload["details"] = self.details
        return payload
def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard error code from a screening verdict.

    The code encodes the classifier predictions directly:
        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]
    Returns PV-0000 (ALL_CLEAR) if the paper passes.

    ``verdict`` must contain "pass" plus "doc_type"/"ai_generated"/"toxicity"
    entries, each a dict with "label" and "score" keys.
    """
    doc_label = verdict["doc_type"]["label"]
    gen_label = verdict["ai_generated"]["label"]
    tox_label = verdict["toxicity"]["label"]

    # Unknown labels map to digit 9 so the code is still well-formed.
    doc_i = DOC_TYPE_INDEX.get(doc_label, 9)
    gen_i = AI_DETECT_INDEX.get(gen_label, 9)
    tox_i = TOXICITY_INDEX.get(tox_label, 9)
    code = f"PV-0{doc_i}{gen_i}{tox_i}"

    # Symbolic name: ALL_CLEAR on a pass, otherwise join the triggered flags.
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        flags = [
            flag
            for hit, flag in (
                (doc_i > 0, doc_label.upper()),
                (gen_i > 0, "AI_GENERATED"),
                (tox_i > 0, "TOXIC"),
            )
            if hit
        ]
        name = "_AND_".join(flags) if flags else "REJECTED"

    # Message: composite combos win, then individual categories in priority
    # order doc_type > ai > toxicity, else the all-clear greeting.
    combo = COMBO_MESSAGES.get((doc_i, gen_i, tox_i))
    if combo is not None:
        message = combo
    elif doc_i > 0:
        message = DOC_TYPE_MESSAGES.get(doc_label, "Unknown document type.")
    elif gen_i > 0:
        message = AI_DETECT_MESSAGES.get(gen_label, "AI content detected.")
    elif tox_i > 0:
        message = TOXICITY_MESSAGES.get(tox_label, "Toxic content detected.")
    else:
        message = "Welcome to the lab."

    # Append classifier scores for every triggered category.
    scores = []
    if doc_i > 0:
        scores.append(f"doc_type={doc_label}:{verdict['doc_type']['score']:.3f}")
    if gen_i > 0:
        scores.append(f"ai={verdict['ai_generated']['score']:.3f}")
    if tox_i > 0:
        scores.append(f"toxicity={verdict['toxicity']['score']:.3f}")
    if scores:
        message = f"{message} ({', '.join(scores)})"

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        # Hard gate: anything other than a scientific paper halts the pipeline.
        fatal=doc_i > 0,
        details={
            "doc_type": verdict["doc_type"],
            "ai_generated": verdict["ai_generated"],
            "toxicity": verdict["toxicity"],
        },
    )
# ── Special PubGuard errors ──────────────────────────────────────
def empty_input_error() -> PubVerseError:
    """PV-0900: the submission contained no content at all (fatal)."""
    msg = (
        "You sent us nothing. Literally nothing. "
        "The void does not require peer review."
    )
    return PubVerseError(code="PV-0900", name="EMPTY_INPUT",
                         message=msg, step=0, fatal=True)
def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """PV-0901: the uploaded PDF could not be parsed (fatal).

    Args:
        filename: Optional name of the offending file; when non-empty it is
            echoed back in the message to identify the file.
    """
    # Bug fix: the filename was never interpolated — the f-string contained a
    # literal "((unknown))" instead of the {filename} placeholder, so the
    # parameter was silently unused. Include it when provided.
    where = f" ({filename})" if filename else ""
    return PubVerseError(
        code="PV-0901",
        name="UNREADABLE_PDF",
        message=(
            f"We can't read this PDF{where}. "
            "If your PDF parser can't parse it, maybe it's not a PDF."
        ),
        step=0,
        fatal=True,
    )
def models_missing_error() -> PubVerseError:
    """PV-0902: trained PubGuard models are absent on disk (non-fatal)."""
    hint = (
        "PubGuard models not found. "
        "Run: cd pub_check && python scripts/train_pubguard.py"
    )
    # Non-fatal: the pipeline can continue without PubGuard screening.
    return PubVerseError(code="PV-0902", name="MODELS_MISSING",
                         message=hint, step=0, fatal=False)
def gate_bypassed() -> PubVerseError:
    """PV-0999: screening was explicitly disabled via PUBGUARD_STRICT=0 (non-fatal)."""
    bypass_msg = "PubGuard screening bypassed (PUBGUARD_STRICT=0). Proceeding on faith. Good luck."
    return PubVerseError(code="PV-0999", name="GATE_BYPASSED",
                         message=bypass_msg, step=0, fatal=False)
# ── Pipeline step errors (Steps 1-8) ────────────────────────────
def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Create a pipeline error for steps 1-8.

    The code is assembled as PV-[step][sub][detail], with detail
    zero-padded to two digits (e.g. step=1, sub=1, detail=0 → "PV-1100").
    """
    return PubVerseError(
        code=f"PV-{step}{sub}{detail:02d}",
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )
# Pre-built pipeline errors for bash scripts to reference by name.
# Schema: code -> (NAME, message, fatal). Fatal entries halt the pipeline.
# Fix: two messages contained mojibake ("β€”") where an em-dash belongs.
PIPELINE_ERRORS = {
    # Step 1 — Feature Extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2 — PubVerse Analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3 — Artifact Verification
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4 — Graph Construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5 — 42DeepThought Scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6 — Cluster Analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7 — Enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8 — Visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}
def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single error line ("CODE | NAME | message") for stdout output.

    When name or message is omitted, both are looked up in PIPELINE_ERRORS
    (the table entry wins for known codes); unknown codes fall back to
    "UNKNOWN" / "An error occurred." placeholders.

    Fix: annotations were `str = None` (implicit Optional, invalid per
    PEP 484); they are now Optional[str] with identical runtime behavior.
    """
    if name is None or message is None:
        if code in PIPELINE_ERRORS:
            name, message, _ = PIPELINE_ERRORS[code]
        else:
            name = name or "UNKNOWN"
            message = message or "An error occurred."
    return f"{code} | {name} | {message}"