jimnoneill's picture
Upload src/pubguard/errors.py with huggingface_hub
3e68bea verified
raw
history blame
10.1 kB
"""
PubVerse Error Code System
==========================
Structured error codes for the entire PubVerse pipeline.
PubGuard codes (PV-0XXX) encode classifier predictions directly
into the code digits.
Error code format: PV-SXNN
S = Step number (0-8)
X = Sub-category
NN = Detail
PubGuard composite encoding (Step 0):
PV-0 [doc_type] [ai_detect] [toxicity]
0=paper 0=human 0=clean
1=poster 1=ai 1=toxic
2=abstract
3=junk
"""
from dataclasses import dataclass
from typing import Dict, Any, Optional
# ── PubGuard (Step 0) error messages ─────────────────────────────
# Snarky messages keyed by doc_type classification
DOC_TYPE_MESSAGES = {
    "scientific_paper": "Welcome to the lab.",
    "poster": (
        "That's a poster, not a paper. We appreciate the aesthetic effort, "
        "but we need Methods, not bullet points on a corkboard."
    ),
    "abstract_only": (
        "We got the trailer but not the movie. "
        "Where's the rest of the paper?"
    ),
    "junk": (
        "That's not a paper, that's a cry for help. Pool party invitations, "
        "invoices, and fantasy football drafts do not constitute peer-reviewed research."
    ),
}
# Messages keyed by AI-detection label; None means no message is emitted.
AI_DETECT_MESSAGES = {
    "human": None,  # No message needed
    "ai_generated": (
        "Our classifier thinks a robot wrote this. "
        "The Turing test starts at the Introduction."
    ),
}
# Messages keyed by toxicity label; None means no message is emitted.
TOXICITY_MESSAGES = {
    "clean": None,
    "toxic": (
        "Content flagged as potentially toxic. "
        "Science should be provocative, not offensive."
    ),
}
# Special composite messages for particularly entertaining combos.
# Keys are (doc_type_idx, ai_detect_idx, toxicity_idx) — the same digits
# that appear in the PV-0XXX code (see module docstring).
COMBO_MESSAGES = {
    (3, 1, 0): "AI-generated junk. Congratulations, you've automated mediocrity.",
    (3, 0, 1): "Toxic junk. This is somehow worse than a pool party flyer.",
    (3, 1, 1): "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
    (1, 1, 0): "An AI-generated poster. The future is here and it's making conference posters.",
    (2, 1, 0): "An AI-generated abstract with no paper attached. Peak efficiency.",
}
# Class label → index mapping (matches config.py label order).
# These indices become the code digits in PV-0[doc][ai][tox].
DOC_TYPE_INDEX = {"scientific_paper": 0, "poster": 1, "abstract_only": 2, "junk": 3}
AI_DETECT_INDEX = {"human": 0, "ai_generated": 1}
TOXICITY_INDEX = {"clean": 0, "toxic": 1}
@dataclass
class PubVerseError:
    """Structured pipeline error code.

    Carries a PV-SXNN code plus a symbolic name, a human-readable (snarky)
    message, the originating pipeline step, and whether the pipeline should
    halt. Optional classifier scores/labels ride along in ``details``.
    """
    code: str                                # e.g. "PV-0300"
    name: str                                # e.g. "JUNK_DETECTED"
    message: str                             # human-readable (snarky) description
    step: int                                # pipeline step number
    fatal: bool                              # whether this should halt the pipeline
    details: Optional[Dict[str, Any]] = None  # optional scores, labels, etc.

    def __str__(self) -> str:
        # One-line "CODE | NAME | message" form used for stdout/log output.
        return " | ".join((self.code, self.name, self.message))

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; 'details' is included only when truthy."""
        payload: Dict[str, Any] = {
            "code": self.code,
            "name": self.name,
            "message": self.message,
            "step": self.step,
            "fatal": self.fatal,
        }
        if self.details:
            payload["details"] = self.details
        return payload
def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard error code from a screening verdict.

    The code encodes the classifier predictions directly:
        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]
    Returns PV-0000 (ALL_CLEAR) if the paper passes.

    ``verdict`` must contain "pass" plus "doc_type"/"ai_generated"/"toxicity"
    entries, each a dict with "label" and "score" keys.
    """
    doc_label = verdict["doc_type"]["label"]
    gen_label = verdict["ai_generated"]["label"]
    tox_label = verdict["toxicity"]["label"]

    # Unknown labels map to digit 9 so the code is still well-formed.
    doc_i = DOC_TYPE_INDEX.get(doc_label, 9)
    gen_i = AI_DETECT_INDEX.get(gen_label, 9)
    tox_i = TOXICITY_INDEX.get(tox_label, 9)
    code = f"PV-0{doc_i}{gen_i}{tox_i}"

    # Symbolic name: ALL_CLEAR on a pass, otherwise join the triggered flags.
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        flags = [
            flag
            for hit, flag in (
                (doc_i > 0, doc_label.upper()),
                (gen_i > 0, "AI_GENERATED"),
                (tox_i > 0, "TOXIC"),
            )
            if hit
        ]
        name = "_AND_".join(flags) if flags else "REJECTED"

    # Message: composite combos win, then individual categories in priority
    # order doc_type > ai > toxicity, else the all-clear greeting.
    combo = COMBO_MESSAGES.get((doc_i, gen_i, tox_i))
    if combo is not None:
        message = combo
    elif doc_i > 0:
        message = DOC_TYPE_MESSAGES.get(doc_label, "Unknown document type.")
    elif gen_i > 0:
        message = AI_DETECT_MESSAGES.get(gen_label, "AI content detected.")
    elif tox_i > 0:
        message = TOXICITY_MESSAGES.get(tox_label, "Toxic content detected.")
    else:
        message = "Welcome to the lab."

    # Append classifier scores for every triggered category.
    scores = []
    if doc_i > 0:
        scores.append(f"doc_type={doc_label}:{verdict['doc_type']['score']:.3f}")
    if gen_i > 0:
        scores.append(f"ai={verdict['ai_generated']['score']:.3f}")
    if tox_i > 0:
        scores.append(f"toxicity={verdict['toxicity']['score']:.3f}")
    if scores:
        message = f"{message} ({', '.join(scores)})"

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        # Hard gate: anything other than a scientific paper halts the pipeline.
        fatal=doc_i > 0,
        details={
            "doc_type": verdict["doc_type"],
            "ai_generated": verdict["ai_generated"],
            "toxicity": verdict["toxicity"],
        },
    )
# ── Special PubGuard errors ──────────────────────────────────────
def empty_input_error() -> PubVerseError:
    """PV-0900: the submission contained no content at all (fatal)."""
    msg = (
        "You sent us nothing. Literally nothing. "
        "The void does not require peer review."
    )
    return PubVerseError(code="PV-0900", name="EMPTY_INPUT",
                         message=msg, step=0, fatal=True)
def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """PV-0901: the uploaded PDF could not be parsed (fatal).

    Args:
        filename: Optional name of the offending file; when non-empty it is
            echoed back in the message to identify the file.
    """
    # Bug fix: the filename was never interpolated — the f-string contained a
    # literal "((unknown))" instead of the {filename} placeholder, so the
    # parameter was silently unused. Include it when provided.
    where = f" ({filename})" if filename else ""
    return PubVerseError(
        code="PV-0901",
        name="UNREADABLE_PDF",
        message=(
            f"We can't read this PDF{where}. "
            "If your PDF parser can't parse it, maybe it's not a PDF."
        ),
        step=0,
        fatal=True,
    )
def models_missing_error() -> PubVerseError:
    """PV-0902: trained PubGuard models are absent on disk (non-fatal)."""
    hint = (
        "PubGuard models not found. "
        "Run: cd pub_check && python scripts/train_pubguard.py"
    )
    # Non-fatal: the pipeline can continue without PubGuard screening.
    return PubVerseError(code="PV-0902", name="MODELS_MISSING",
                         message=hint, step=0, fatal=False)
def gate_bypassed() -> PubVerseError:
    """PV-0999: screening was explicitly disabled via PUBGUARD_STRICT=0 (non-fatal)."""
    bypass_msg = "PubGuard screening bypassed (PUBGUARD_STRICT=0). Proceeding on faith. Good luck."
    return PubVerseError(code="PV-0999", name="GATE_BYPASSED",
                         message=bypass_msg, step=0, fatal=False)
# ── Pipeline step errors (Steps 1-8) ────────────────────────────
def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Create a pipeline error for steps 1-8.

    The code is assembled as PV-[step][sub][detail], with detail
    zero-padded to two digits (e.g. step=1, sub=1, detail=0 → "PV-1100").
    """
    return PubVerseError(
        code=f"PV-{step}{sub}{detail:02d}",
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )
# Pre-built pipeline errors for bash scripts to reference by name.
# Schema: code -> (NAME, message, fatal). Fatal entries halt the pipeline.
# Fix: two messages contained mojibake ("β€”") where an em-dash belongs.
PIPELINE_ERRORS = {
    # Step 1 — Feature Extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2 — PubVerse Analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3 — Artifact Verification
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4 — Graph Construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5 — 42DeepThought Scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6 — Cluster Analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7 — Enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8 — Visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}
def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single error line ("CODE | NAME | message") for stdout output.

    When name or message is omitted, both are looked up in PIPELINE_ERRORS
    (the table entry wins for known codes); unknown codes fall back to
    "UNKNOWN" / "An error occurred." placeholders.

    Fix: annotations were `str = None` (implicit Optional, invalid per
    PEP 484); they are now Optional[str] with identical runtime behavior.
    """
    if name is None or message is None:
        if code in PIPELINE_ERRORS:
            name, message, _ = PIPELINE_ERRORS[code]
        else:
            name = name or "UNKNOWN"
            message = message or "An error occurred."
    return f"{code} | {name} | {message}"