| | """ |
| | PubVerse Error Code System |
| | ========================== |
| | |
| | Structured error codes for the entire PubVerse pipeline. |
| | PubGuard codes (PV-0XXX) encode classifier predictions directly |
| | into the code digits. |
| | |
| | Error code format: PV-SXNN |
| | S = Step number (0-8) |
| | X = Sub-category |
| | NN = Detail |
| | |
| | PubGuard composite encoding (Step 0): |
| | PV-0 [doc_type] [ai_detect] [toxicity] |
| | 0=paper 0=human 0=clean |
| | 1=poster 1=ai 1=toxic |
| | 2=abstract |
| | 3=junk |
| | """ |
| |
|
| | from dataclasses import dataclass |
| | from typing import Dict, Any, Optional |
| |
|
| | |
| |
|
| | |
# Snarky reviewer feedback keyed by the document-type label PubGuard predicted.
DOC_TYPE_MESSAGES = {
    "scientific_paper": "Welcome to the lab.",
    "poster": (
        "That's a poster, not a paper. We appreciate the aesthetic "
        "effort, but we need Methods, not bullet points on a corkboard."
    ),
    "abstract_only": "We got the trailer but not the movie. Where's the rest of the paper?",
    "junk": (
        "That's not a paper, that's a cry for help. Pool party "
        "invitations, invoices, and fantasy football drafts do not "
        "constitute peer-reviewed research."
    ),
}
| |
|
# Feedback for the AI-authorship classifier. None means no complaint.
AI_DETECT_MESSAGES = {
    "human": None,
    "ai_generated": (
        "Our classifier thinks a robot wrote this. The Turing test "
        "starts at the Introduction."
    ),
}
| |
|
# Feedback for the toxicity classifier. None means no complaint.
TOXICITY_MESSAGES = {
    "clean": None,
    "toxic": (
        "Content flagged as potentially toxic. Science should be "
        "provocative, not offensive."
    ),
}
| |
|
| | |
# Special-cased messages for notable (doc_type_idx, ai_idx, toxicity_idx)
# combinations; these take priority over the single-category messages.
COMBO_MESSAGES = {
    # junk + AI
    (3, 1, 0): "AI-generated junk. Congratulations, you've automated mediocrity.",
    # junk + toxic
    (3, 0, 1): "Toxic junk. This is somehow worse than a pool party flyer.",
    # junk + AI + toxic
    (3, 1, 1): "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
    # poster + AI
    (1, 1, 0): "An AI-generated poster. The future is here and it's making conference posters.",
    # abstract + AI
    (2, 1, 0): "An AI-generated abstract with no paper attached. Peak efficiency.",
}
| |
|
| | |
# Label -> code-digit mappings used to compose the PV-0XXX composite code.
# Index 0 is always the "clean" outcome for its category.
DOC_TYPE_INDEX = {
    label: idx
    for idx, label in enumerate(("scientific_paper", "poster", "abstract_only", "junk"))
}
AI_DETECT_INDEX = {label: idx for idx, label in enumerate(("human", "ai_generated"))}
TOXICITY_INDEX = {label: idx for idx, label in enumerate(("clean", "toxic"))}
| |
|
| |
|
@dataclass
class PubVerseError:
    """Structured pipeline error code.

    Carries the PV-SXNN code plus a human-readable name/message, the
    pipeline step that produced it, and whether it halts the pipeline.
    """
    code: str                               # e.g. "PV-0310"
    name: str                               # e.g. "JUNK_AND_AI_GENERATED"
    message: str                            # user-facing explanation
    step: int                               # pipeline step (0-8)
    fatal: bool                             # True if the pipeline should stop
    details: Optional[Dict[str, Any]] = None  # optional classifier payload

    def __str__(self) -> str:
        # Same "code | name | message" line format_error_line() emits.
        return " | ".join((self.code, self.name, self.message))

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict; 'details' is included only when set."""
        payload: Dict[str, Any] = {
            "code": self.code,
            "name": self.name,
            "message": self.message,
            "step": self.step,
            "fatal": self.fatal,
        }
        if self.details:
            payload["details"] = self.details
        return payload
| |
|
| |
|
def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard error code from a screening verdict.

    The code encodes the classifier predictions:
        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]

    Returns PV-0000 (ALL_CLEAR) if the paper passes.
    """
    dt_label = verdict["doc_type"]["label"]
    ai_label = verdict["ai_generated"]["label"]
    tx_label = verdict["toxicity"]["label"]

    # Unknown labels fall back to digit 9 rather than raising.
    dt_idx = DOC_TYPE_INDEX.get(dt_label, 9)
    ai_idx = AI_DETECT_INDEX.get(ai_label, 9)
    tx_idx = TOXICITY_INDEX.get(tx_label, 9)

    code = f"PV-0{dt_idx}{ai_idx}{tx_idx}"

    # Symbolic name: join the offending categories, or ALL_CLEAR on pass.
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        flags = []
        if dt_idx:
            flags.append(dt_label.upper())
        if ai_idx:
            flags.append("AI_GENERATED")
        if tx_idx:
            flags.append("TOXIC")
        name = "_AND_".join(flags) or "REJECTED"

    # Message priority: special combo > doc type > AI > toxicity > clean.
    message = COMBO_MESSAGES.get((dt_idx, ai_idx, tx_idx))
    if message is None:
        if dt_idx:
            message = DOC_TYPE_MESSAGES.get(dt_label, "Unknown document type.")
        elif ai_idx:
            message = AI_DETECT_MESSAGES.get(ai_label, "AI content detected.")
        elif tx_idx:
            message = TOXICITY_MESSAGES.get(tx_label, "Toxic content detected.")
        else:
            message = "Welcome to the lab."

    # Append classifier confidence scores for each flagged category.
    score_bits = []
    if dt_idx:
        score_bits.append(f"doc_type={dt_label}:{verdict['doc_type']['score']:.3f}")
    if ai_idx:
        score_bits.append(f"ai={verdict['ai_generated']['score']:.3f}")
    if tx_idx:
        score_bits.append(f"toxicity={verdict['toxicity']['score']:.3f}")
    if score_bits:
        message += f" ({', '.join(score_bits)})"

    # Only a wrong document type is fatal; AI/toxicity flags alone are not.
    fatal = dt_idx > 0

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        fatal=fatal,
        details={
            "doc_type": verdict["doc_type"],
            "ai_generated": verdict["ai_generated"],
            "toxicity": verdict["toxicity"],
        },
    )
| |
|
| |
|
| | |
| |
|
def empty_input_error() -> PubVerseError:
    """Fatal Step-0 error for a submission with no content at all."""
    msg = (
        "You sent us nothing. Literally nothing. "
        "The void does not require peer review."
    )
    return PubVerseError(code="PV-0900", name="EMPTY_INPUT", message=msg, step=0, fatal=True)
| |
|
| |
|
def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """Fatal Step-0 error for a PDF that could not be parsed.

    Args:
        filename: optional name of the offending file; included in the
            message when provided.

    Bug fix: the original message embedded the literal text "((unknown))"
    whenever a filename was passed, instead of the filename itself.
    """
    # Only mention the file when the caller actually named one.
    suffix = f" ({filename})" if filename else ""
    return PubVerseError(
        code="PV-0901",
        name="UNREADABLE_PDF",
        message=(
            f"We can't read this PDF{suffix}. "
            "If your PDF parser can't parse it, maybe it's not a PDF."
        ),
        step=0,
        fatal=True,
    )
| |
|
| |
|
def models_missing_error() -> PubVerseError:
    """Non-fatal Step-0 error raised when the PubGuard models are absent."""
    msg = (
        "PubGuard models not found. "
        "Run: cd pub_check && python scripts/train_pubguard.py"
    )
    return PubVerseError(code="PV-0902", name="MODELS_MISSING", message=msg, step=0, fatal=False)
| |
|
| |
|
def gate_bypassed() -> PubVerseError:
    """Non-fatal notice that screening was skipped via PUBGUARD_STRICT=0."""
    return PubVerseError(
        code="PV-0999",
        name="GATE_BYPASSED",
        message="PubGuard screening bypassed (PUBGUARD_STRICT=0). Proceeding on faith. Good luck.",
        step=0,
        fatal=False,
    )
| |
|
| |
|
| | |
| |
|
def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Create a pipeline error for steps 1-8.

    The code is assembled as PV-SXNN: step digit, sub-category digit,
    and a zero-padded two-digit detail number.
    """
    return PubVerseError(
        code=f"PV-{step}{sub}{detail:02d}",
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )
| |
|
| |
|
| | |
# Registry of known step 1-8 error codes: code -> (NAME, message, fatal).
# Fix: two messages contained a mojibake "β" where an em dash "—" belongs
# (PV-2100 and PV-6100).
PIPELINE_ERRORS = {
    # Step 1: VLM feature extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2: analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3: clustering artifacts
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4: graph construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5: GNN scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6: cluster analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7: enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8: visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}
| |
|
| |
|
def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single error line for stdout output.

    If name or message is omitted, the canonical entry from
    PIPELINE_ERRORS is used when the code is known; otherwise generic
    placeholders fill the gaps.

    Fix: annotations were `str = None`; they are now Optional[str] to
    match the actual defaults (Optional is already imported file-level).
    """
    if name is None or message is None:
        if code in PIPELINE_ERRORS:
            # Known codes use their registered name/message wholesale.
            name, message, _ = PIPELINE_ERRORS[code]
        else:
            name = name or "UNKNOWN"
            message = message or "An error occurred."
    return f"{code} | {name} | {message}"
| |
|