File size: 10,110 Bytes

3e68bea

"""
PubVerse Error Code System
==========================

Structured error codes for the entire PubVerse pipeline.
PubGuard codes (PV-0XXX) encode classifier predictions directly
into the code digits.

Error code format:  PV-SXNN
    S  = Step number (0-8)
    X  = Sub-category
    NN = Detail

PubGuard composite encoding (Step 0):
    PV-0 [doc_type] [ai_detect] [toxicity]
         0=paper     0=human     0=clean
         1=poster    1=ai        1=toxic
         2=abstract
         3=junk
"""

from dataclasses import dataclass
from typing import Dict, Any, Optional

# ── PubGuard (Step 0) error messages ─────────────────────────────

# Snarky user-facing text for each doc_type classifier label.
DOC_TYPE_MESSAGES = dict(
    scientific_paper="Welcome to the lab.",
    poster=(
        "That's a poster, not a paper. "
        "We appreciate the aesthetic effort, but we need Methods, "
        "not bullet points on a corkboard."
    ),
    abstract_only="We got the trailer but not the movie. Where's the rest of the paper?",
    junk=(
        "That's not a paper, that's a cry for help. "
        "Pool party invitations, invoices, and fantasy football drafts "
        "do not constitute peer-reviewed research."
    ),
)

# Text per AI-detection label; the "human" label carries no message (None).
AI_DETECT_MESSAGES = dict(
    human=None,
    ai_generated=(
        "Our classifier thinks a robot wrote this. The Turing test starts at the Introduction."
    ),
)

# Text per toxicity label; the "clean" label carries no message (None).
TOXICITY_MESSAGES = dict(
    clean=None,
    toxic="Content flagged as potentially toxic. Science should be provocative, not offensive.",
)

# Bespoke text for selected (doc_type_idx, ai_detect_idx, toxicity_idx) triples.
# A triple listed here takes precedence over the per-classifier messages.
COMBO_MESSAGES = {
    # junk + AI
    (3, 1, 0): (
        "AI-generated junk. "
        "Congratulations, you've automated mediocrity."
    ),
    # junk + toxic
    (3, 0, 1): (
        "Toxic junk. "
        "This is somehow worse than a pool party flyer."
    ),
    # junk + AI + toxic
    (3, 1, 1): (
        "The trifecta. AI-generated toxic junk. "
        "We'd be impressed if we weren't horrified."
    ),
    # poster + AI
    (1, 1, 0): (
        "An AI-generated poster. "
        "The future is here and it's making conference posters."
    ),
    # abstract + AI
    (2, 1, 0): (
        "An AI-generated abstract with no paper attached. "
        "Peak efficiency."
    ),
}

# Class label → digit used in the PV-0XXX code. Each label's position in its
# tuple is its index (stated to match the label order in config.py).
DOC_TYPE_INDEX = {
    label: position
    for position, label in enumerate(
        ("scientific_paper", "poster", "abstract_only", "junk")
    )
}
AI_DETECT_INDEX = {
    label: position for position, label in enumerate(("human", "ai_generated"))
}
TOXICITY_INDEX = {
    label: position for position, label in enumerate(("clean", "toxic"))
}


@dataclass
class PubVerseError:
    """One structured error record emitted by the PubVerse pipeline."""

    # Error code string, e.g. "PV-0300"
    code: str
    # Short symbolic name, e.g. "JUNK_DETECTED"
    name: str
    # Human-readable (snarky) description
    message: str
    # Pipeline step number the error belongs to
    step: int
    # True when the pipeline should halt on this error
    fatal: bool
    # Optional extra payload (scores, labels, etc.)
    details: Optional[Dict[str, Any]] = None

    def __str__(self) -> str:
        """Render as ``code | name | message``."""
        return "{} | {} | {}".format(self.code, self.name, self.message)

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict.

        The ``"details"`` key is included only when ``details`` is truthy,
        so both ``None`` and an empty dict are omitted.
        """
        core_fields = ("code", "name", "message", "step", "fatal")
        payload = {field_name: getattr(self, field_name) for field_name in core_fields}
        if not self.details:
            return payload
        payload["details"] = self.details
        return payload


def _format_score(head: Any) -> str:
    """Format ``head["score"]`` to three decimals, or ``"n/a"`` when unusable."""
    try:
        return format(head["score"], ".3f")
    except (KeyError, TypeError, ValueError):
        # Score key absent, score is None, or score is not a number.
        return "n/a"


def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
    """
    Build a PubGuard (step 0) error record from a screening verdict.

    The code is always derived from the three classifier labels:
        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]
    so a verdict labelled scientific_paper / human / clean yields PV-0000.
    A label that is not in the corresponding index table is encoded as 9.

    Args:
        verdict: Mapping read as follows:
            verdict["doc_type"], verdict["ai_generated"], verdict["toxicity"]
                — each a mapping with a "label" and (when flagged) a "score";
            verdict["pass"] — truthy when the screening passed.

    Returns:
        A PubVerseError with step=0. ``name`` is "ALL_CLEAR" whenever
        verdict["pass"] is truthy; otherwise it joins the flagged categories
        with "_AND_". ``fatal`` is True whenever the doc_type index is
        non-zero (including the unknown-label index 9). ``details`` holds the
        three classifier sub-verdicts as received.

    Raises:
        KeyError: if a required top-level key or a "label" key is missing.
    """
    dt_label = verdict["doc_type"]["label"]
    ai_label = verdict["ai_generated"]["label"]
    tx_label = verdict["toxicity"]["label"]

    # Unknown labels fall back to digit 9.
    # NOTE(review): with that fallback the code can equal a special code such
    # as "PV-0900", which empty_input_error also uses — confirm this overlap
    # is acceptable to consumers.
    dt_idx = DOC_TYPE_INDEX.get(dt_label, 9)
    ai_idx = AI_DETECT_INDEX.get(ai_label, 9)
    tx_idx = TOXICITY_INDEX.get(tx_label, 9)

    code = f"PV-0{dt_idx}{ai_idx}{tx_idx}"

    # Build name
    if verdict["pass"]:
        name = "ALL_CLEAR"
    else:
        parts = []
        if dt_idx > 0:
            # str() keeps an unrecognised non-string label (e.g. None) from
            # raising AttributeError; string labels are unaffected.
            parts.append(str(dt_label).upper())
        if ai_idx > 0:
            parts.append("AI_GENERATED")
        if tx_idx > 0:
            parts.append("TOXIC")
        name = "_AND_".join(parts) if parts else "REJECTED"

    # Build message — a combo message wins; otherwise the first flagged
    # category in doc_type → ai → toxicity order supplies the text.
    combo_key = (dt_idx, ai_idx, tx_idx)
    if combo_key in COMBO_MESSAGES:
        message = COMBO_MESSAGES[combo_key]
    elif dt_idx > 0:
        message = DOC_TYPE_MESSAGES.get(dt_label, "Unknown document type.")
    elif ai_idx > 0:
        message = AI_DETECT_MESSAGES.get(ai_label, "AI content detected.")
    elif tx_idx > 0:
        message = TOXICITY_MESSAGES.get(tx_label, "Toxic content detected.")
    else:
        message = "Welcome to the lab."

    # Append the score of every flagged classifier. A missing or non-numeric
    # score is rendered as "n/a" instead of aborting error construction.
    score_parts = []
    if dt_idx > 0:
        score_parts.append(f"doc_type={dt_label}:{_format_score(verdict['doc_type'])}")
    if ai_idx > 0:
        score_parts.append(f"ai={_format_score(verdict['ai_generated'])}")
    if tx_idx > 0:
        score_parts.append(f"toxicity={_format_score(verdict['toxicity'])}")

    if score_parts:
        message += f" ({', '.join(score_parts)})"

    # Fatal = doc_type is not scientific_paper (hard gate)
    fatal = dt_idx > 0

    details = {
        "doc_type": verdict["doc_type"],
        "ai_generated": verdict["ai_generated"],
        "toxicity": verdict["toxicity"],
    }

    return PubVerseError(
        code=code,
        name=name,
        message=message,
        step=0,
        fatal=fatal,
        details=details,
    )


# ── Special PubGuard errors ──────────────────────────────────────

def empty_input_error() -> PubVerseError:
    """Build the fatal step-0 error PV-0900 (EMPTY_INPUT)."""
    text = (
        "You sent us nothing. Literally nothing. "
        "The void does not require peer review."
    )
    return PubVerseError("PV-0900", "EMPTY_INPUT", text, step=0, fatal=True)


def unreadable_pdf_error(filename: str = "") -> PubVerseError:
    """Build the fatal step-0 error PV-0901 (UNREADABLE_PDF).

    A non-empty ``filename`` is appended to the message in parentheses.
    """
    suffix = f" ({filename})" if filename else ""
    text = (
        f"We can't read this PDF{suffix}. "
        "If your PDF parser can't parse it, maybe it's not a PDF."
    )
    return PubVerseError("PV-0901", "UNREADABLE_PDF", text, step=0, fatal=True)


def models_missing_error() -> PubVerseError:
    """Build the non-fatal step-0 error PV-0902 (MODELS_MISSING)."""
    text = (
        "PubGuard models not found. "
        "Run: cd pub_check && python scripts/train_pubguard.py"
    )
    # Non-fatal: the pipeline can continue without PubGuard.
    return PubVerseError("PV-0902", "MODELS_MISSING", text, step=0, fatal=False)


def gate_bypassed() -> PubVerseError:
    """Build the non-fatal step-0 notice PV-0999 (GATE_BYPASSED)."""
    text = (
        "PubGuard screening bypassed (PUBGUARD_STRICT=0). "
        "Proceeding on faith. Good luck."
    )
    return PubVerseError("PV-0999", "GATE_BYPASSED", text, step=0, fatal=False)


# ── Pipeline step errors (Steps 1-8) ────────────────────────────

def pipeline_error(step: int, sub: int, detail: int,
                   name: str, message: str, fatal: bool = True) -> PubVerseError:
    """Create a pipeline error whose code is ``PV-{step}{sub}{detail}``.

    ``detail`` is zero-padded to two digits; ``step`` is also stored on the
    returned record. The docstring of the original targets steps 1-8, but no
    range check is performed here.
    """
    return PubVerseError(
        code="PV-{}{}{:02d}".format(step, sub, detail),
        name=name,
        message=message,
        step=step,
        fatal=fatal,
    )


# Pre-built pipeline errors for bash scripts to reference by name.
# Maps code → (name, message, fatal).
PIPELINE_ERRORS = {
    # Step 1 — Feature Extraction
    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
    # Step 2 — PubVerse Analysis
    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
    # Step 3 — Artifact Verification
    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
    # Step 4 — Graph Construction
    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
    # Step 5 — 42DeepThought Scoring
    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
    # Step 6 — Cluster Analysis
    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
    # Step 7 — Enrichment
    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
    # Step 8 — Visualization
    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
}


def format_error_line(code: str, name: Optional[str] = None,
                      message: Optional[str] = None) -> str:
    """Format a single ``code | name | message`` error line for stdout output.

    Args:
        code: Error code, e.g. "PV-1100".
        name: Symbolic name. When omitted, it is taken from PIPELINE_ERRORS
            for a registered code, else "UNKNOWN".
        message: Description. When omitted, it is taken from PIPELINE_ERRORS
            for a registered code, else "An error occurred.".

    Returns:
        The formatted line. A value the caller supplies is kept; only the
        missing field is filled in from the registry or the generic default.
    """
    if name is None or message is None:
        default_name, default_message = "UNKNOWN", "An error occurred."
        entry = PIPELINE_ERRORS.get(code)
        if entry is not None:
            default_name, default_message, _ = entry
        # Fill only the gaps so that one explicit argument is not overwritten
        # by the registry entry for a known code.
        name = name or default_name
        message = message or default_message
    return f"{code} | {name} | {message}"