jimnoneill
/

pubguard-classifier

@@ -1,274 +0,0 @@
-"""
-PubVerse Error Code System
-==========================
-Structured error codes for the entire PubVerse pipeline.
-PubGuard codes (PV-0XXX) encode classifier predictions directly
-into the code digits.
-Error code format:  PV-SXNN
-    S  = Step number (0-8)
-    X  = Sub-category
-    NN = Detail
-PubGuard composite encoding (Step 0):
-    PV-0 [doc_type] [ai_detect] [toxicity]
-         0=paper     0=human     0=clean
-         1=poster    1=ai        1=toxic
-         2=abstract
-         3=junk
-         9=unknown   9=unknown   9=unknown   (label missing from the index maps)
-"""
-from dataclasses import dataclass
-from typing import Dict, Any, Optional
-# ── PubGuard (Step 0) error messages ─────────────────────────────
-# Snarky messages keyed by doc_type classification
-# The keys are the same doc_type labels that DOC_TYPE_INDEX maps to code digits.
-# build_pubguard_error() falls back to this table only when no COMBO_MESSAGES
-# entry matches and the doc_type digit is non-zero, so the "scientific_paper"
-# entry is not read there (the all-clear branch repeats the text as a literal).
-DOC_TYPE_MESSAGES = {
-    "scientific_paper": "Welcome to the lab.",
-    "poster": (
-        "That's a poster, not a paper. We appreciate the aesthetic effort, "
-        "but we need Methods, not bullet points on a corkboard."
-    ),
-    "abstract_only": (
-        "We got the trailer but not the movie. "
-        "Where's the rest of the paper?"
-    ),
-    "junk": (
-        "That's not a paper, that's a cry for help. Pool party invitations, "
-        "invoices, and fantasy football drafts do not constitute peer-reviewed research."
-    ),
-}
-# Messages keyed by the ai_generated label; None means "nothing to report".
-AI_DETECT_MESSAGES = {
-    "human": None,  # No message needed
-    "ai_generated": (
-        "Our classifier thinks a robot wrote this. "
-        "The Turing test starts at the Introduction."
-    ),
-}
-# Messages keyed by the toxicity label; None means "nothing to report".
-TOXICITY_MESSAGES = {
-    "clean": None,
-    "toxic": (
-        "Content flagged as potentially toxic. "
-        "Science should be provocative, not offensive."
-    ),
-}
-# Special composite messages for particularly entertaining combos
-# Keys are (doc_type_idx, ai_detect_idx, toxicity_idx) tuples, i.e. the three
-# digits that follow "PV-0" in the code. build_pubguard_error() checks this
-# table before any of the single-classifier tables above.
-COMBO_MESSAGES = {
-    (3, 1, 0): "AI-generated junk. Congratulations, you've automated mediocrity.",
-    (3, 0, 1): "Toxic junk. This is somehow worse than a pool party flyer.",
-    (3, 1, 1): "The trifecta. AI-generated toxic junk. We'd be impressed if we weren't horrified.",
-    (1, 1, 0): "An AI-generated poster. The future is here and it's making conference posters.",
-    (2, 1, 0): "An AI-generated abstract with no paper attached. Peak efficiency.",
-}
-# Class label → index mapping (matches config.py label order)
-# Each index becomes one digit of the PV-0XXX code. A label that is absent from
-# its map is encoded as 9 by build_pubguard_error().
-DOC_TYPE_INDEX = {"scientific_paper": 0, "poster": 1, "abstract_only": 2, "junk": 3}
-AI_DETECT_INDEX = {"human": 0, "ai_generated": 1}
-TOXICITY_INDEX = {"clean": 0, "toxic": 1}
-@dataclass
-class PubVerseError:
-    """Structured pipeline error code."""
-    code: str           # e.g. "PV-0300"
-    name: str           # e.g. "JUNK_DETECTED"
-    message: str        # Human-readable (snarky) description
-    step: int           # Pipeline step number
-    fatal: bool         # Whether this should halt the pipeline
-    details: Optional[Dict[str, Any]] = None  # Optional scores, labels, etc.
-    def __str__(self) -> str:
-        return f"{self.code} | {self.name} | {self.message}"
-    def to_dict(self) -> Dict[str, Any]:
-        d = {
-            "code": self.code,
-            "name": self.name,
-            "message": self.message,
-            "step": self.step,
-            "fatal": self.fatal,
-        }
-        if self.details:
-            d["details"] = self.details
-        return d
-def build_pubguard_error(verdict: Dict[str, Any]) -> PubVerseError:
-    """
-    Build a PubGuard error code from a screening verdict.
-    The code encodes the classifier predictions:
-        PV-0[doc_type_idx][ai_detect_idx][toxicity_idx]
-    Returns PV-0000 (ALL_CLEAR) if the paper passes.
-    """
-    dt_label = verdict["doc_type"]["label"]
-    ai_label = verdict["ai_generated"]["label"]
-    tx_label = verdict["toxicity"]["label"]
-    dt_idx = DOC_TYPE_INDEX.get(dt_label, 9)
-    ai_idx = AI_DETECT_INDEX.get(ai_label, 9)
-    tx_idx = TOXICITY_INDEX.get(tx_label, 9)
-    code = f"PV-0{dt_idx}{ai_idx}{tx_idx}"
-    # Build name
-    if verdict["pass"]:
-        name = "ALL_CLEAR"
-    else:
-        parts = []
-        if dt_idx > 0:
-            parts.append(dt_label.upper())
-        if ai_idx > 0:
-            parts.append("AI_GENERATED")
-        if tx_idx > 0:
-            parts.append("TOXIC")
-        name = "_AND_".join(parts) if parts else "REJECTED"
-    # Build message — check combo messages first, then individual
-    combo_key = (dt_idx, ai_idx, tx_idx)
-    if combo_key in COMBO_MESSAGES:
-        message = COMBO_MESSAGES[combo_key]
-    elif dt_idx > 0:
-        message = DOC_TYPE_MESSAGES.get(dt_label, "Unknown document type.")
-    elif ai_idx > 0:
-        message = AI_DETECT_MESSAGES.get(ai_label, "AI content detected.")
-    elif tx_idx > 0:
-        message = TOXICITY_MESSAGES.get(tx_label, "Toxic content detected.")
-    else:
-        message = "Welcome to the lab."
-    # Add scores to message
-    score_parts = []
-    if dt_idx > 0:
-        score_parts.append(f"doc_type={dt_label}:{verdict['doc_type']['score']:.3f}")
-    if ai_idx > 0:
-        score_parts.append(f"ai={verdict['ai_generated']['score']:.3f}")
-    if tx_idx > 0:
-        score_parts.append(f"toxicity={verdict['toxicity']['score']:.3f}")
-    if score_parts:
-        message += f" ({', '.join(score_parts)})"
-    # Fatal = doc_type is not scientific_paper (hard gate)
-    fatal = dt_idx > 0
-    details = {
-        "doc_type": verdict["doc_type"],
-        "ai_generated": verdict["ai_generated"],
-        "toxicity": verdict["toxicity"],
-    }
-    return PubVerseError(
-        code=code,
-        name=name,
-        message=message,
-        step=0,
-        fatal=fatal,
-        details=details,
-    )
-# ── Special PubGuard errors ──────────────────────────────────────
-def empty_input_error() -> PubVerseError:
-    return PubVerseError(
-        code="PV-0900",
-        name="EMPTY_INPUT",
-        message=(
-            "You sent us nothing. Literally nothing. "
-            "The void does not require peer review."
-        ),
-        step=0,
-        fatal=True,
-    )
-def unreadable_pdf_error(filename: str = "") -> PubVerseError:
-    return PubVerseError(
-        code="PV-0901",
-        name="UNREADABLE_PDF",
-        message=(
-            f"We can't read this PDF{f' ({filename})' if filename else ''}. "
-            "If your PDF parser can't parse it, maybe it's not a PDF."
-        ),
-        step=0,
-        fatal=True,
-    )
-def models_missing_error() -> PubVerseError:
-    return PubVerseError(
-        code="PV-0902",
-        name="MODELS_MISSING",
-        message=(
-            "PubGuard models not found. "
-            "Run: cd pub_check && python scripts/train_pubguard.py"
-        ),
-        step=0,
-        fatal=False,  # Pipeline can continue without PubGuard
-    )
-def gate_bypassed() -> PubVerseError:
-    return PubVerseError(
-        code="PV-0999",
-        name="GATE_BYPASSED",
-        message="PubGuard screening bypassed (PUBGUARD_STRICT=0). Proceeding on faith. Good luck.",
-        step=0,
-        fatal=False,
-    )
-# ── Pipeline step errors (Steps 1-8) ────────────────────────────
-def pipeline_error(step: int, sub: int, detail: int,
-                   name: str, message: str, fatal: bool = True) -> PubVerseError:
-    """Create a pipeline error for steps 1-8."""
-    code = f"PV-{step}{sub}{detail:02d}"
-    return PubVerseError(code=code, name=name, message=message, step=step, fatal=fatal)
-# Pre-built pipeline errors for bash scripts to reference by name
-# Maps an error code string to a 3-tuple. format_error_line() unpacks the first
-# two elements as (name, message) and ignores the third; the third is a bool,
-# presumably the "fatal" flag used by PubVerseError -- confirm against callers.
-PIPELINE_ERRORS = {
-    # Step 1 — Feature Extraction
-    "PV-1100": ("EXTRACTION_FAILED", "VLM feature extraction failed. Your PDF defeated a 7-billion parameter model. Impressive, actually.", True),
-    "PV-1101": ("NO_TSV_OUTPUT", "Extraction ran but produced no output file. The VLM started reading and apparently gave up.", True),
-    "PV-1200": ("VLM_NOT_FOUND", "Feature extraction script not found. Did someone move it?", True),
-    # Step 2 — PubVerse Analysis
-    "PV-2100": ("ANALYSIS_FAILED", "PubVerse analysis crashed. This is the big one — check the logs.", True),
-    # Step 3 — Artifact Verification
-    "PV-3100": ("MATRIX_MISSING", "Unified adjacency matrix not found. Something went sideways in clustering.", True),
-    "PV-3101": ("PICKLE_MISSING", "Impact analysis pickle not found. The most important intermediate file is AWOL.", True),
-    # Step 4 — Graph Construction
-    "PV-4100": ("GRAPH_FAILED", "Graph construction failed. The nodes existed briefly, like a postdoc's optimism.", True),
-    # Step 5 — 42DeepThought Scoring
-    "PV-5100": ("SCORING_FAILED", "GNN scoring crashed. Check CUDA, check the graph, check your assumptions.", True),
-    "PV-5101": ("NO_GRAPH_PICKLE", "Graph pickle not found for scoring. Step 4 must have failed silently.", True),
-    "PV-5102": ("NO_SCORES_OUTPUT", "Scoring ran but produced no TSV. The GNN had nothing to say about your paper.", False),
-    "PV-5200": ("DEEPTHOUGHT_MISSING", "42DeepThought directory not found. The entire scoring engine is missing.", False),
-    # Step 6 — Cluster Analysis
-    "PV-6100": ("CLUSTER_FAILED", "Cluster analysis crashed. Your paper is a loner — or the code is.", False),
-    "PV-6102": ("DB_TIMEOUT", "Cluster database population timed out (>1 hour). The LLM is still thinking.", False),
-    "PV-6103": ("NO_QUERY_ID", "Could not extract query paper ID from TSV. Who are you, even?", True),
-    # Step 7 — Enrichment
-    "PV-7100": ("ENRICH_FAILED", "Enrichment script crashed. The data refused to be unified.", False),
-    # Step 8 — Visualization
-    "PV-8100": ("VIZ_FAILED", "Visualization generation failed. You'll have to imagine the graph.", False),
-}
-def format_error_line(code: str, name: str = None, message: str = None) -> str:
-    """Format a single error line for stdout output."""
-    if name is None or message is None:
-        if code in PIPELINE_ERRORS:
-            name, message, _ = PIPELINE_ERRORS[code]
-        else:
-            name = name or "UNKNOWN"
-            message = message or "An error occurred."
-    return f"{code} | {name} | {message}"