"""
SciPeerBench — Dataset schema definition.

World's first multi-dimensional scientific fraud benchmark.
Every paper is labeled across 14 fraud dimensions simultaneously.
No other dataset does this.
"""

from dataclasses import dataclass, field
from typing import Optional


# ── Fraud taxonomy — 20 fraud types grouped into six families ────────────────
#
# Keys have the form "<FAMILY>-<NN>", numbered from 01 within each family
# (e.g. "FAB-01", "STAT-03", "AUTH-02"); values are the human-readable
# descriptions. Families and their entries keep a fixed order, so the
# resulting dict iterates FAB, STAT, FIG, CIT, METH, AUTH.

FRAUD_TAXONOMY = {
    f"{family}-{number:02d}": description
    for family, descriptions in (
        # Data fabrication
        ("FAB", (
            "Complete data fabrication",
            "Partial data fabrication",
            "Data duplication across papers",
        )),
        # Statistical manipulation
        ("STAT", (
            "P-hacking and selective reporting",
            "HARKing — hypothesizing after results known",
            "Impossible statistical values GRIM or SPRITE",
            "Inflated effect sizes",
            "Underpowered study with strong claims",
        )),
        # Figure fraud
        ("FIG", (
            "Duplicated figure panels",
            "Manipulated western blots",
            "Image brightness or contrast manipulation",
        )),
        # Citation fraud
        ("CIT", (
            "Excessive self-citation ring",
            "Citation cartel coordinated group",
            "Unsupported claims without citation",
        )),
        # Methodology fraud
        ("METH", (
            "Causation claimed without RCT",
            "Missing control group",
            "Undisclosed conflicts of interest",
        )),
        # Authorship and integrity
        ("AUTH", (
            "LLM-generated paper",
            "Plagiarism detected",
            "Retracted paper cited as valid",
        )),
    )
    # Entries are numbered 01, 02, ... in the order they are listed above.
    for number, description in enumerate(descriptions, start=1)
}


# ── Paper categories for balanced dataset ────────────────────────────────────
#
# Maps each category label to a one-line description of the papers it covers.

PAPER_CATEGORIES = dict(
    CONFIRMED_FRAUD="Retracted with documented fraud reason",
    SUSPECTED_FRAUD="PubPeer flagged, not retracted yet",
    BORDERLINE="Minor issues, not clear fraud",
    CLEAN="High quality, replicated, no concerns",
    BASELINE_ELITE="Nobel prize or landmark papers",
)


# ── Source databases ──────────────────────────────────────────────────────────
#
# Maps a short source identifier to the URL recorded for that source.

DATA_SOURCES = dict(
    retraction_watch="https://retractionwatch.com",
    pubpeer="https://pubpeer.com",
    pubmed="https://pubmed.ncbi.nlm.nih.gov",
    arxiv="https://arxiv.org",
    semantic_scholar="https://api.semanticscholar.org",
    crossref="https://api.crossref.org",
)


# ── Target distribution — 1000 papers total ──────────────────────────────────
#
# Number of papers to collect per category; the five counts add up to 1000.

TARGET_DISTRIBUTION = dict(
    CONFIRMED_FRAUD=300,  # from RetractionWatch
    SUSPECTED_FRAUD=200,  # from PubPeer
    BORDERLINE=150,  # gray area
    CLEAN=250,  # normal good papers
    BASELINE_ELITE=100,  # Nobel / landmark
)


@dataclass
class PaperRecord:
    """A single SciPeerBench row describing one paper.

    Fields fall into six groups: identity, ground-truth labels, the 14
    per-module scores, the overall risk score, paper content, and labeling
    metadata. The identity and ground-truth fields (the first 14) have no
    defaults and must be supplied when constructing a record; every later
    field is optional and defaults to ``0.0`` or ``""`` unless noted below.
    """

    # ── Identity ──────────────────────────────────────────────────
    paper_id: str  # e.g. SPB-0001, SPB-0002, ...
    doi: Optional[str]
    title: str
    authors: str  # comma-separated author names
    year: int
    journal: str
    source_db: str  # database the paper was obtained from

    # ── Ground truth ──────────────────────────────────────────────
    category: str  # one of the PAPER_CATEGORIES keys
    is_fraud: int  # 1 = fraud, 0 = clean
    fraud_confidence: float  # between 0.0 and 1.0
    fraud_types: str  # comma-separated FRAUD_TAXONOMY keys
    retraction_date: Optional[str]  # formatted as YYYY-MM-DD
    retraction_reason: Optional[str]
    pubpeer_url: Optional[str]

    # ── 14 module scores — auto-generated by running our system ───
    # One float per analysis module; each starts at 0.0.
    stat_audit_score: float = 0.0
    figure_forensics_score: float = 0.0
    methodology_score: float = 0.0
    citation_score: float = 0.0
    reproducibility_score: float = 0.0
    novelty_score: float = 0.0
    grim_score: float = 0.0
    sprite_score: float = 0.0
    granularity_score: float = 0.0
    pcurve_score: float = 0.0
    effect_size_score: float = 0.0
    retraction_score: float = 0.0
    cartel_score: float = 0.0
    llm_score: float = 0.0

    # ── Weighted average of all 14 scores ─────────────────────────
    # Stored only; this class does not compute it.
    overall_risk_score: float = 0.0

    # ── Paper content ─────────────────────────────────────────────
    abstract_text: str = ""
    full_text_path: str = ""  # location of the saved full text

    # ── Metadata ──────────────────────────────────────────────────
    field_of_study: str = ""  # e.g. biology, psychology, medicine
    labeling_method: str = ""  # auto, manual, or auto+manual
    labeled_by: str = "auto"
    notes: str = ""


# ── CSV column order — exact order in output file ────────────────────────────
#
# The names mirror the PaperRecord fields, in declaration order.

CSV_COLUMNS = [
    # Identity
    "paper_id",
    "doi",
    "title",
    "authors",
    "year",
    "journal",
    "source_db",
    # Ground truth
    "category",
    "is_fraud",
    "fraud_confidence",
    "fraud_types",
    "retraction_date",
    "retraction_reason",
    "pubpeer_url",
    # Module scores
    "stat_audit_score",
    "figure_forensics_score",
    "methodology_score",
    "citation_score",
    "reproducibility_score",
    "novelty_score",
    "grim_score",
    "sprite_score",
    "granularity_score",
    "pcurve_score",
    "effect_size_score",
    "retraction_score",
    "cartel_score",
    "llm_score",
    # Overall risk
    "overall_risk_score",
    # Paper content
    "abstract_text",
    "full_text_path",
    # Metadata
    "field_of_study",
    "labeling_method",
    "labeled_by",
    "notes",
]