"""
SciPeerBench — Dataset schema definition.

Declares the constants and the record type that describe one row of the
SciPeerBench scientific-fraud benchmark:

* ``FRAUD_TAXONOMY``      — 20 fraud type codes and their descriptions.
* ``PAPER_CATEGORIES``    — the five ground-truth categories a paper can have.
* ``DATA_SOURCES``        — base URLs of the databases papers are drawn from.
* ``TARGET_DISTRIBUTION`` — target paper count per category (1000 in total).
* ``PaperRecord``         — dataclass for one paper, including 14 per-module
  score fields.
* ``CSV_COLUMNS``         — column order of the CSV output file.

This module only defines data; it performs no I/O and computes no scores.
"""

from dataclasses import dataclass, field
from typing import Optional

# ── Fraud taxonomy — 20 types ────────────────────────────────────────────────
# Keys are the codes stored (comma separated) in ``PaperRecord.fraud_types``.

FRAUD_TAXONOMY = {
    # Data fabrication
    "FAB-01": "Complete data fabrication",
    "FAB-02": "Partial data fabrication",
    "FAB-03": "Data duplication across papers",
    # Statistical manipulation
    "STAT-01": "P-hacking and selective reporting",
    "STAT-02": "HARKing — hypothesizing after results known",
    "STAT-03": "Impossible statistical values GRIM or SPRITE",
    "STAT-04": "Inflated effect sizes",
    "STAT-05": "Underpowered study with strong claims",
    # Figure fraud
    "FIG-01": "Duplicated figure panels",
    "FIG-02": "Manipulated western blots",
    "FIG-03": "Image brightness or contrast manipulation",
    # Citation fraud
    "CIT-01": "Excessive self-citation ring",
    "CIT-02": "Citation cartel coordinated group",
    "CIT-03": "Unsupported claims without citation",
    # Methodology fraud
    "METH-01": "Causation claimed without RCT",
    "METH-02": "Missing control group",
    "METH-03": "Undisclosed conflicts of interest",
    # Authorship and integrity
    "AUTH-01": "LLM-generated paper",
    "AUTH-02": "Plagiarism detected",
    "AUTH-03": "Retracted paper cited as valid",
}

# ── Paper categories for balanced dataset ────────────────────────────────────
# Keys are the values expected in ``PaperRecord.category``.

PAPER_CATEGORIES = {
    "CONFIRMED_FRAUD": "Retracted with documented fraud reason",
    "SUSPECTED_FRAUD": "PubPeer flagged, not retracted yet",
    "BORDERLINE": "Minor issues, not clear fraud",
    "CLEAN": "High quality, replicated, no concerns",
    "BASELINE_ELITE": "Nobel prize or landmark papers",
}

# ── Source databases ─────────────────────────────────────────────────────────

DATA_SOURCES = {
    "retraction_watch": "https://retractionwatch.com",
    "pubpeer": "https://pubpeer.com",
    "pubmed": "https://pubmed.ncbi.nlm.nih.gov",
    "arxiv": "https://arxiv.org",
    "semantic_scholar": "https://api.semanticscholar.org",
    "crossref": "https://api.crossref.org",
}

# ── Target distribution — 1000 papers total ──────────────────────────────────
# Uses the same keys as PAPER_CATEGORIES; the counts add up to 1000.

TARGET_DISTRIBUTION = {
    "CONFIRMED_FRAUD": 300,  # from RetractionWatch
    "SUSPECTED_FRAUD": 200,  # from PubPeer
    "BORDERLINE": 150,       # gray area
    "CLEAN": 250,            # normal good papers
    "BASELINE_ELITE": 100,   # Nobel / landmark
}


@dataclass
class PaperRecord:
    """
    One row in SciPeerBench.

    The identity and ground-truth fields have no defaults and must be
    supplied when constructing a record. The 14 module scores, the overall
    risk score, the paper content and the metadata fields all have defaults,
    so a record can be created before any scoring has been run.

    No validation is performed here: ``category``, ``fraud_types`` and
    ``source_db`` are plain strings and are not checked against
    ``PAPER_CATEGORIES``, ``FRAUD_TAXONOMY`` or ``DATA_SOURCES``.
    """

    # ── Identity ──────────────────────────────────────────────────
    paper_id: str                    # SPB-0001, SPB-0002 ...
    doi: Optional[str]
    title: str
    authors: str                     # comma separated
    year: int
    journal: str
    source_db: str                   # where we got it

    # ── Ground truth ──────────────────────────────────────────────
    category: str                    # from PAPER_CATEGORIES
    is_fraud: int                    # 1 = fraud, 0 = clean
    fraud_confidence: float          # 0.0 to 1.0
    fraud_types: str                 # comma separated FRAUD_TAXONOMY keys
    retraction_date: Optional[str]   # YYYY-MM-DD
    retraction_reason: Optional[str]
    pubpeer_url: Optional[str]

    # ── 14 module scores ──────────────────────────────────────────
    # Meant to be filled in by running the scoring system (not part of this
    # file); every score defaults to 0.0 until then.
    stat_audit_score: float = 0.0
    figure_forensics_score: float = 0.0
    methodology_score: float = 0.0
    citation_score: float = 0.0
    reproducibility_score: float = 0.0
    novelty_score: float = 0.0
    grim_score: float = 0.0
    sprite_score: float = 0.0
    granularity_score: float = 0.0
    pcurve_score: float = 0.0
    effect_size_score: float = 0.0
    retraction_score: float = 0.0
    cartel_score: float = 0.0
    llm_score: float = 0.0

    # ── Overall score ─────────────────────────────────────────────
    # Intended to hold a weighted average of the 14 scores above; it is not
    # computed here and the weights are not defined in this file.
    overall_risk_score: float = 0.0

    # ── Paper content ─────────────────────────────────────────────
    abstract_text: str = ""
    full_text_path: str = ""         # path to saved full text

    # ── Metadata ──────────────────────────────────────────────────
    field_of_study: str = ""         # biology, psychology, medicine...
    labeling_method: str = ""        # auto, manual, auto+manual
    labeled_by: str = "auto"
    notes: str = ""


# ── CSV column order — exact order in output file ────────────────────────────
# Currently mirrors PaperRecord's fields one-to-one, in declaration order.
# Keep the two in sync when adding, removing or reordering fields.

CSV_COLUMNS = [
    # Identity
    "paper_id",
    "doi",
    "title",
    "authors",
    "year",
    "journal",
    "source_db",
    # Ground truth
    "category",
    "is_fraud",
    "fraud_confidence",
    "fraud_types",
    "retraction_date",
    "retraction_reason",
    "pubpeer_url",
    # Module scores
    "stat_audit_score",
    "figure_forensics_score",
    "methodology_score",
    "citation_score",
    "reproducibility_score",
    "novelty_score",
    "grim_score",
    "sprite_score",
    "granularity_score",
    "pcurve_score",
    "effect_size_score",
    "retraction_score",
    "cartel_score",
    "llm_score",
    # Overall score
    "overall_risk_score",
    # Paper content
    "abstract_text",
    "full_text_path",
    # Metadata
    "field_of_study",
    "labeling_method",
    "labeled_by",
    "notes",
]