File size: 6,392 Bytes
b5c2bb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
SciPeerBench — Dataset schema definition.

World's first multi-dimensional scientific fraud benchmark.
Every paper labeled across 14 fraud dimensions simultaneously.
No other dataset does this.
"""

from dataclasses import dataclass, field
from typing import Optional


# ── Fraud taxonomy — 20 types, first time formally defined ───────────────────
#
# Maps a taxonomy code of the form "<FAMILY>-<NN>" to a human-readable
# description. PaperRecord.fraud_types stores a comma separated list of
# these codes.

FRAUD_TAXONOMY = {
    # Data fabrication
    "FAB-01": "Complete data fabrication",
    "FAB-02": "Partial data fabrication",
    "FAB-03": "Data duplication across papers",

    # Statistical manipulation
    "STAT-01": "P-hacking and selective reporting",
    # The em dash below was previously stored as mis-decoded bytes.
    "STAT-02": "HARKing — hypothesizing after results known",
    "STAT-03": "Impossible statistical values GRIM or SPRITE",
    "STAT-04": "Inflated effect sizes",
    "STAT-05": "Underpowered study with strong claims",

    # Figure fraud
    "FIG-01": "Duplicated figure panels",
    "FIG-02": "Manipulated western blots",
    "FIG-03": "Image brightness or contrast manipulation",

    # Citation fraud
    "CIT-01": "Excessive self-citation ring",
    "CIT-02": "Citation cartel coordinated group",
    "CIT-03": "Unsupported claims without citation",

    # Methodology fraud
    "METH-01": "Causation claimed without RCT",
    "METH-02": "Missing control group",
    "METH-03": "Undisclosed conflicts of interest",

    # Authorship and integrity
    "AUTH-01": "LLM-generated paper",
    "AUTH-02": "Plagiarism detected",
    "AUTH-03": "Retracted paper cited as valid",
}


# ── Paper categories for balanced dataset ────────────────────────────────────
#
# Category label -> short description of what qualifies a paper for it.
# TARGET_DISTRIBUTION uses the same five labels as its keys.

PAPER_CATEGORIES = {
    "CONFIRMED_FRAUD": "Retracted with documented fraud reason",
    "SUSPECTED_FRAUD": "PubPeer flagged, not retracted yet",
    "BORDERLINE": "Minor issues, not clear fraud",
    "CLEAN": "High quality, replicated, no concerns",
    "BASELINE_ELITE": "Nobel prize or landmark papers",
}


# ── Source databases ──────────────────────────────────────────────────────────
#
# Short source name -> URL of the corresponding site or API.
# NOTE(review): PaperRecord.source_db presumably holds one of these keys;
# confirm against the code that populates it.

DATA_SOURCES = {
    "retraction_watch": "https://retractionwatch.com",
    "pubpeer": "https://pubpeer.com",
    "pubmed": "https://pubmed.ncbi.nlm.nih.gov",
    "arxiv": "https://arxiv.org",
    "semantic_scholar": "https://api.semanticscholar.org",
    "crossref": "https://api.crossref.org",
}


# ── Target distribution — 1000 papers total ──────────────────────────────────
#
# Planned number of papers per category; the five quotas add up to 1000.

TARGET_DISTRIBUTION = {
    # from RetractionWatch
    "CONFIRMED_FRAUD": 300,
    # from PubPeer
    "SUSPECTED_FRAUD": 200,
    # gray area
    "BORDERLINE": 150,
    # normal good papers
    "CLEAN": 250,
    # Nobel / landmark
    "BASELINE_ELITE": 100,
}


@dataclass
class PaperRecord:
    """A single SciPeerBench row describing one paper.

    The first fourteen fields (identity and ground truth) have no defaults
    and must be supplied when constructing a record. Every remaining field
    is optional: the fourteen per-module scores and ``overall_risk_score``
    default to ``0.0``, text fields default to the empty string, and
    ``labeled_by`` defaults to ``"auto"``.

    Field declaration order is significant: it defines the positional
    argument order of the generated ``__init__``.
    """

    # ── Identity ──────────────────────────────────────────────────
    # Identifier such as SPB-0001, SPB-0002 ...
    paper_id: str
    doi: Optional[str]
    title: str
    # Author names, comma separated.
    authors: str
    year: int
    journal: str
    # Where the paper was obtained from.
    source_db: str

    # ── Ground truth ──────────────────────────────────────────────
    # One of the PAPER_CATEGORIES keys.
    category: str
    # 1 = fraud, 0 = clean.
    is_fraud: int
    # Between 0.0 and 1.0.
    fraud_confidence: float
    # FRAUD_TAXONOMY keys, comma separated.
    fraud_types: str
    # Formatted as YYYY-MM-DD.
    retraction_date: Optional[str]
    retraction_reason: Optional[str]
    pubpeer_url: Optional[str]

    # ── 14 module scores — auto-generated by running our system ───
    stat_audit_score: float = 0.0
    figure_forensics_score: float = 0.0
    methodology_score: float = 0.0
    citation_score: float = 0.0
    reproducibility_score: float = 0.0
    novelty_score: float = 0.0
    grim_score: float = 0.0
    sprite_score: float = 0.0
    granularity_score: float = 0.0
    pcurve_score: float = 0.0
    effect_size_score: float = 0.0
    retraction_score: float = 0.0
    cartel_score: float = 0.0
    llm_score: float = 0.0

    # ── Weighted average of all 14 scores ─────────────────────────
    # Not computed by this class; whoever fills the record sets it.
    overall_risk_score: float = 0.0

    # ── Paper content ─────────────────────────────────────────────
    abstract_text: str = ""
    # Path to the saved full text.
    full_text_path: str = ""

    # ── Metadata ──────────────────────────────────────────────────
    # e.g. biology, psychology, medicine...
    field_of_study: str = ""
    # auto, manual, auto+manual
    labeling_method: str = ""
    labeled_by: str = "auto"
    notes: str = ""


# ── CSV column order — exact order in output file ────────────────────────────
#
# The names and their order match the PaperRecord fields as declared.

CSV_COLUMNS = [
    # identity
    "paper_id",
    "doi",
    "title",
    "authors",
    "year",
    "journal",
    "source_db",
    # ground truth
    "category",
    "is_fraud",
    "fraud_confidence",
    "fraud_types",
    "retraction_date",
    "retraction_reason",
    "pubpeer_url",
    # module scores
    "stat_audit_score",
    "figure_forensics_score",
    "methodology_score",
    "citation_score",
    "reproducibility_score",
    "novelty_score",
    "grim_score",
    "sprite_score",
    "granularity_score",
    "pcurve_score",
    "effect_size_score",
    "retraction_score",
    "cartel_score",
    "llm_score",
    # aggregate
    "overall_risk_score",
    # paper content
    "abstract_text",
    "full_text_path",
    # metadata
    "field_of_study",
    "labeling_method",
    "labeled_by",
    "notes",
]