"""
SciPeerBench — Dataset schema definition.
World's first multi-dimensional scientific fraud benchmark.
Every paper labeled across 14 fraud dimensions simultaneously.
No other dataset does this.
"""
from dataclasses import dataclass, field
from typing import Optional
# ── Fraud taxonomy — 20 types, first time formally defined ───────────────────
# Maps each fraud-type code to a short human-readable description.
# Codes are a category prefix plus a two-digit index (e.g. "FAB-01").
# Insertion order is grouped by category and is part of the data.
FRAUD_TAXONOMY = {
    # Data fabrication
    "FAB-01": "Complete data fabrication",
    "FAB-02": "Partial data fabrication",
    "FAB-03": "Data duplication across papers",
    # Statistical manipulation
    "STAT-01": "P-hacking and selective reporting",
    # The separator below is an em dash; it was previously a mis-encoded
    # character that showed up as a stray Greek letter in the description.
    "STAT-02": "HARKing — hypothesizing after results known",
    "STAT-03": "Impossible statistical values GRIM or SPRITE",
    "STAT-04": "Inflated effect sizes",
    "STAT-05": "Underpowered study with strong claims",
    # Figure fraud
    "FIG-01": "Duplicated figure panels",
    "FIG-02": "Manipulated western blots",
    "FIG-03": "Image brightness or contrast manipulation",
    # Citation fraud
    "CIT-01": "Excessive self-citation ring",
    "CIT-02": "Citation cartel coordinated group",
    "CIT-03": "Unsupported claims without citation",
    # Methodology fraud
    "METH-01": "Causation claimed without RCT",
    "METH-02": "Missing control group",
    "METH-03": "Undisclosed conflicts of interest",
    # Authorship and integrity
    "AUTH-01": "LLM-generated paper",
    "AUTH-02": "Plagiarism detected",
    "AUTH-03": "Retracted paper cited as valid",
}
# ── Paper categories for balanced dataset ────────────────────────────────────
# Category code -> one-line description of what qualifies a paper for it.
# Entries are listed from most clearly fraudulent to most trusted.
PAPER_CATEGORIES = {
    "CONFIRMED_FRAUD":
        "Retracted with documented fraud reason",
    "SUSPECTED_FRAUD":
        "PubPeer flagged, not retracted yet",
    "BORDERLINE":
        "Minor issues, not clear fraud",
    "CLEAN":
        "High quality, replicated, no concerns",
    "BASELINE_ELITE":
        "Nobel prize or landmark papers",
}
# ── Source databases ─────────────────────────────────────────────────────────
# Short source identifier -> base URL of that source.
DATA_SOURCES = {
    # Retraction and post-publication review trackers
    "retraction_watch": "https://retractionwatch.com",
    "pubpeer": "https://pubpeer.com",
    # Literature indexes and preprint servers
    "pubmed": "https://pubmed.ncbi.nlm.nih.gov",
    "arxiv": "https://arxiv.org",
    # Metadata APIs
    "semantic_scholar": "https://api.semanticscholar.org",
    "crossref": "https://api.crossref.org",
}
# ── Target distribution — 1000 papers total ──────────────────────────────────
# Planned number of papers per category; the five counts add up to 1000.
TARGET_DISTRIBUTION = {
    # sourced from RetractionWatch
    "CONFIRMED_FRAUD": 300,
    # sourced from PubPeer
    "SUSPECTED_FRAUD": 200,
    # gray-area papers
    "BORDERLINE": 150,
    # ordinary good papers
    "CLEAN": 250,
    # Nobel / landmark papers
    "BASELINE_ELITE": 100,
}
@dataclass
class PaperRecord:
    """A single SciPeerBench row.

    The fields fall into five groups: identity, ground-truth labels,
    the 14 per-module scores plus their overall risk score, paper
    content, and labeling metadata. The fields of the first two groups
    have no default and must be supplied, in declaration order when
    passed positionally; every remaining field has a default.
    """

    # ── Identity ─────────────────────────────────────────────────────────
    paper_id: str                     # e.g. SPB-0001, SPB-0002 ...
    doi: Optional[str]
    title: str
    authors: str                      # comma-separated names
    year: int
    journal: str
    source_db: str                    # database the record was taken from

    # ── Ground truth ─────────────────────────────────────────────────────
    category: str                     # one of the PAPER_CATEGORIES keys
    is_fraud: int                     # 1 = fraud, 0 = clean
    fraud_confidence: float           # between 0.0 and 1.0
    fraud_types: str                  # comma-separated FRAUD_TAXONOMY keys
    retraction_date: Optional[str]    # formatted YYYY-MM-DD
    retraction_reason: Optional[str]
    pubpeer_url: Optional[str]

    # ── 14 module scores — auto-generated by running our system ──────────
    stat_audit_score: float = 0.0
    figure_forensics_score: float = 0.0
    methodology_score: float = 0.0
    citation_score: float = 0.0
    reproducibility_score: float = 0.0
    novelty_score: float = 0.0
    grim_score: float = 0.0
    sprite_score: float = 0.0
    granularity_score: float = 0.0
    pcurve_score: float = 0.0
    effect_size_score: float = 0.0
    retraction_score: float = 0.0
    cartel_score: float = 0.0
    llm_score: float = 0.0

    # ── Weighted average of all 14 scores ────────────────────────────────
    overall_risk_score: float = 0.0

    # ── Paper content ────────────────────────────────────────────────────
    abstract_text: str = ""
    full_text_path: str = ""          # where the saved full text lives

    # ── Metadata ─────────────────────────────────────────────────────────
    field_of_study: str = ""          # biology, psychology, medicine...
    labeling_method: str = ""         # auto, manual, auto+manual
    labeled_by: str = "auto"
    notes: str = ""
# ── CSV column order — exact order in output file ────────────────────────────
# Column names in the order they are written to the output CSV.
# A stray "|" token that followed the closing bracket has been removed;
# it made this statement a syntax error.
CSV_COLUMNS = [
    # Identity
    "paper_id", "doi", "title", "authors", "year",
    "journal", "source_db",
    # Ground truth
    "category", "is_fraud", "fraud_confidence",
    "fraud_types", "retraction_date", "retraction_reason", "pubpeer_url",
    # The 14 per-module scores
    "stat_audit_score", "figure_forensics_score", "methodology_score",
    "citation_score", "reproducibility_score", "novelty_score",
    "grim_score", "sprite_score", "granularity_score", "pcurve_score",
    "effect_size_score", "retraction_score", "cartel_score", "llm_score",
    # Aggregate score
    "overall_risk_score",
    # Paper content
    "abstract_text", "full_text_path",
    # Metadata
    "field_of_study", "labeling_method", "labeled_by", "notes",
]