Abu-Sameer-66
fix: add requests dependency — v2.3.0 hotfix
b625b53
"""
SciPeerBench — Dataset schema definition.
World's first multi-dimensional scientific fraud benchmark.
Every paper labeled across 14 fraud dimensions simultaneously.
No other dataset does this.
"""
from dataclasses import dataclass, field
from typing import Optional
# ── Fraud taxonomy — 20 types, first time formally defined ───────────────────
# Maps each fraud-type code to a short human-readable description.
# Codes are "<CATEGORY>-<NN>", grouped below by category prefix
# (FAB, STAT, FIG, CIT, METH, AUTH). Insertion order is the listing order.
FRAUD_TAXONOMY = {
    # Data fabrication
    "FAB-01": "Complete data fabrication",
    "FAB-02": "Partial data fabrication",
    "FAB-03": "Data duplication across papers",

    # Statistical manipulation
    "STAT-01": "P-hacking and selective reporting",
    # The em dash here was previously stored as mis-decoded bytes.
    "STAT-02": "HARKing — hypothesizing after results known",
    "STAT-03": "Impossible statistical values GRIM or SPRITE",
    "STAT-04": "Inflated effect sizes",
    "STAT-05": "Underpowered study with strong claims",

    # Figure fraud
    "FIG-01": "Duplicated figure panels",
    "FIG-02": "Manipulated western blots",
    "FIG-03": "Image brightness or contrast manipulation",

    # Citation fraud
    "CIT-01": "Excessive self-citation ring",
    "CIT-02": "Citation cartel coordinated group",
    "CIT-03": "Unsupported claims without citation",

    # Methodology fraud
    "METH-01": "Causation claimed without RCT",
    "METH-02": "Missing control group",
    "METH-03": "Undisclosed conflicts of interest",

    # Authorship and integrity
    "AUTH-01": "LLM-generated paper",
    "AUTH-02": "Plagiarism detected",
    "AUTH-03": "Retracted paper cited as valid",
}
# ── Paper categories for balanced dataset ────────────────────────────────────
# Category label -> one-line description of what qualifies a paper for it.
# The same five labels are used as keys of TARGET_DISTRIBUTION.
PAPER_CATEGORIES = {
    "CONFIRMED_FRAUD": "Retracted with documented fraud reason",
    "SUSPECTED_FRAUD": "PubPeer flagged, not retracted yet",
    "BORDERLINE": "Minor issues, not clear fraud",
    "CLEAN": "High quality, replicated, no concerns",
    "BASELINE_ELITE": "Nobel prize or landmark papers",
}
# ── Source databases ──────────────────────────────────────────────────────────
# Short source identifier -> base URL of that database or API.
DATA_SOURCES = {
    "retraction_watch": "https://retractionwatch.com",
    "pubpeer": "https://pubpeer.com",
    "pubmed": "https://pubmed.ncbi.nlm.nih.gov",
    "arxiv": "https://arxiv.org",
    "semantic_scholar": "https://api.semanticscholar.org",
    "crossref": "https://api.crossref.org",
}
# ── Target distribution — 1000 papers total ──────────────────────────────────
# Planned number of papers per category; the five counts add up to 1000.
TARGET_DISTRIBUTION = {
    "CONFIRMED_FRAUD": 300,  # sourced from RetractionWatch
    "SUSPECTED_FRAUD": 200,  # sourced from PubPeer
    "BORDERLINE": 150,       # gray-area papers
    "CLEAN": 250,            # ordinary good papers
    "BASELINE_ELITE": 100,   # Nobel / landmark papers
}
@dataclass
class PaperRecord:
    """One row of the SciPeerBench dataset.

    A record carries the paper's identity, its ground-truth fraud label,
    14 per-module scores plus an overall risk score, the paper content,
    and labeling metadata.

    The identity and ground-truth fields have no defaults and must be
    supplied by the caller (pass ``None`` for the ``Optional`` ones when the
    value is unknown). Every score defaults to ``0.0`` and the remaining
    text fields default to ``""``, except ``labeled_by`` which defaults to
    ``"auto"``.

    The value formats noted next to each field are conventions only; this
    class performs no validation.
    """

    # ── Identity ──────────────────────────────────────────────────
    paper_id: str                      # e.g. SPB-0001, SPB-0002, ...
    doi: Optional[str]
    title: str
    authors: str                       # comma-separated names
    year: int
    journal: str
    source_db: str                     # database the paper was obtained from

    # ── Ground truth ──────────────────────────────────────────────
    category: str                      # a key of PAPER_CATEGORIES
    is_fraud: int                      # 1 = fraud, 0 = clean
    fraud_confidence: float            # 0.0 to 1.0
    fraud_types: str                   # comma-separated FRAUD_TAXONOMY keys
    retraction_date: Optional[str]     # YYYY-MM-DD
    retraction_reason: Optional[str]
    pubpeer_url: Optional[str]

    # ── 14 module scores — auto-generated by running our system ───
    stat_audit_score: float = 0.0
    figure_forensics_score: float = 0.0
    methodology_score: float = 0.0
    citation_score: float = 0.0
    reproducibility_score: float = 0.0
    novelty_score: float = 0.0
    grim_score: float = 0.0
    sprite_score: float = 0.0
    granularity_score: float = 0.0
    pcurve_score: float = 0.0
    effect_size_score: float = 0.0
    retraction_score: float = 0.0
    cartel_score: float = 0.0
    llm_score: float = 0.0

    # ── Weighted average of all 14 scores ─────────────────────────
    # Stored as a plain field; it is not computed by this class.
    overall_risk_score: float = 0.0

    # ── Paper content ─────────────────────────────────────────────
    abstract_text: str = ""
    full_text_path: str = ""           # path to the saved full text

    # ── Metadata ──────────────────────────────────────────────────
    field_of_study: str = ""           # biology, psychology, medicine, ...
    labeling_method: str = ""          # auto, manual, auto+manual
    labeled_by: str = "auto"
    notes: str = ""
# ── CSV column order β€” exact order in output file ────────────────────────────
# Column names in the order they are listed for the output file, grouped
# here by section for readability.
CSV_COLUMNS = [
    # Identity
    "paper_id",
    "doi",
    "title",
    "authors",
    "year",
    "journal",
    "source_db",
    # Ground truth
    "category",
    "is_fraud",
    "fraud_confidence",
    "fraud_types",
    "retraction_date",
    "retraction_reason",
    "pubpeer_url",
    # Module scores
    "stat_audit_score",
    "figure_forensics_score",
    "methodology_score",
    "citation_score",
    "reproducibility_score",
    "novelty_score",
    "grim_score",
    "sprite_score",
    "granularity_score",
    "pcurve_score",
    "effect_size_score",
    "retraction_score",
    "cartel_score",
    "llm_score",
    # Overall score
    "overall_risk_score",
    # Paper content
    "abstract_text",
    "full_text_path",
    # Metadata
    "field_of_study",
    "labeling_method",
    "labeled_by",
    "notes",
]