# Spaces:
# Running
# Running
"""
SciPeerBench — Dataset schema definition.
World's first multi-dimensional scientific fraud benchmark.
Every paper labeled across 14 fraud dimensions simultaneously.
No other dataset does this.
"""
from dataclasses import dataclass, field
from typing import Optional
# ── Fraud taxonomy — 20 types, first time formally defined ──────────────────
# Maps a fraud-type code of the form "<GROUP>-<NN>" to a short human-readable
# description. Codes are grouped by prefix: FAB, STAT, FIG, CIT, METH, AUTH.
FRAUD_TAXONOMY = {
    # Data fabrication
    "FAB-01": "Complete data fabrication",
    "FAB-02": "Partial data fabrication",
    "FAB-03": "Data duplication across papers",
    # Statistical manipulation
    "STAT-01": "P-hacking and selective reporting",
    "STAT-02": "HARKing — hypothesizing after results known",
    "STAT-03": "Impossible statistical values GRIM or SPRITE",
    "STAT-04": "Inflated effect sizes",
    "STAT-05": "Underpowered study with strong claims",
    # Figure fraud
    "FIG-01": "Duplicated figure panels",
    "FIG-02": "Manipulated western blots",
    "FIG-03": "Image brightness or contrast manipulation",
    # Citation fraud
    "CIT-01": "Excessive self-citation ring",
    "CIT-02": "Citation cartel coordinated group",
    "CIT-03": "Unsupported claims without citation",
    # Methodology fraud
    "METH-01": "Causation claimed without RCT",
    "METH-02": "Missing control group",
    "METH-03": "Undisclosed conflicts of interest",
    # Authorship and integrity
    "AUTH-01": "LLM-generated paper",
    "AUTH-02": "Plagiarism detected",
    "AUTH-03": "Retracted paper cited as valid",
}
# ── Paper categories for balanced dataset ───────────────────────────────────
# Maps each category label to a one-line description of what qualifies a
# paper for it. The same five labels are the keys of TARGET_DISTRIBUTION.
PAPER_CATEGORIES = {
    "CONFIRMED_FRAUD": "Retracted with documented fraud reason",
    "SUSPECTED_FRAUD": "PubPeer flagged, not retracted yet",
    "BORDERLINE": "Minor issues, not clear fraud",
    "CLEAN": "High quality, replicated, no concerns",
    "BASELINE_ELITE": "Nobel prize or landmark papers",
}
# ── Source databases ────────────────────────────────────────────────────────
# Maps a short source identifier to that source's base URL.
DATA_SOURCES = {
    "retraction_watch": "https://retractionwatch.com",
    "pubpeer": "https://pubpeer.com",
    "pubmed": "https://pubmed.ncbi.nlm.nih.gov",
    "arxiv": "https://arxiv.org",
    "semantic_scholar": "https://api.semanticscholar.org",
    "crossref": "https://api.crossref.org",
}
# ── Target distribution — 1000 papers total ─────────────────────────────────
# Number of papers to collect per category; the five counts sum to 1000.
TARGET_DISTRIBUTION = {
    "CONFIRMED_FRAUD": 300,  # from RetractionWatch
    "SUSPECTED_FRAUD": 200,  # from PubPeer
    "BORDERLINE": 150,       # gray area
    "CLEAN": 250,            # normal good papers
    "BASELINE_ELITE": 100,   # Nobel / landmark
}
@dataclass
class PaperRecord:
    """
    One row in SciPeerBench.

    Identity and ground-truth fields have no default and must be supplied
    when the record is constructed (pass ``None`` explicitly for the
    ``Optional`` ones). The 14 module scores, the overall risk score, the
    paper content and the metadata fields all have defaults, so a record
    can be created before any scoring has been run.

    The ``@dataclass`` decorator generates ``__init__``, ``__repr__`` and
    ``__eq__`` from the annotated fields, in declaration order.
    """

    # ── Identity ─────────────────────────────────────────────────
    paper_id: str                    # SPB-0001, SPB-0002 ...
    doi: Optional[str]
    title: str
    authors: str                     # comma separated
    year: int
    journal: str
    source_db: str                   # where we got it

    # ── Ground truth ─────────────────────────────────────────────
    category: str                    # from PAPER_CATEGORIES
    is_fraud: int                    # 1 = fraud, 0 = clean
    fraud_confidence: float          # 0.0 to 1.0
    fraud_types: str                 # comma separated FRAUD_TAXONOMY keys
    retraction_date: Optional[str]   # YYYY-MM-DD
    retraction_reason: Optional[str]
    pubpeer_url: Optional[str]

    # ── 14 module scores — auto-generated by running our system ──
    stat_audit_score: float = 0.0
    figure_forensics_score: float = 0.0
    methodology_score: float = 0.0
    citation_score: float = 0.0
    reproducibility_score: float = 0.0
    novelty_score: float = 0.0
    grim_score: float = 0.0
    sprite_score: float = 0.0
    granularity_score: float = 0.0
    pcurve_score: float = 0.0
    effect_size_score: float = 0.0
    retraction_score: float = 0.0
    cartel_score: float = 0.0
    llm_score: float = 0.0

    # ── Weighted average of all 14 scores ────────────────────────
    overall_risk_score: float = 0.0

    # ── Paper content ────────────────────────────────────────────
    abstract_text: str = ""
    full_text_path: str = ""         # path to saved full text

    # ── Metadata ─────────────────────────────────────────────────
    field_of_study: str = ""         # biology, psychology, medicine...
    labeling_method: str = ""        # auto, manual, auto+manual
    labeled_by: str = "auto"
    notes: str = ""
# ── CSV column order — exact order in output file ───────────────────────────
# One entry per column, grouped as: identity, ground truth, the 14 module
# scores, overall risk score, paper content, metadata.
CSV_COLUMNS = [
    # Identity
    "paper_id", "doi", "title", "authors", "year",
    "journal", "source_db",
    # Ground truth
    "category", "is_fraud", "fraud_confidence",
    "fraud_types", "retraction_date", "retraction_reason", "pubpeer_url",
    # Module scores
    "stat_audit_score", "figure_forensics_score", "methodology_score",
    "citation_score", "reproducibility_score", "novelty_score",
    "grim_score", "sprite_score", "granularity_score", "pcurve_score",
    "effect_size_score", "retraction_score", "cartel_score", "llm_score",
    # Overall score
    "overall_risk_score",
    # Paper content
    "abstract_text", "full_text_path",
    # Metadata
    "field_of_study", "labeling_method", "labeled_by", "notes",
]