# Source: Hugging Face Space "Abu-Sameer-66/SciPeerAI-API"
# File:   data/scipeerai_bench/schema.py (6.39 kB)
# Commit: b625b53 by Abu-Sameer-66 — "fix: add requests dependency — v2.3.0 hotfix"

	"""
	SciPeerBench — Dataset schema definition.

	World's first multi-dimensional scientific fraud benchmark.
	Every paper is labeled across 14 fraud dimensions simultaneously.
	No other dataset does this.
	"""

	from dataclasses import dataclass, field
	from typing import Optional


	# ── Fraud taxonomy — 20 types, first time formally defined ───────────────────

	FRAUD_TAXONOMY = {
	    # FAB-* : data fabrication
	    "FAB-01": "Complete data fabrication",
	    "FAB-02": "Partial data fabrication",
	    "FAB-03": "Data duplication across papers",

	    # STAT-* : statistical manipulation
	    "STAT-01": "P-hacking and selective reporting",
	    "STAT-02": "HARKing — hypothesizing after results known",
	    "STAT-03": "Impossible statistical values GRIM or SPRITE",
	    "STAT-04": "Inflated effect sizes",
	    "STAT-05": "Underpowered study with strong claims",

	    # FIG-* : figure fraud
	    "FIG-01": "Duplicated figure panels",
	    "FIG-02": "Manipulated western blots",
	    "FIG-03": "Image brightness or contrast manipulation",

	    # CIT-* : citation fraud
	    "CIT-01": "Excessive self-citation ring",
	    "CIT-02": "Citation cartel coordinated group",
	    "CIT-03": "Unsupported claims without citation",

	    # METH-* : methodology fraud
	    "METH-01": "Causation claimed without RCT",
	    "METH-02": "Missing control group",
	    "METH-03": "Undisclosed conflicts of interest",

	    # AUTH-* : authorship and integrity
	    "AUTH-01": "LLM-generated paper",
	    "AUTH-02": "Plagiarism detected",
	    "AUTH-03": "Retracted paper cited as valid",
	}


	# ── Paper categories for balanced dataset ────────────────────────────────────

	# Category key -> human-readable description of what qualifies a paper for it.
	PAPER_CATEGORIES = {
	    "CONFIRMED_FRAUD": "Retracted with documented fraud reason",
	    "SUSPECTED_FRAUD": "PubPeer flagged, not retracted yet",
	    "BORDERLINE":      "Minor issues, not clear fraud",
	    "CLEAN":           "High quality, replicated, no concerns",
	    "BASELINE_ELITE":  "Nobel prize or landmark papers",
	}


	# ── Source databases ──────────────────────────────────────────────────────────

	# Short source name -> base URL of the corresponding site or API.
	DATA_SOURCES = {
	    "retraction_watch": "https://retractionwatch.com",
	    "pubpeer":          "https://pubpeer.com",
	    "pubmed":           "https://pubmed.ncbi.nlm.nih.gov",
	    "arxiv":            "https://arxiv.org",
	    "semantic_scholar": "https://api.semanticscholar.org",
	    "crossref":         "https://api.crossref.org",
	}


	# ── Target distribution — 1000 papers total ──────────────────────────────────

	# Number of papers to collect per category; the five quotas add up to 1000.
	TARGET_DISTRIBUTION = {
	    # sourced from RetractionWatch
	    "CONFIRMED_FRAUD": 300,
	    # sourced from PubPeer
	    "SUSPECTED_FRAUD": 200,
	    # gray-area papers
	    "BORDERLINE": 150,
	    # ordinary good papers
	    "CLEAN": 250,
	    # Nobel / landmark papers
	    "BASELINE_ELITE": 100,
	}


	@dataclass
	class PaperRecord:
	    """One row of the SciPeerBench dataset.

	    The first fourteen fields (identity and ground truth) have no default
	    and must be supplied when a record is created; the ``Optional`` ones
	    may be ``None`` but still have to be passed explicitly.  Every later
	    field has a default, so a record can be created before any scores,
	    paper content or labeling metadata are available.

	    This class is a plain data container: it performs no validation of
	    the documented value ranges or formats noted on the fields below.
	    """

	    # ── Identity ──────────────────────────────────────────────────
	    paper_id: str                     # benchmark id, e.g. SPB-0001, SPB-0002 ...
	    doi: Optional[str]
	    title: str
	    authors: str                      # comma-separated list in one string
	    year: int
	    journal: str
	    source_db: str                    # database the record was obtained from

	    # ── Ground truth ──────────────────────────────────────────────
	    category: str                     # a PAPER_CATEGORIES key
	    is_fraud: int                     # 1 = fraud, 0 = clean
	    fraud_confidence: float           # 0.0 to 1.0
	    fraud_types: str                  # comma-separated FRAUD_TAXONOMY keys
	    retraction_date: Optional[str]    # YYYY-MM-DD
	    retraction_reason: Optional[str]
	    pubpeer_url: Optional[str]

	    # ── 14 module scores ──────────────────────────────────────────
	    # Per the original author's note these are produced by running the
	    # analysis system; each one stays at 0.0 until it is filled in.
	    stat_audit_score: float = 0.0
	    figure_forensics_score: float = 0.0
	    methodology_score: float = 0.0
	    citation_score: float = 0.0
	    reproducibility_score: float = 0.0
	    novelty_score: float = 0.0
	    grim_score: float = 0.0
	    sprite_score: float = 0.0
	    granularity_score: float = 0.0
	    pcurve_score: float = 0.0
	    effect_size_score: float = 0.0
	    retraction_score: float = 0.0
	    cartel_score: float = 0.0
	    llm_score: float = 0.0

	    # ── Aggregate ─────────────────────────────────────────────────
	    # Documented as the weighted average of the 14 scores above; it is
	    # stored, not computed, by this class.
	    overall_risk_score: float = 0.0

	    # ── Paper content ─────────────────────────────────────────────
	    abstract_text: str = ""
	    full_text_path: str = ""          # path to the saved full text

	    # ── Metadata ──────────────────────────────────────────────────
	    field_of_study: str = ""          # biology, psychology, medicine...
	    labeling_method: str = ""         # auto, manual, auto+manual
	    labeled_by: str = "auto"
	    notes: str = ""


	# ── CSV column order — exact order in output file ────────────────────────────

	CSV_COLUMNS = [
	    # identity
	    "paper_id",
	    "doi",
	    "title",
	    "authors",
	    "year",
	    "journal",
	    "source_db",
	    # ground truth
	    "category",
	    "is_fraud",
	    "fraud_confidence",
	    "fraud_types",
	    "retraction_date",
	    "retraction_reason",
	    "pubpeer_url",
	    # module scores
	    "stat_audit_score",
	    "figure_forensics_score",
	    "methodology_score",
	    "citation_score",
	    "reproducibility_score",
	    "novelty_score",
	    "grim_score",
	    "sprite_score",
	    "granularity_score",
	    "pcurve_score",
	    "effect_size_score",
	    "retraction_score",
	    "cartel_score",
	    "llm_score",
	    # aggregate
	    "overall_risk_score",
	    # paper content
	    "abstract_text",
	    "full_text_path",
	    # metadata
	    "field_of_study",
	    "labeling_method",
	    "labeled_by",
	    "notes",
	]