Spaces:

Abu-Sameer-66
/

SciPeerAI-API

Running

SciPeerAI-API / src /scipeerai /modules /reproducibility_scanner.py

Abu-Sameer-66

fix: add requests dependency — v2.3.0 hotfix

b625b53 about 16 hours ago

15.4 kB

	# src/scipeerai/modules/reproducibility_scanner.py
	#
	# Reproducibility Scanner
	# -----------------------
	# The reproducibility crisis exists largely because
	# researchers cannot access the code, data, and exact
	# methods used in published papers.
	#
	# This module scans paper text for reproducibility
	# signals — what is present and what is critically
	# missing for independent replication.

	import re
	from dataclasses import dataclass, field


	# ── data structures ───────────────────────────────────────────

	@dataclass
	class ReproducibilityFlag:
	    """One reproducibility problem reported by the scanner.

	    Every field is a plain string; the scanner fills them with
	    human-readable text meant to be shown to the paper's reviewer.
	    """

	    # machine-friendly identifier, e.g. "missing_code_availability"
	    flag_type: str
	    # seriousness label; the scanner emits "high" or "medium"
	    severity: str
	    # what the problem is and why it matters for replication
	    description: str
	    # short note on what in the text triggered the flag
	    evidence: str
	    # concrete action the authors can take to resolve it
	    suggestion: str


	@dataclass
	class ReproducibilityResult:
	    """Outcome of a single reproducibility scan.

	    Holds one boolean per checklist item, followed by the aggregate
	    score, the list of raised flags, and a readable verdict.
	    """

	    # ── what was found ──
	    has_code_link: bool
	    has_data_link: bool
	    has_software_versions: bool
	    has_statistical_software: bool
	    has_preregistration: bool
	    has_ethics_statement: bool
	    has_conflict_statement: bool
	    has_sample_size_justification: bool

	    # ── scoring ──
	    # 0.0 = not reproducible, 1.0 = fully
	    reproducibility_score: float
	    # flag objects describing what is missing
	    flags: list
	    # textual risk bucket derived from the score
	    risk_level: str
	    # one-paragraph human-readable recap
	    summary: str


	# ── main class ────────────────────────────────────────────────

	class ReproducibilityScanner:
	    """
	    Scans paper text for reproducibility indicators.

	    Two layers:
	    1. Presence checks — what good papers SHOULD have
	    2. Absence flags   — what is missing and how serious

	    Scoring is inverted from other modules:
	    HIGH reproducibility score = LOW risk.
	    We report both for clarity.

	    All detection is heuristic keyword / regex matching and is
	    case-insensitive.
	    """

	    # code/data sharing signals
	    CODE_PATTERNS = [
	        r'github\.com/\S+',
	        r'gitlab\.com/\S+',
	        r'bitbucket\.org/\S+',
	        r'code.available.at',
	        r'code.provided.at',
	        r'source code.*available',
	        r'scripts.*available',
	        r'zenodo\.org/\S+',
	        r'osf\.io/\S+',
	        r'code ocean',
	        r'figshare\.com/\S+',
	    ]

	    DATA_PATTERNS = [
	        r'data.available.at',
	        r'dataset.*available',
	        r'data.*deposited',
	        r'data.*repository',
	        r'data.*doi',
	        r'supplementary data',
	        r'data.*provided',
	        r'open data',
	        r'zenodo\.org/\S+',
	        r'osf\.io/\S+',
	        r'dryad',
	        r'figshare',
	        r'harvard dataverse',
	        r'data.upon.request',        # weaker — noted separately
	    ]

	    # Software name followed by a version number. The leading \b keeps
	    # ordinary prose such as "newer version 2" from matching the
	    # "r version <digit>" pattern.
	    SOFTWARE_PATTERNS = [
	        r'\br\s+version\s+\d',
	        r'\bpython\s+\d+\.\d+',
	        r'\bspss\s+version',
	        r'\bstata\s+\d+',
	        r'\bmatlab\s+r\d+',
	        r'\bsas\s+version',
	        r'\bscipy\s+\d',
	        r'\bnumpy\s+\d',
	        r'\bsklearn\s+\d',
	        r'\bscikit-learn\s+\d',
	        r'\btensorflow\s+\d',
	        r'\bpytorch\s+\d',
	    ]

	    # Matched as whole words/phrases, see _has_statistical_software.
	    STAT_SOFTWARE = [
	        'r software', 'rstudio', 'spss', 'stata',
	        'sas', 'matlab', 'python', 'excel', 'graphpad'
	    ]

	    PREREG_PATTERNS = [
	        r'pre.?registered',
	        r'preregistered',
	        r'clinicaltrials\.gov',
	        r'osf\.io',
	        r'aspredicted\.org',
	        r'registered report',
	        r'trial registration',
	        r'isrctn',
	        r'anzctr',
	    ]

	    # "Available on request" wording. The alternatives are joined with a
	    # plain "|": in Python's re syntax an escaped "\|" is a literal pipe
	    # character, which made the previous pattern unmatchable.
	    _DATA_ON_REQUEST_RE = re.compile(
	        r'data.available.upon.*request|'
	        r'data.available.on.*request|'
	        r'available.from.corresponding.*author',
	        re.IGNORECASE,
	    )

	    def __init__(self):
	        """Pre-compile every pattern list once per scanner instance."""
	        self._code_re = self._compile_all(self.CODE_PATTERNS)
	        self._data_re = self._compile_all(self.DATA_PATTERNS)
	        self._sw_re = self._compile_all(self.SOFTWARE_PATTERNS)
	        self._prereg_re = self._compile_all(self.PREREG_PATTERNS)

	    @staticmethod
	    def _compile_all(patterns) -> list:
	        """Compile each pattern string case-insensitively."""
	        return [re.compile(p, re.IGNORECASE) for p in patterns]

	    @staticmethod
	    def _mentions_any(text: str, keywords) -> bool:
	        """True if any keyword occurs in ``text`` as a plain substring."""
	        return any(word in text for word in keywords)

	    # ── public method ─────────────────────────────────────────

	    def analyze(self, text: str) -> "ReproducibilityResult":
	        """
	        Full reproducibility scan.

	        Args:
	            text: the paper text to inspect.

	        Returns:
	            A ReproducibilityResult with one boolean per checklist item,
	            the fraction of items present (rounded to 3 decimals), the
	            flags raised for missing items, the risk level and a summary.
	        """
	        lowered = text.lower()

	        # presence checks
	        has_code = self._check_patterns(text, self._code_re)
	        has_data = self._check_patterns(text, self._data_re)
	        has_sw_version = self._check_patterns(text, self._sw_re)
	        has_stat_sw = self._has_statistical_software(lowered)
	        has_prereg = self._check_patterns(text, self._prereg_re)
	        has_ethics = self._has_ethics_statement(lowered)
	        has_conflict = self._has_conflict_statement(lowered)
	        has_n_justify = self._has_sample_size_justification(lowered)

	        # build flags for what is missing (order is part of the output)
	        flags = [
	            *self._flag_missing_code(has_code, lowered),
	            *self._flag_missing_data(has_data, lowered),
	            *self._flag_missing_software(has_sw_version, has_stat_sw, lowered),
	            *self._flag_missing_prereg(has_prereg, lowered),
	            *self._flag_missing_ethics(has_ethics, lowered),
	            *self._flag_data_on_request(text),
	        ]

	        # reproducibility score: fraction of key items present
	        checklist = [
	            has_code, has_data, has_sw_version,
	            has_stat_sw, has_prereg, has_ethics,
	            has_conflict, has_n_justify,
	        ]
	        repro_score = sum(checklist) / len(checklist)

	        # risk is inverse of reproducibility
	        risk_score = round(1.0 - repro_score, 3)
	        risk_level = self._get_risk_level(risk_score)

	        return ReproducibilityResult(
	            has_code_link=has_code,
	            has_data_link=has_data,
	            has_software_versions=has_sw_version,
	            has_statistical_software=has_stat_sw,
	            has_preregistration=has_prereg,
	            has_ethics_statement=has_ethics,
	            has_conflict_statement=has_conflict,
	            has_sample_size_justification=has_n_justify,
	            reproducibility_score=round(repro_score, 3),
	            flags=flags,
	            risk_level=risk_level,
	            summary=self._write_summary(
	                repro_score, risk_level, flags,
	                has_code, has_data
	            ),
	        )

	    # ── presence detectors ────────────────────────────────────

	    def _check_patterns(self, text: str, patterns: list) -> bool:
	        """True if any compiled pattern matches somewhere in ``text``."""
	        return any(p.search(text) for p in patterns)

	    def _has_statistical_software(self, text: str) -> bool:
	        """
	        True if a STAT_SOFTWARE name appears as a whole word or phrase.

	        Lookarounds are used instead of a bare substring test so that
	        "our software", "excellent" or "disaster" are not mistaken for
	        R, Excel or SAS.
	        """
	        return any(
	            re.search(
	                r'(?<!\w)' + re.escape(name) + r'(?!\w)',
	                text, re.IGNORECASE,
	            ) is not None
	            for name in self.STAT_SOFTWARE
	        )

	    def _has_ethics_statement(self, text: str) -> bool:
	        """True if lower-cased ``text`` contains an ethics-approval marker."""
	        return self._mentions_any(text, (
	            'ethics committee', 'institutional review board',
	            'irb approval', 'ethics approval', 'ethical approval',
	            'helsinki declaration', 'informed consent',
	            'ethical clearance', 'ethics board',
	        ))

	    def _has_conflict_statement(self, text: str) -> bool:
	        """True if lower-cased ``text`` contains a disclosure marker."""
	        return self._mentions_any(text, (
	            'conflict of interest', 'competing interest',
	            'no conflict', 'declare no', 'disclose',
	            'funding source', 'financial disclosure',
	        ))

	    def _has_sample_size_justification(self, text: str) -> bool:
	        """True if lower-cased ``text`` mentions a power/sample-size analysis."""
	        return self._mentions_any(text, (
	            'power analysis', 'sample size calculation',
	            'power calculation', 'statistical power',
	            'a priori power', 'effect size calculation',
	            'g*power', 'gpower',
	        ))

	    # ── flag generators ───────────────────────────────────────
	    # Each generator returns a list holding zero or one flag. The
	    # "kind of paper" keyword checks are plain substring tests on the
	    # lower-cased text (so 'model' also matches 'modelling').

	    def _flag_missing_code(self, has_code: bool, text: str) -> list:
	        """
	        Code absence is critical for computational papers.
	        We detect if the paper is computational first.
	        """
	        is_computational = self._mentions_any(text, (
	            'algorithm', 'code', 'software', 'script',
	            'simulation', 'model', 'neural network',
	            'machine learning', 'deep learning',
	        ))
	        if has_code or not is_computational:
	            return []

	        return [ReproducibilityFlag(
	            flag_type="missing_code_availability",
	            severity="high",
	            description=(
	                "Computational study does not provide a link to "
	                "source code or analysis scripts. Independent "
	                "replication is not possible without this."
	            ),
	            evidence="Computational methods detected — no code link found",
	            suggestion=(
	                "Deposit code on GitHub/GitLab/Zenodo and include "
	                "the URL in a 'Code Availability' section."
	            ),
	        )]

	    def _flag_missing_data(self, has_data: bool, text: str) -> list:
	        """Flag empirical papers that give no data availability signal."""
	        has_empirical = self._mentions_any(text, (
	            'dataset', 'data', 'sample', 'participants',
	            'measurements', 'observations', 'collected',
	        ))
	        if has_data or not has_empirical:
	            return []

	        return [ReproducibilityFlag(
	            flag_type="missing_data_availability",
	            severity="high",
	            description=(
	                "Empirical study does not specify where raw data "
	                "can be accessed. Results cannot be independently verified."
	            ),
	            evidence="Empirical data detected — no data availability statement found",
	            suggestion=(
	                "Deposit raw data in a repository (OSF, Zenodo, Dryad, "
	                "Harvard Dataverse) and include a Data Availability statement."
	            ),
	        )]

	    def _flag_missing_software(
	        self, has_versions: bool, has_sw: bool, text: str
	    ) -> list:
	        """
	        Flag quantitative papers that report no software version.

	        ``has_sw`` is accepted for interface compatibility; only
	        ``has_versions`` decides whether the flag is raised.
	        """
	        is_quantitative = self._mentions_any(text, (
	            'statistical', 'analysis', 'test', 'regression',
	            'anova', 'correlation', 't-test', 'chi-square',
	        ))
	        if has_versions or not is_quantitative:
	            return []

	        return [ReproducibilityFlag(
	            flag_type="missing_software_versions",
	            severity="medium",
	            description=(
	                "Statistical analysis performed but software name and "
	                "version number not reported. Results may not replicate "
	                "across different software versions."
	            ),
	            evidence="Statistical analysis detected — no software version found",
	            suggestion=(
	                "Specify the exact software and version used "
	                "(e.g., 'R version 4.3.1', 'Python 3.10.12 with "
	                "scikit-learn 1.3.0')."
	            ),
	        )]

	    def _flag_missing_prereg(self, has_prereg: bool, text: str) -> list:
	        """Flag experimental/clinical papers with no preregistration signal."""
	        is_clinical_or_experimental = self._mentions_any(text, (
	            'clinical trial', 'randomized', 'experiment',
	            'intervention', 'treatment', 'placebo',
	            'hypothesis', 'we predicted', 'we hypothesized',
	        ))
	        if has_prereg or not is_clinical_or_experimental:
	            return []

	        return [ReproducibilityFlag(
	            flag_type="missing_preregistration",
	            severity="medium",
	            description=(
	                "Experimental or clinical study with no preregistration "
	                "detected. Without preregistration, it is difficult to "
	                "distinguish confirmatory from exploratory analyses."
	            ),
	            evidence="Experimental design detected — no preregistration link",
	            suggestion=(
	                "For future studies, preregister hypotheses on OSF "
	                "(osf.io) or ClinicalTrials.gov before data collection."
	            ),
	        )]

	    def _flag_missing_ethics(self, has_ethics: bool, text: str) -> list:
	        """Flag human-participant papers with no ethics statement."""
	        involves_humans = self._mentions_any(text, (
	            'participants', 'subjects', 'patients', 'volunteers',
	            'respondents', 'human', 'children', 'adults',
	        ))
	        if has_ethics or not involves_humans:
	            return []

	        return [ReproducibilityFlag(
	            flag_type="missing_ethics_statement",
	            severity="high",
	            description=(
	                "Human participants study with no ethics approval "
	                "or IRB statement detected. This is required by "
	                "most journals and funding bodies."
	            ),
	            evidence="Human participants detected — no ethics statement found",
	            suggestion=(
	                "Include an Ethics Statement specifying the approving "
	                "body, protocol number, and that informed consent was obtained."
	            ),
	        )]

	    def _flag_data_on_request(self, text: str) -> list:
	        """
	        'Data available upon request' is widely considered
	        a reproducibility red flag — studies show that
	        most such requests are never fulfilled.
	        """
	        if self._DATA_ON_REQUEST_RE.search(text) is None:
	            return []

	        return [ReproducibilityFlag(
	            flag_type="data_available_on_request",
	            severity="medium",
	            description=(
	                "'Data available upon request' is a reproducibility "
	                "risk. Research shows that over 80% of such requests "
	                "go unfulfilled or receive no response."
	            ),
	            evidence="'Data available upon request' language detected",
	            suggestion=(
	                "Deposit data in a public repository instead. "
	                "This increases citation rates and research trust."
	            ),
	        )]

	    # ── scoring ───────────────────────────────────────────────

	    def _get_risk_level(self, risk_score: float) -> str:
	        """Map a 0..1 risk score onto critical / high / medium / low."""
	        if risk_score >= 0.7:
	            return "critical"
	        if risk_score >= 0.4:
	            return "high"
	        if risk_score >= 0.2:
	            return "medium"
	        return "low"

	    def _write_summary(
	        self,
	        repro_score: float,
	        risk_level: str,
	        flags: list,
	        has_code: bool,
	        has_data: bool,
	    ) -> str:
	        """
	        Build the one-line human-readable summary.

	        ``has_code`` and ``has_data`` are kept for interface
	        compatibility; the text is derived from the score and flags.
	        """
	        pct = round(repro_score * 100)

	        if not flags:
	            if repro_score >= 1.0:
	                return (
	                    f"Reproducibility score: {pct}%. "
	                    f"All key reproducibility indicators detected."
	                )
	            # No flag fired (flags depend on the detected kind of paper),
	            # yet some checklist items are absent, so do not claim that
	            # everything was found.
	            return (
	                f"Reproducibility score: {pct}%. "
	                f"No specific gaps flagged, but not all reproducibility "
	                f"indicators were detected. "
	                f"Risk level: {risk_level.upper()}."
	            )

	        high = sum(1 for f in flags if f.severity == "high")
	        med = sum(1 for f in flags if f.severity == "medium")

	        parts = []
	        if high:
	            parts.append(f"{high} critical gap{'s' if high > 1 else ''}")
	        if med:
	            parts.append(f"{med} concern{'s' if med > 1 else ''}")

	        return (
	            f"Reproducibility score: {pct}%. "
	            f"Flagged {', '.join(parts)}. "
	            f"Risk level: {risk_level.upper()}."
	        )