# Spaces: Abu-Sameer-66 / SciPeerAI-API (Running)
# File size: 15,446 Bytes
# b5c2bb1

# src/scipeerai/modules/reproducibility_scanner.py
#
# Reproducibility Scanner
# -----------------------
# The reproducibility crisis exists largely because
# researchers cannot access the code, data, and exact
# methods used in published papers.
#
# This module scans paper text for reproducibility
# signals — what is present and what is critically
# missing for independent replication.

import re
from dataclasses import dataclass, field


# ── data structures ───────────────────────────────────────────

@dataclass
class ReproducibilityFlag:
    """
    One reproducibility concern raised by the scanner.

    Instances are created by the ``_flag_*`` methods of
    ReproducibilityScanner and collected in ReproducibilityResult.flags.
    Field order is part of the positional constructor signature.
    """
    flag_type: str      # machine-readable identifier, e.g. "missing_code_availability"
    severity: str       # the scanner in this file emits "high" or "medium"
    description: str    # human-readable explanation of the problem
    evidence: str       # short note on what triggered the flag
    suggestion: str     # recommended remedy for the authors


@dataclass
class ReproducibilityResult:
    """
    Outcome of one ReproducibilityScanner.analyze() call.

    Holds the eight presence indicators that make up the score,
    followed by the derived score, the raised flags, the risk level
    and a one-paragraph summary.
    """
    # what was found — each boolean is one item of the scoring checklist
    has_code_link: bool
    has_data_link: bool
    has_software_versions: bool
    has_statistical_software: bool
    has_preregistration: bool
    has_ethics_statement: bool
    has_conflict_statement: bool
    has_sample_size_justification: bool

    # scoring
    # fraction of the eight indicators above that are True, rounded to
    # three decimals: 0.0 = not reproducible, 1.0 = fully
    reproducibility_score: float
    flags: list        # ReproducibilityFlag instances; empty when nothing was flagged
    risk_level: str    # "critical", "high", "medium" or "low"
    summary: str       # human-readable text built by _write_summary


# ── main class ────────────────────────────────────────────────

class ReproducibilityScanner:
    """
    Scans paper text for reproducibility indicators.

    Two layers:
    1. Presence checks — what good papers SHOULD have
    2. Absence flags — what is missing and how serious

    Scoring is inverted from other modules:
    HIGH reproducibility score = LOW risk.
    We report both for clarity.
    """

    # code/data sharing signals
    CODE_PATTERNS = [
        r'github\.com/\S+',
        r'gitlab\.com/\S+',
        r'bitbucket\.org/\S+',
        r'code.*available.*at',
        r'code.*provided.*at',
        r'source code.*available',
        r'scripts.*available',
        r'zenodo\.org/\S+',
        r'osf\.io/\S+',
        r'code ocean',
        r'figshare\.com/\S+',
    ]

    DATA_PATTERNS = [
        r'data.*available.*at',
        r'dataset.*available',
        r'data.*deposited',
        r'data.*repository',
        r'data.*doi',
        r'supplementary data',
        r'data.*provided',
        r'open data',
        r'zenodo\.org/\S+',
        r'osf\.io/\S+',
        r'dryad',
        r'figshare',
        r'harvard dataverse',
        r'data.*upon.*request',  # weaker — noted separately
    ]

    SOFTWARE_PATTERNS = [
        r'r\s+version\s+\d',
        r'python\s+\d+\.\d+',
        r'spss\s+version',
        r'stata\s+\d+',
        r'matlab\s+r\d+',
        r'sas\s+version',
        r'scipy\s+\d',
        r'numpy\s+\d',
        r'sklearn\s+\d',
        r'tensorflow\s+\d',
        r'pytorch\s+\d',
    ]

    STAT_SOFTWARE = [
        'r software', 'rstudio', 'spss', 'stata',
        'sas', 'matlab', 'python', 'excel', 'graphpad'
    ]

    PREREG_PATTERNS = [
        r'pre.?registered',
        r'preregistered',
        r'clinicaltrials\.gov',
        r'osf\.io',
        r'aspredicted\.org',
        r'registered report',
        r'trial registration',
        r'isrctn',
        r'anzctr',
    ]

    def __init__(self):
        self._code_re    = [re.compile(p, re.IGNORECASE) for p in self.CODE_PATTERNS]
        self._data_re    = [re.compile(p, re.IGNORECASE) for p in self.DATA_PATTERNS]
        self._sw_re      = [re.compile(p, re.IGNORECASE) for p in self.SOFTWARE_PATTERNS]
        self._prereg_re  = [re.compile(p, re.IGNORECASE) for p in self.PREREG_PATTERNS]

    # ── public method ─────────────────────────────────────────

    def analyze(self, text: str) -> ReproducibilityResult:
        """
        Full reproducibility scan.
        Returns what is present, what is missing, and risk level.

        The reproducibility score is the fraction of the eight checklist
        items detected in ``text``. Risk is ``1 - score``, mapped to a
        level by ``_get_risk_level``.
        """
        # the plain-substring helpers use lowercase markers, so they
        # receive a lowercased copy; the compiled regexes ignore case
        t = text.lower()

        # presence checks
        has_code       = self._check_patterns(text, self._code_re)
        has_data       = self._check_patterns(text, self._data_re)
        has_sw_version = self._check_patterns(text, self._sw_re)
        # Software names must stand alone: a bare substring test reports
        # 'sas' inside "disaster", 'excel' inside "excellent" and
        # 'r software' inside "other software". A trailing digit is
        # still accepted so "python3" or "spss26" keep counting.
        has_stat_sw    = any(
            re.search(rf'(?<![a-z0-9]){re.escape(sw)}(?![a-z])', t)
            for sw in self.STAT_SOFTWARE
        )
        has_prereg     = self._check_patterns(text, self._prereg_re)
        has_ethics     = self._has_ethics_statement(t)
        has_conflict   = self._has_conflict_statement(t)
        has_n_justify  = self._has_sample_size_justification(t)

        # build flags for what is missing
        flags = []
        flags.extend(self._flag_missing_code(has_code, t))
        flags.extend(self._flag_missing_data(has_data, t))
        flags.extend(self._flag_missing_software(has_sw_version, has_stat_sw, t))
        flags.extend(self._flag_missing_prereg(has_prereg, t))
        flags.extend(self._flag_missing_ethics(has_ethics, t))
        flags.extend(self._flag_data_on_request(text))

        # reproducibility score: percentage of key items present
        checklist = [
            has_code, has_data, has_sw_version,
            has_stat_sw, has_prereg, has_ethics,
            has_conflict, has_n_justify
        ]
        repro_score = sum(checklist) / len(checklist)

        # risk is inverse of reproducibility
        risk_score = round(1.0 - repro_score, 3)
        risk_level = self._get_risk_level(risk_score)

        return ReproducibilityResult(
            has_code_link=has_code,
            has_data_link=has_data,
            has_software_versions=has_sw_version,
            has_statistical_software=has_stat_sw,
            has_preregistration=has_prereg,
            has_ethics_statement=has_ethics,
            has_conflict_statement=has_conflict,
            has_sample_size_justification=has_n_justify,
            reproducibility_score=round(repro_score, 3),
            flags=flags,
            risk_level=risk_level,
            summary=self._write_summary(
                repro_score, risk_level, flags,
                has_code, has_data
            ),
        )

    # ── presence detectors ────────────────────────────────────

    def _check_patterns(self, text: str, patterns: list) -> bool:
        return any(p.search(text) for p in patterns)

    def _has_ethics_statement(self, text: str) -> bool:
        markers = [
            'ethics committee', 'institutional review board',
            'irb approval', 'ethics approval', 'ethical approval',
            'helsinki declaration', 'informed consent',
            'ethical clearance', 'ethics board'
        ]
        return any(m in text for m in markers)

    def _has_conflict_statement(self, text: str) -> bool:
        markers = [
            'conflict of interest', 'competing interest',
            'no conflict', 'declare no', 'disclose',
            'funding source', 'financial disclosure'
        ]
        return any(m in text for m in markers)

    def _has_sample_size_justification(self, text: str) -> bool:
        markers = [
            'power analysis', 'sample size calculation',
            'power calculation', 'statistical power',
            'a priori power', 'effect size calculation',
            'g*power', 'gpower'
        ]
        return any(m in text for m in markers)

    # ── flag generators ───────────────────────────────────────

    def _flag_missing_code(self, has_code: bool, text: str) -> list:
        """
        Code absence is critical for computational papers.
        We detect if the paper is computational first.
        """
        flags = []
        is_computational = any(w in text for w in [
            'algorithm', 'code', 'software', 'script',
            'simulation', 'model', 'neural network',
            'machine learning', 'deep learning'
        ])

        if is_computational and not has_code:
            flags.append(ReproducibilityFlag(
                flag_type="missing_code_availability",
                severity="high",
                description=(
                    "Computational study does not provide a link to "
                    "source code or analysis scripts. Independent "
                    "replication is not possible without this."
                ),
                evidence="Computational methods detected — no code link found",
                suggestion=(
                    "Deposit code on GitHub/GitLab/Zenodo and include "
                    "the URL in a 'Code Availability' section."
                ),
            ))
        return flags

    def _flag_missing_data(self, has_data: bool, text: str) -> list:
        flags = []
        has_empirical = any(w in text for w in [
            'dataset', 'data', 'sample', 'participants',
            'measurements', 'observations', 'collected'
        ])

        if has_empirical and not has_data:
            flags.append(ReproducibilityFlag(
                flag_type="missing_data_availability",
                severity="high",
                description=(
                    "Empirical study does not specify where raw data "
                    "can be accessed. Results cannot be independently verified."
                ),
                evidence="Empirical data detected — no data availability statement found",
                suggestion=(
                    "Deposit raw data in a repository (OSF, Zenodo, Dryad, "
                    "Harvard Dataverse) and include a Data Availability statement."
                ),
            ))
        return flags

    def _flag_missing_software(
        self, has_versions: bool, has_sw: bool, text: str
    ) -> list:
        flags = []
        is_quantitative = any(w in text for w in [
            'statistical', 'analysis', 'test', 'regression',
            'anova', 'correlation', 't-test', 'chi-square'
        ])

        if is_quantitative and not has_versions:
            flags.append(ReproducibilityFlag(
                flag_type="missing_software_versions",
                severity="medium",
                description=(
                    "Statistical analysis performed but software name and "
                    "version number not reported. Results may not replicate "
                    "across different software versions."
                ),
                evidence="Statistical analysis detected — no software version found",
                suggestion=(
                    "Specify the exact software and version used "
                    "(e.g., 'R version 4.3.1', 'Python 3.10.12 with "
                    "scikit-learn 1.3.0')."
                ),
            ))
        return flags

    def _flag_missing_prereg(self, has_prereg: bool, text: str) -> list:
        flags = []
        is_clinical_or_experimental = any(w in text for w in [
            'clinical trial', 'randomized', 'experiment',
            'intervention', 'treatment', 'placebo',
            'hypothesis', 'we predicted', 'we hypothesized'
        ])

        if is_clinical_or_experimental and not has_prereg:
            flags.append(ReproducibilityFlag(
                flag_type="missing_preregistration",
                severity="medium",
                description=(
                    "Experimental or clinical study with no preregistration "
                    "detected. Without preregistration, it is difficult to "
                    "distinguish confirmatory from exploratory analyses."
                ),
                evidence="Experimental design detected — no preregistration link",
                suggestion=(
                    "For future studies, preregister hypotheses on OSF "
                    "(osf.io) or ClinicalTrials.gov before data collection."
                ),
            ))
        return flags

    def _flag_missing_ethics(self, has_ethics: bool, text: str) -> list:
        flags = []
        involves_humans = any(w in text for w in [
            'participants', 'subjects', 'patients', 'volunteers',
            'respondents', 'human', 'children', 'adults'
        ])

        if involves_humans and not has_ethics:
            flags.append(ReproducibilityFlag(
                flag_type="missing_ethics_statement",
                severity="high",
                description=(
                    "Human participants study with no ethics approval "
                    "or IRB statement detected. This is required by "
                    "most journals and funding bodies."
                ),
                evidence="Human participants detected — no ethics statement found",
                suggestion=(
                    "Include an Ethics Statement specifying the approving "
                    "body, protocol number, and that informed consent was obtained."
                ),
            ))
        return flags

    def _flag_data_on_request(self, text: str) -> list:
        """
        'Data available upon request' is widely considered
        a reproducibility red flag — studies show that
        most such requests are never fulfilled.
        """
        flags = []
        if re.search(
            r'data.*available.*upon.*request|'
            r'data.*available.*on.*request|'
            r'available.*from.*corresponding.*author',
            text, re.IGNORECASE
        ):
            flags.append(ReproducibilityFlag(
                flag_type="data_available_on_request",
                severity="medium",
                description=(
                    "'Data available upon request' is a reproducibility "
                    "risk. Research shows that over 80% of such requests "
                    "go unfulfilled or receive no response."
                ),
                evidence="'Data available upon request' language detected",
                suggestion=(
                    "Deposit data in a public repository instead. "
                    "This increases citation rates and research trust."
                ),
            ))
        return flags

    # ── scoring ───────────────────────────────────────────────

    def _get_risk_level(self, risk_score: float) -> str:
        if risk_score >= 0.7:   return "critical"
        elif risk_score >= 0.4: return "high"
        elif risk_score >= 0.2: return "medium"
        return "low"

    def _write_summary(
        self,
        repro_score: float,
        risk_level: str,
        flags: list,
        has_code: bool,
        has_data: bool,
    ) -> str:
        pct = round(repro_score * 100)

        if not flags:
            return (
                f"Reproducibility score: {pct}%. "
                f"All key reproducibility indicators detected."
            )

        missing = []
        if not has_code: missing.append("code")
        if not has_data: missing.append("data")

        high = sum(1 for f in flags if f.severity == "high")
        med  = sum(1 for f in flags if f.severity == "medium")

        parts = []
        if high: parts.append(f"{high} critical gap{'s' if high > 1 else ''}")
        if med:  parts.append(f"{med} concern{'s' if med > 1 else ''}")

        return (
            f"Reproducibility score: {pct}%. "
            f"Flagged {', '.join(parts)}. "
            f"Risk level: {risk_level.upper()}."
        )