# Spaces: Abu-Sameer-66 / SciPeerAI-API (Sleeping)
# File size: 10,849 Bytes
# c94f46f

# Statistical Audit Module
# ------------------------
# This is where we catch the kind of statistical
# manipulation that slips past human reviewers.
#
# Four main things we look for:
#   1. p-values clustered suspiciously near 0.05
#   2. Sample sizes too small to trust the results
#   3. Numbers that look "too clean" to be real data
#   4. Statistical claims reported without any p-values

import re
from dataclasses import dataclass, field


# ── data structures ──────────────────────────────────────────

@dataclass
class StatFlag:
    """One statistical issue raised by the audit."""
    # identifier of the check that fired, e.g. "small_sample_size"
    flag_type: str
    severity: str        # "high", "medium", "low"
    description: str     # human-readable explanation of the issue
    evidence: str        # the actual text/number that triggered this
    suggestion: str      # recommended follow-up for the reviewer


@dataclass
class StatAuditResult:
    """Full report produced by StatAuditEngine.analyze()."""
    p_values_found: list        # floats parsed from the text, each in (0, 1]
    sample_sizes_found: list    # ints parsed from "n = ..." mentions
    flags: list                 # StatFlag objects, one per issue raised
    risk_score: float       # 0.0 to 1.0
    risk_level: str         # "low" / "medium" / "high" / "critical"
    summary: str            # human-readable recap of the flags


# ── main class ───────────────────────────────────────────────

class StatAuditEngine:
    """
    Scans paper text for statistical red flags.

    I wrote this as a class because later we'll want to
    configure thresholds differently for different fields —
    medicine needs stricter p-value cutoffs than psychology,
    for instance.
    """

    # p-values this close to 0.05 are suspicious
    # real results don't magically cluster right at the cutoff
    P_HACK_ZONE = (0.04, 0.051)

    # below this sample size, most findings are unreliable
    MIN_SAMPLE_SIZE = 30

    def __init__(self):
        # regex for p-values — catches things like:
        # p=0.04, p < 0.001, p-value = 0.032, (p=.049)
        self._p_pattern = re.compile(
            r'p\s*[=<>≤≥]\s*\.?(\d+\.?\d*)',
            re.IGNORECASE
        )

        # regex for sample sizes — catches n=50, N = 120, n=32 etc
        self._n_pattern = re.compile(
            r'\bn\s*=\s*(\d+)',
            re.IGNORECASE
        )

        # t-statistics, F-statistics, chi-square values
        self._tstat_pattern = re.compile(
            r't\s*[=\(]\s*(\d+\.?\d*)',
            re.IGNORECASE
        )

    # ── public method ─────────────────────────────────────────

    def analyze(self, text: str) -> StatAuditResult:
        """
        Main entry point. Give it the paper text, get back
        a full audit report.
        """
        p_values = self._extract_p_values(text)
        sample_sizes = self._extract_sample_sizes(text)

        flags = []
        flags.extend(self._check_p_hacking(p_values))
        flags.extend(self._check_sample_sizes(sample_sizes))
        flags.extend(self._check_round_numbers(p_values))
        flags.extend(self._check_p_value_absence(text, sample_sizes))

        risk_score = self._calculate_risk(flags)
        risk_level = self._get_risk_level(risk_score)

        return StatAuditResult(
            p_values_found=p_values,
            sample_sizes_found=sample_sizes,
            flags=flags,
            risk_score=round(risk_score, 3),
            risk_level=risk_level,
            summary=self._write_summary(flags, risk_level),
        )

    # ── extraction helpers ────────────────────────────────────

    def _extract_p_values(self, text: str) -> list:
        matches = self._p_pattern.findall(text)
        values = []
        for m in matches:
            try:
                val = float(m)
                if 0.0 < val <= 1.0:   # must be a valid probability
                    values.append(val)
            except ValueError:
                pass
        return values

    def _extract_sample_sizes(self, text: str) -> list:
        matches = self._n_pattern.findall(text)
        sizes = []
        for m in matches:
            try:
                sizes.append(int(m))
            except ValueError:
                pass
        return sizes

    # ── flag checks ───────────────────────────────────────────

    def _check_p_hacking(self, p_values: list) -> list:
        """
        Look for p-values suspiciously clustered just below 0.05.
        If more than 40% of reported p-values live in this tiny window,
        something probably went wrong in the analysis.
        """
        flags = []
        if not p_values:
            return flags

        low, high = self.P_HACK_ZONE
        borderline = [p for p in p_values if low <= p <= high]
        ratio = len(borderline) / len(p_values)

        if ratio >= 0.6 and len(borderline) >= 3:
            flags.append(StatFlag(
                flag_type="p_hacking_suspected",
                severity="high",
                description=(
                    f"{len(borderline)} out of {len(p_values)} reported "
                    f"p-values fall between {low} and {high}. "
                    f"That's {round(ratio*100)}% clustered right at "
                    f"the significance threshold."
                ),
                evidence=str(borderline),
                suggestion=(
                    "Check whether all conducted analyses are reported. "
                    "Selective reporting inflates this pattern."
                ),
            ))
        elif ratio >= 0.4 and len(borderline) >= 2:
            flags.append(StatFlag(
                flag_type="borderline_p_values",
                severity="medium",
                description=(
                    f"{len(borderline)} p-values near the 0.05 cutoff. "
                    f"Worth a closer look at the analysis pipeline."
                ),
                evidence=str(borderline),
                suggestion="Request full analysis scripts and pre-registration info.",
            ))

        return flags

    def _check_sample_sizes(self, sample_sizes: list) -> list:
        """
        Tiny sample sizes mean the results probably won't replicate.
        Below n=30 is a concern in most quantitative fields.
        """
        flags = []
        small = [n for n in sample_sizes if 0 < n < self.MIN_SAMPLE_SIZE]

        if small:
            flags.append(StatFlag(
                flag_type="small_sample_size",
                severity="high" if min(small) < 15 else "medium",
                description=(
                    f"Sample size(s) below recommended minimum: {small}. "
                    f"Studies with n < {self.MIN_SAMPLE_SIZE} are typically "
                    f"underpowered for reliable inference."
                ),
                evidence=str(small),
                suggestion=(
                    "A post-hoc power analysis would clarify whether "
                    "the study had sufficient power to detect the claimed effects."
                ),
            ))

        return flags

    def _check_round_numbers(self, p_values: list) -> list:
        """
        Real data rarely produces perfectly round p-values.
        p = 0.05 exactly is almost impossible to get naturally.
        p = 0.049 right at the boundary is also suspicious.
        """
        flags = []
        suspicious = []

        for p in p_values:
            # exact boundary value
            if p == 0.05:
                suspicious.append(p)
            # suspiciously precise cutoff-hugging
            elif p in (0.049, 0.001, 0.01):
                suspicious.append(p)

        if suspicious:
            flags.append(StatFlag(
                flag_type="suspiciously_round_p_values",
                severity="medium",
                description=(
                    f"Found p-values that are unusually precise "
                    f"or exactly at significance boundaries: {suspicious}"
                ),
                evidence=str(suspicious),
                suggestion=(
                    "Request raw data to verify these values. "
                    "Exact boundary values sometimes indicate rounding "
                    "or post-hoc adjustment."
                ),
            ))

        return flags

    def _check_p_value_absence(self, text: str, sample_sizes: list) -> list:
        """
        If a paper reports results with sample sizes but no p-values,
        it's avoiding statistical scrutiny — also a red flag.
        """
        flags = []
        has_stats_claim = any(
            phrase in text.lower()
            for phrase in ["significant", "effect", "difference", "result"]
        )
        p_mentions = len(self._p_pattern.findall(text))

        if sample_sizes and has_stats_claim and p_mentions == 0:
            flags.append(StatFlag(
                flag_type="missing_statistical_tests",
                severity="high",
                description=(
                    "Paper makes statistical claims but reports no p-values "
                    "or test statistics. Results cannot be independently evaluated."
                ),
                evidence="No p-values found despite significance claims",
                suggestion="Request full statistical output tables from authors.",
            ))

        return flags

    # ── scoring ───────────────────────────────────────────────

    def _calculate_risk(self, flags: list) -> float:
        """
        Weighted scoring — high severity flags count more.
        Capped at 1.0 so the score stays interpretable.
        """
        weights = {"high": 0.35, "medium": 0.20, "low": 0.08}
        score = sum(weights.get(f.severity, 0) for f in flags)
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        if score >= 0.7:
            return "critical"
        elif score >= 0.4:
            return "high"
        elif score >= 0.2:
            return "medium"
        return "low"

    def _write_summary(self, flags: list, risk_level: str) -> str:
        if not flags:
            return (
                "No statistical anomalies detected. "
                "Standard metrics appear within normal ranges."
            )

        high = sum(1 for f in flags if f.severity == "high")
        med  = sum(1 for f in flags if f.severity == "medium")

        parts = []
        if high:
            parts.append(f"{high} high-severity issue{'s' if high > 1 else ''}")
        if med:
            parts.append(f"{med} medium-severity concern{'s' if med > 1 else ''}")

        return (
            f"Statistical audit flagged {', '.join(parts)}. "
            f"Overall risk level: {risk_level.upper()}."
        )