# Statistical Audit Module
# ------------------------
# This is where we catch the kind of statistical
# manipulation that slips past human reviewers.
#
# Three main things we look for:
#   1. p-values clustered suspiciously near 0.05
#   2. Sample sizes too small to trust the results
#   3. Numbers that look "too clean" to be real data
import re
from dataclasses import dataclass


# ── data structures ──────────────────────────────────────────
@dataclass
class StatFlag:
    # one issue we found
    flag_type: str
    severity: str       # "high", "medium", "low"
    description: str
    evidence: str       # the actual text/number that triggered this
    suggestion: str


@dataclass
class StatAuditResult:
    p_values_found: list
    sample_sizes_found: list
    flags: list
    risk_score: float   # 0.0 to 1.0
    risk_level: str     # "low" / "medium" / "high" / "critical"
    summary: str
# ── main class ───────────────────────────────────────────────
class StatAuditEngine:
    """
    Scans paper text for statistical red flags.

    I wrote this as a class because later we'll want to
    configure thresholds differently for different fields:
    medicine needs stricter p-value cutoffs than psychology,
    for instance.
    """

    # p-values this close to 0.05 are suspicious:
    # real results don't magically cluster right at the cutoff
    P_HACK_ZONE = (0.04, 0.051)

    # below this sample size, most findings are unreliable
    MIN_SAMPLE_SIZE = 30
    def __init__(self):
        # regex for p-values; catches things like:
        #   p=0.04, p < 0.001, p-value = 0.032, (p=.049)
        self._p_pattern = re.compile(
            r'\bp(?:\s*-?\s*value)?\s*[=<>≤≥]\s*(\d*\.?\d+)',
            re.IGNORECASE
        )
        # regex for sample sizes; catches n=50, N = 120, n=32, etc.
        self._n_pattern = re.compile(
            r'\bn\s*=\s*(\d+)',
            re.IGNORECASE
        )
        # t-statistics reported as t = 2.31 or t(29) = 2.31
        # (F-statistics and chi-square values aren't matched yet)
        self._tstat_pattern = re.compile(
            r'\bt\s*[=\(]\s*(\d+\.?\d*)',
            re.IGNORECASE
        )
    # ── public method ─────────────────────────────────────────
    def analyze(self, text: str) -> StatAuditResult:
        """
        Main entry point. Give it the paper text, get back
        a full audit report.
        """
        p_values = self._extract_p_values(text)
        sample_sizes = self._extract_sample_sizes(text)

        flags = []
        flags.extend(self._check_p_hacking(p_values))
        flags.extend(self._check_sample_sizes(sample_sizes))
        flags.extend(self._check_round_numbers(p_values))
        flags.extend(self._check_p_value_absence(text, sample_sizes))

        risk_score = self._calculate_risk(flags)
        risk_level = self._get_risk_level(risk_score)

        return StatAuditResult(
            p_values_found=p_values,
            sample_sizes_found=sample_sizes,
            flags=flags,
            risk_score=round(risk_score, 3),
            risk_level=risk_level,
            summary=self._write_summary(flags, risk_level),
        )
    # ── extraction helpers ────────────────────────────────────
    def _extract_p_values(self, text: str) -> list:
        matches = self._p_pattern.findall(text)
        values = []
        for m in matches:
            try:
                val = float(m)
                if 0.0 < val <= 1.0:  # must be a valid probability
                    values.append(val)
            except ValueError:
                pass
        return values

    def _extract_sample_sizes(self, text: str) -> list:
        matches = self._n_pattern.findall(text)
        sizes = []
        for m in matches:
            try:
                sizes.append(int(m))
            except ValueError:
                pass
        return sizes
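
    # Illustrative behaviour of the extractors (input strings are made up):
    #   _extract_p_values("we found p = 0.03 and p < .001")   -> [0.03, 0.001]
    #   _extract_sample_sizes("two cohorts (n=24, N = 180)")  -> [24, 180]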
    # ── flag checks ───────────────────────────────────────────
    def _check_p_hacking(self, p_values: list) -> list:
        """
        Look for p-values suspiciously clustered just below 0.05.
        If 40% or more of the reported p-values (at least two of them)
        land in this tiny window, something probably went wrong in the
        analysis; 60% or more (at least three) is treated as high severity.
        """
        flags = []
        if not p_values:
            return flags

        low, high = self.P_HACK_ZONE
        borderline = [p for p in p_values if low <= p <= high]
        ratio = len(borderline) / len(p_values)

        if ratio >= 0.6 and len(borderline) >= 3:
            flags.append(StatFlag(
                flag_type="p_hacking_suspected",
                severity="high",
                description=(
                    f"{len(borderline)} out of {len(p_values)} reported "
                    f"p-values fall between {low} and {high}. "
                    f"That's {round(ratio * 100)}% clustered right at "
                    f"the significance threshold."
                ),
                evidence=str(borderline),
                suggestion=(
                    "Check whether all conducted analyses are reported. "
                    "Selective reporting inflates this pattern."
                ),
            ))
        elif ratio >= 0.4 and len(borderline) >= 2:
            flags.append(StatFlag(
                flag_type="borderline_p_values",
                severity="medium",
                description=(
                    f"{len(borderline)} p-values near the 0.05 cutoff. "
                    f"Worth a closer look at the analysis pipeline."
                ),
                evidence=str(borderline),
                suggestion="Request full analysis scripts and pre-registration info.",
            ))
        return flags
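
    # Worked example of the thresholds above (values are made up):
    #   p_values = [0.041, 0.047, 0.049, 0.012] -> 3 of 4 (75%) fall inside
    #   P_HACK_ZONE and there are at least three borderline values, so the
    #   high-severity "p_hacking_suspected" flag fires.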
    def _check_sample_sizes(self, sample_sizes: list) -> list:
        """
        Tiny sample sizes mean the results probably won't replicate.
        Below n=30 is a concern in most quantitative fields.
        """
        flags = []
        small = [n for n in sample_sizes if 0 < n < self.MIN_SAMPLE_SIZE]
        if small:
            flags.append(StatFlag(
                flag_type="small_sample_size",
                severity="high" if min(small) < 15 else "medium",
                description=(
                    f"Sample size(s) below recommended minimum: {small}. "
                    f"Studies with n < {self.MIN_SAMPLE_SIZE} are typically "
                    f"underpowered for reliable inference."
                ),
                evidence=str(small),
                suggestion=(
                    "A post-hoc power analysis would clarify whether "
                    "the study had sufficient power to detect the claimed effects."
                ),
            ))
        return flags
    def _check_round_numbers(self, p_values: list) -> list:
        """
        Real data rarely produces perfectly round p-values.
        p = 0.05 exactly is almost impossible to get naturally, and
        p = 0.049 right at the boundary is also suspicious.
        """
        flags = []
        suspicious = []
        for p in p_values:
            # exactly at the significance boundary
            if p == 0.05:
                suspicious.append(p)
            # sitting exactly on conventional reporting thresholds,
            # or hugging the cutoff with suspicious precision
            elif p in (0.049, 0.001, 0.01):
                suspicious.append(p)
        if suspicious:
            flags.append(StatFlag(
                flag_type="suspiciously_round_p_values",
                severity="medium",
                description=(
                    f"Found p-values that are unusually precise "
                    f"or exactly at significance boundaries: {suspicious}"
                ),
                evidence=str(suspicious),
                suggestion=(
                    "Request raw data to verify these values. "
                    "Exact boundary values sometimes indicate rounding "
                    "or post-hoc adjustment."
                ),
            ))
        return flags
    def _check_p_value_absence(self, text: str, sample_sizes: list) -> list:
        """
        If a paper reports results with sample sizes but no p-values,
        it's avoiding statistical scrutiny, which is also a red flag.
        """
        flags = []
        has_stats_claim = any(
            phrase in text.lower()
            for phrase in ["significant", "effect", "difference", "result"]
        )
        p_mentions = len(self._p_pattern.findall(text))
        if sample_sizes and has_stats_claim and p_mentions == 0:
            flags.append(StatFlag(
                flag_type="missing_statistical_tests",
                severity="high",
                description=(
                    "Paper makes statistical claims but reports no p-values "
                    "or test statistics. Results cannot be independently evaluated."
                ),
                evidence="No p-values found despite significance claims",
                suggestion="Request full statistical output tables from authors.",
            ))
        return flags
    # ── scoring ───────────────────────────────────────────────
    def _calculate_risk(self, flags: list) -> float:
        """
        Weighted scoring: high-severity flags count more.
        Capped at 1.0 so the score stays interpretable.
        """
        weights = {"high": 0.35, "medium": 0.20, "low": 0.08}
        score = sum(weights.get(f.severity, 0) for f in flags)
        return min(score, 1.0)
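
    # Worked example (hypothetical flag mix): two high-severity flags plus
    # one medium scores 0.35 + 0.35 + 0.20 = 0.90, which _get_risk_level
    # below maps to "critical".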
    def _get_risk_level(self, score: float) -> str:
        if score >= 0.7:
            return "critical"
        elif score >= 0.4:
            return "high"
        elif score >= 0.2:
            return "medium"
        return "low"
    def _write_summary(self, flags: list, risk_level: str) -> str:
        if not flags:
            return (
                "No statistical anomalies detected. "
                "Standard metrics appear within normal ranges."
            )
        high = sum(1 for f in flags if f.severity == "high")
        med = sum(1 for f in flags if f.severity == "medium")
        parts = []
        if high:
            parts.append(f"{high} high-severity issue{'s' if high > 1 else ''}")
        if med:
            parts.append(f"{med} medium-severity concern{'s' if med > 1 else ''}")
        return (
            f"Statistical audit flagged {', '.join(parts)}. "
            f"Overall risk level: {risk_level.upper()}."
        )
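

if __name__ == "__main__":
    # Minimal usage sketch. The snippet below is made up for illustration;
    # it is not taken from any real paper.
    sample = (
        "We recruited two groups of students (n=24 per group) and found a "
        "significant difference in scores (p = 0.049). A follow-up analysis "
        "was also significant, p = 0.047."
    )
    engine = StatAuditEngine()
    result = engine.analyze(sample)
    print(f"Risk level: {result.risk_level}")
    print(result.summary)
    for flag in result.flags:
        print(f"- [{flag.severity}] {flag.flag_type}")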