Spaces:
Running
Running
| # src/scipeerai/modules/effect_size_validator.py | |
| # | |
| # Effect Size Validator | |
| # Extracts and validates Cohen's d, r, eta-squared, | |
| # odds ratios, and performs post-hoc power analysis. | |
| # | |
| # Small N + large effect size = fabrication signal. | |
| # Underpowered studies with significant results = suspect. | |
| import re | |
| import math | |
| from dataclasses import dataclass, field | |
@dataclass
class EffectSizeFlag:
    """A single validation finding raised against a paper's effect-size reporting.

    Constructed with keyword arguments by EffectSizeValidator.analyze().
    The missing @dataclass decorator is restored here: without it the
    bare annotations create no fields and keyword construction fails.
    """
    flag_type: str    # machine-readable flag id, e.g. "inflated_effect_size"
    severity: str     # "high" or "medium" as assigned by analyze()
    description: str  # human-readable explanation of the problem
    evidence: str     # the concrete numbers that triggered the flag
    suggestion: str   # remediation advice for the authors
@dataclass
class EffectSizeResult:
    """Aggregate outcome of one EffectSizeValidator.analyze() run.

    The missing @dataclass decorator is restored here: the class is
    constructed with keyword arguments and uses field(default_factory=...),
    both of which require it.
    """
    effect_sizes_found: list  # (type, value) tuples, e.g. ("cohens_d", 0.45)
    power_estimates: list     # dicts with effect_type/effect_value/n/power
    inflated_effects: list    # (type, value, n) tuples flagged as inflated
    underpowered: list        # (type, value, n, power) tuples below 80% power
    effect_score: float       # aggregate risk score in [0, 1]
    risk_level: str           # "low" | "medium" | "high" | "critical"
    summary: str              # human-readable summary line
    # default_factory keeps each result's flag list independent
    flags: list = field(default_factory=list)
    flags_count: int = 0
class EffectSizeValidator:
    """
    Effect Size Validator.

    Validates reported effect sizes against sample sizes.
    Detects inflated effects and underpowered studies.

    Key insight:
    - Real large effects (d>0.8) need N>50 to be credible
    - Small N + large effect = likely false positive
    - Significant result + low power = suspicious
    """

    # Cohen's d, e.g. "Cohen's d = 0.45", "cohens d: -1.2"
    COHENS_D = re.compile(
        r"cohen['\s]?s?\s*d\s*[=:]\s*(-?\d+\.?\d*)",
        re.IGNORECASE
    )
    # Pearson r, e.g. "r = .32", "r = 0.85".  The capture deliberately
    # allows values >= 1 (e.g. "r = 1.25") so analyze() can raise its
    # impossible_correlation flag; the old pattern (-?0?\.\d+) could only
    # match |r| < 1, which made that flag unreachable.
    PEARSON_R = re.compile(
        r"\br\s*[=:]\s*(-?\d*\.\d+)",
        re.IGNORECASE
    )
    # Eta squared, e.g. "eta squared = 0.12", "eta-squared: .25".
    # (Mojibake "Β²" in the character class repaired to "²".)
    ETA_SQ = re.compile(
        r"eta[²2\s-]*squared?\s*[=:]\s*(0?\.\d+)",
        re.IGNORECASE
    )
    # Omega squared, same shape as ETA_SQ.
    OMEGA_SQ = re.compile(
        r"omega[²2\s-]*squared?\s*[=:]\s*(0?\.\d+)",
        re.IGNORECASE
    )
    # Odds ratio, e.g. "odds ratio = 2.5"
    ODDS_R = re.compile(
        r"odds\s*ratio\s*[=:]\s*(\d+\.?\d*)",
        re.IGNORECASE
    )
    # Sample size, e.g. "N = 120"
    N_PAT = re.compile(
        r"\bn\s*[=:]\s*(\d+)",
        re.IGNORECASE
    )
    # Cohen's (1988) conventional benchmarks for d.
    COHENS_BENCHMARKS = {
        "small": 0.2,
        "medium": 0.5,
        "large": 0.8,
    }
| def analyze(self, text: str) -> EffectSizeResult: | |
| effects = self._extract_effects(text) | |
| ns = self._extract_ns(text) | |
| n_val = min(ns) if ns else None | |
| flags = [] | |
| inflated = [] | |
| underpowered = [] | |
| power_ests = [] | |
| for etype, evalue in effects: | |
| # ββ Power estimation ββββββββββββββββββββββββββββββββββ | |
| if n_val and etype == "cohens_d": | |
| power = self._estimate_power(evalue, n_val) | |
| power_ests.append({ | |
| "effect_type": etype, | |
| "effect_value": evalue, | |
| "n": n_val, | |
| "power": round(power, 3), | |
| }) | |
| # ββ Flag: inflated effect size βββββββββββββββββββββ | |
| if abs(evalue) > 2.0 and n_val < 30: | |
| inflated.append((etype, evalue, n_val)) | |
| flags.append(EffectSizeFlag( | |
| flag_type = "inflated_effect_size", | |
| severity = "high", | |
| description = ( | |
| f"Cohen's d = {evalue} is extremely large " | |
| f"with only N = {n_val}. Effect sizes above " | |
| f"d = 2.0 with small samples are rarely " | |
| f"genuine β likely reflects noise, " | |
| f"outliers, or fabrication." | |
| ), | |
| evidence = ( | |
| f"Cohen's d = {evalue}, N = {n_val} | " | |
| f"Expected power: {round(power*100)}% | " | |
| f"Cohen's large effect benchmark: d = 0.8" | |
| ), | |
| suggestion = ( | |
| "Report confidence intervals for effect " | |
| "sizes. Conduct sensitivity analysis. " | |
| "Verify no outliers are driving the effect." | |
| ), | |
| )) | |
| # ββ Flag: underpowered study βββββββββββββββββββββββ | |
| elif power < 0.8 and n_val < 50: | |
| underpowered.append((etype, evalue, n_val, power)) | |
| flags.append(EffectSizeFlag( | |
| flag_type = "underpowered_study", | |
| severity = "medium", | |
| description = ( | |
| f"Study is underpowered (estimated power = " | |
| f"{round(power*100)}%). With N = {n_val} and " | |
| f"d = {evalue}, there is only a " | |
| f"{round(power*100)}% chance of detecting " | |
| f"a real effect. Significant results from " | |
| f"underpowered studies are likely false positives." | |
| ), | |
| evidence = ( | |
| f"Cohen's d = {evalue}, N = {n_val} | " | |
| f"Estimated power = {round(power*100)}% " | |
| f"(recommended minimum: 80%)" | |
| ), | |
| suggestion = ( | |
| "Conduct a priori power analysis. " | |
| "Increase sample size to achieve 80% power. " | |
| "Report power analysis in methods section." | |
| ), | |
| )) | |
| # ββ Flag: impossible r value βββββββββββββββββββββββββββ | |
| if etype == "pearson_r" and abs(evalue) > 1.0: | |
| flags.append(EffectSizeFlag( | |
| flag_type = "impossible_correlation", | |
| severity = "high", | |
| description = ( | |
| f"Pearson r = {evalue} is impossible β " | |
| f"correlations must be between -1 and 1. " | |
| f"This indicates a reporting error or fabrication." | |
| ), | |
| evidence = f"r = {evalue} reported", | |
| suggestion = ( | |
| "Verify raw correlation values. " | |
| "Check if rΒ² was mistakenly reported as r." | |
| ), | |
| )) | |
| # ββ Flag: suspiciously large eta squared ββββββββββββββ | |
| if etype == "eta_squared" and evalue > 0.5: | |
| flags.append(EffectSizeFlag( | |
| flag_type = "large_eta_squared", | |
| severity = "medium", | |
| description = ( | |
| f"Eta-squared = {evalue} is unusually large. " | |
| f"Values above 0.5 are rare in behavioral and " | |
| f"social science research and warrant scrutiny." | |
| ), | |
| evidence = f"Ξ·Β² = {evalue} (large effect threshold: 0.14)", | |
| suggestion = ( | |
| "Report partial eta-squared separately. " | |
| "Verify ANOVA calculations and degrees of freedom." | |
| ), | |
| )) | |
| # ββ Flag: no effect sizes reported ββββββββββββββββββββββββ | |
| if len(effects) == 0: | |
| flags.append(EffectSizeFlag( | |
| flag_type = "missing_effect_sizes", | |
| severity = "medium", | |
| description = ( | |
| "No effect sizes reported in the paper. " | |
| "Effect sizes (Cohen's d, r, eta-squared) are " | |
| "essential for interpreting practical significance " | |
| "and are required by most major journals." | |
| ), | |
| evidence = "No Cohen's d, r, or eta-squared found", | |
| suggestion = ( | |
| "Report effect sizes with confidence intervals " | |
| "for all primary outcomes. Use Cohen's d for " | |
| "mean differences, r for correlations." | |
| ), | |
| )) | |
| score = self._aggregate_score(inflated, underpowered, effects) | |
| level = self._risk(score, len(inflated), len(underpowered)) | |
| summary = self._build_summary( | |
| effects, inflated, underpowered, score, level | |
| ) | |
| return EffectSizeResult( | |
| effect_sizes_found = effects, | |
| power_estimates = power_ests, | |
| inflated_effects = inflated, | |
| underpowered = underpowered, | |
| effect_score = round(score, 4), | |
| risk_level = level, | |
| summary = summary, | |
| flags = flags, | |
| flags_count = len(flags), | |
| ) | |
    # ── internal helpers ─────────────────────────────────────────
| def _extract_effects(self, text: str) -> list: | |
| effects = [] | |
| for m in self.COHENS_D.finditer(text): | |
| try: | |
| effects.append(("cohens_d", float(m.group(1)))) | |
| except ValueError: | |
| pass | |
| for m in self.PEARSON_R.finditer(text): | |
| try: | |
| v = float(m.group(1)) | |
| if -1.5 <= v <= 1.5: | |
| effects.append(("pearson_r", v)) | |
| except ValueError: | |
| pass | |
| for m in self.ETA_SQ.finditer(text): | |
| try: | |
| effects.append(("eta_squared", float(m.group(1)))) | |
| except ValueError: | |
| pass | |
| for m in self.OMEGA_SQ.finditer(text): | |
| try: | |
| effects.append(("omega_squared", float(m.group(1)))) | |
| except ValueError: | |
| pass | |
| for m in self.ODDS_R.finditer(text): | |
| try: | |
| v = float(m.group(1)) | |
| if 0.1 <= v <= 50: | |
| effects.append(("odds_ratio", v)) | |
| except ValueError: | |
| pass | |
| return effects | |
| def _extract_ns(self, text: str) -> list: | |
| ns = [] | |
| for m in self.N_PAT.finditer(text): | |
| try: | |
| v = int(m.group(1)) | |
| if 2 <= v <= 100000: | |
| ns.append(v) | |
| except ValueError: | |
| pass | |
| return ns | |
| def _estimate_power(self, d: float, n: int) -> float: | |
| """ | |
| Approximate statistical power for two-sample t-test. | |
| Uses normal approximation of non-central t distribution. | |
| """ | |
| try: | |
| ncp = abs(d) * math.sqrt(n / 2) | |
| power = 1 - self._normal_cdf(1.96 - ncp) | |
| return min(max(power, 0.0), 1.0) | |
| except Exception: | |
| return 0.5 | |
| def _normal_cdf(self, x: float) -> float: | |
| """Approximation of standard normal CDF.""" | |
| return 0.5 * (1 + math.erf(x / math.sqrt(2))) | |
| def _aggregate_score(self, inflated, underpowered, | |
| effects) -> float: | |
| if not effects: | |
| return 0.3 | |
| score = 0.0 | |
| if inflated: | |
| score += 0.5 * min(len(inflated), 2) / 2 | |
| if underpowered: | |
| score += 0.3 * min(len(underpowered), 2) / 2 | |
| return min(score, 1.0) | |
| def _risk(self, score: float, | |
| n_inflated: int, | |
| n_underpowered: int) -> str: | |
| if n_inflated >= 1 or score >= 0.6: | |
| return "critical" | |
| if n_underpowered >= 2 or score >= 0.4: | |
| return "high" | |
| if n_underpowered >= 1 or score >= 0.2: | |
| return "medium" | |
| return "low" | |
| def _build_summary(self, effects, inflated, | |
| underpowered, score, level) -> str: | |
| if not effects: | |
| return ( | |
| "Effect Size Validation: No effect sizes detected. " | |
| "Cohen's d, r, or eta-squared reporting is recommended " | |
| "for all primary outcomes. Risk level: MEDIUM." | |
| ) | |
| pct = round(score * 100) | |
| return ( | |
| f"Effect Size Validator analyzed {len(effects)} effect " | |
| f"size(s). {len(inflated)} inflated, " | |
| f"{len(underpowered)} underpowered study/studies detected. " | |
| f"Overall risk score: {pct}%. " | |
| f"Risk level: {level.upper()}." | |
| ) |