# src/scipeerai/modules/grim_test.py # # GRIM Test — Granularity-Related Inconsistency of Means # Based on: Brown & Heathers (2017), Social Psychological # and Personality Science — scientifically validated. # # Catches mathematically impossible means given sample size. # Example: mean=2.34 with n=20 is IMPOSSIBLE. import re import math from dataclasses import dataclass, field @dataclass class GrimFlag: flag_type: str severity: str description: str evidence: str suggestion: str @dataclass class GrimResult: impossible_means: list possible_means: list grim_score: float risk_level: str summary: str flags: list = field(default_factory=list) flags_count: int = 0 class GrimTest: """ GRIM Test implementation. Checks whether reported means are mathematically possible given the reported sample size and scale. """ # regex to pull mean/average + sample size pairs MEAN_PATTERN = re.compile( r'(?:mean|average|m)\s*[=:]\s*(-?\d+\.?\d*)', re.IGNORECASE ) N_PATTERN = re.compile( r'n\s*[=:]\s*(\d+)', re.IGNORECASE ) FULL_PATTERN = re.compile( r'(?:mean|average|m)\s*[=:]\s*(-?\d+\.\d+)' r'.{0,80}' r'n\s*[=:]\s*(\d+)' r'|' r'n\s*[=:]\s*(\d+)' r'.{0,80}' r'(?:mean|average|m)\s*[=:]\s*(-?\d+\.\d+)', re.IGNORECASE ) def analyze(self, text: str) -> GrimResult: pairs = self._extract_pairs(text) impossible = [] possible = [] flags = [] for mean_val, n_val in pairs: ok = self._grim_check(mean_val, n_val) if ok: possible.append((mean_val, n_val)) else: impossible.append((mean_val, n_val)) flags.append(GrimFlag( flag_type = "grim_impossible_mean", severity = "high", description = ( f"Mean={mean_val} is mathematically " f"impossible with N={n_val}. " f"This value cannot arise from integer " f"item scores — potential data fabrication." ), evidence = ( f"Reported: M={mean_val}, N={n_val} | " f"Closest valid means: " f"{self._nearest_valid(mean_val, n_val)}" ), suggestion = ( "Re-check raw data and recalculate. " "If using Likert scales, verify item " "scoring and sample size." ), )) total = len(impossible) + len(possible) score = (len(impossible) / total) if total > 0 else 0.0 level = self._risk(score, len(impossible)) summary = self._build_summary( impossible, possible, score, level ) return GrimResult( impossible_means = impossible, possible_means = possible, grim_score = round(score, 4), risk_level = level, summary = summary, flags = flags, flags_count = len(flags), ) # ── internal helpers ───────────────────────────────────────── def _grim_check(self, mean: float, n: int) -> bool: """ Core GRIM logic. A mean is possible iff (mean * n) rounds to an integer. Tolerance: 0.001 to handle floating-point noise. """ product = mean * n remainder = abs(product - round(product)) return remainder < 0.001 def _extract_pairs(self, text: str): pairs = [] for m in self.FULL_PATTERN.finditer(text): if m.group(1) and m.group(2): mean_val = float(m.group(1)) n_val = int(m.group(2)) else: mean_val = float(m.group(4)) n_val = int(m.group(3)) if 2 <= n_val <= 10000: pairs.append((mean_val, n_val)) return pairs def _nearest_valid(self, mean: float, n: int) -> str: decimals = len(str(mean).split(".")[-1]) step = round(1 / n, decimals + 2) lower = math.floor(mean * n) / n upper = math.ceil(mean * n) / n return f"{round(lower, decimals)} or {round(upper, decimals)}" def _risk(self, score: float, count: int) -> str: if count >= 3 or score >= 0.6: return "critical" if count == 2 or score >= 0.4: return "high" if count == 1 or score >= 0.2: return "medium" return "low" def _build_summary(self, impossible, possible, score, level) -> str: total = len(impossible) + len(possible) if total == 0: return ( "GRIM Test: No mean/N pairs detected in text. " "Add explicit M= and N= values for analysis." ) pct = round(score * 100) return ( f"GRIM Test analyzed {total} mean/N pair(s). " f"{len(impossible)} impossible mean(s) detected " f"({pct}% failure rate). " f"Risk level: {level.upper()}." )