# src/scipeerai/modules/granularity_analyzer.py
#
# Statistical Granularity Analyzer
# Detects: digit preference, too-perfect variance,
# Benford's Law violations, suspiciously round numbers.
#
# Fabricated data tends to look "too clean" —
# real data has natural messiness. This module
# catches papers where numbers look manufactured.

import re
import math
import collections
from dataclasses import dataclass, field


@dataclass
class GranularityFlag:
    """One concern raised by a single granularity check."""

    flag_type: str      # identifier such as "benford_law_violation"
    severity: str       # "medium" or "high" for the checks in this module
    description: str    # human-readable explanation of the concern
    evidence: str       # the numbers that triggered the flag
    suggestion: str     # what the author should do about it


@dataclass
class GranularityResult:
    """Aggregate outcome of GranularityAnalyzer.analyze()."""

    digit_preference_score: float   # 0-1, excess of last digits 0/5
    benford_score: float            # 0-1, normalised first-digit deviation
    round_number_ratio: float       # 0-1, share of whole-valued decimals
    granularity_score: float        # mean of the three scores above
    risk_level: str                 # "low" | "medium" | "high" | "critical"
    summary: str
    flags: list = field(default_factory=list)
    flags_count: int = 0


class GranularityAnalyzer:
    """
    Statistical Granularity Analyzer.

    Real data has natural digit distribution.
    Fabricated data shows digit preference
    (e.g. too many 0s and 5s) and first-digit
    anomalies (Benford's Law violations).
    """

    # decimal numbers exactly as written in the text (digits '.' digits)
    NUMBER_PAT = re.compile(r'\b\d+\.\d+\b')

    # stand-alone integers with 2+ digits. The lookarounds keep the word
    # boundaries of the plain \b form but additionally reject the integer
    # and fractional halves of a decimal ("12.34" must not yield 12 and 34).
    INTEGER_PAT = re.compile(r'(?<!\w)(?<!\d\.)\d{2,}(?!\w)(?!\.\d)')

    # Benford's Law expected first-digit distribution
    BENFORD_EXPECTED = {
        1: 0.301, 2: 0.176, 3: 0.125,
        4: 0.097, 5: 0.079, 6: 0.067,
        7: 0.058, 8: 0.051, 9: 0.046,
    }

    def analyze(self, text: str) -> GranularityResult:
        """
        Run all granularity checks on *text* and aggregate the result.

        The aggregate score is the mean of the digit-preference score,
        the Benford score and the round-number ratio; the uniform
        precision check only contributes a flag.
        """
        # Keep the textual form next to the parsed value: trailing zeros
        # ("0.50") only exist in the text, and the last-digit and
        # precision checks need them.
        decimal_tokens = []
        decimals = []
        for token in self.NUMBER_PAT.findall(text):
            value = float(token)
            # an absurdly long digit run parses to inf, which would
            # break round() and first-digit extraction further down
            if math.isfinite(value):
                decimal_tokens.append(token)
                decimals.append(value)

        integers = []
        for token in self.INTEGER_PAT.findall(text):
            significant = token.lstrip('0')
            # 10 <= value <= 99999 is exactly "2 to 5 significant digits";
            # testing the length first avoids converting huge digit runs
            if 2 <= len(significant) <= 5:
                integers.append(int(significant))

        all_nums = decimals + [float(x) for x in integers]

        flags = []

        # ── 1. Digit Preference ───────────────────────────────────
        dp_score, dp_flag = self._digit_preference(decimal_tokens)
        if dp_flag:
            flags.append(dp_flag)

        # ── 2. Benford's Law ──────────────────────────────────────
        bf_score, bf_flag = self._benford_check(all_nums)
        if bf_flag:
            flags.append(bf_flag)

        # ── 3. Round Number Ratio ─────────────────────────────────
        rn_ratio, rn_flag = self._round_number_check(decimals)
        if rn_flag:
            flags.append(rn_flag)

        # ── 4. Too-Perfect Variance ───────────────────────────────
        tp_flag = self._too_perfect_check(decimal_tokens)
        if tp_flag:
            flags.append(tp_flag)

        # ── Aggregate Score ───────────────────────────────────────
        components = [dp_score, bf_score, rn_ratio]
        score = round(sum(components) / len(components), 4)
        level = self._risk(score, len(flags))
        summary = self._build_summary(score, level, len(flags), len(decimals))

        return GranularityResult(
            digit_preference_score=round(dp_score, 4),
            benford_score=round(bf_score, 4),
            round_number_ratio=round(rn_ratio, 4),
            granularity_score=score,
            risk_level=level,
            summary=summary,
            flags=flags,
            flags_count=len(flags),
        )

    # ── internal helpers ─────────────────────────────────────────

    @staticmethod
    def _fraction_digits(value) -> str:
        """
        Return the digits after the decimal point of *value*.

        *value* may be the number as written (str) or a float/int, in
        which case its str() form is used. Returns '' when there is no
        plain fractional part (integers, scientific notation, inf/nan).
        """
        text = value if isinstance(value, str) else str(value)
        _, dot, fraction = text.partition('.')
        if not dot or not fraction.isdecimal():
            return ''
        return fraction

    def _digit_preference(self, numbers: list) -> tuple:
        """
        Check last digits of decimal numbers.
        Real data: uniform distribution across 0-9.
        Fabricated data: too many 0s and 5s.

        *numbers* holds decimal values, preferably as the original text
        tokens so trailing zeros survive; floats are accepted too.
        Returns (score, flag): score is 0-1, flag is a GranularityFlag
        or None. Fewer than 5 values yields (0.0, None).
        """
        if len(numbers) < 5:
            return 0.0, None

        fractions = [self._fraction_digits(n) for n in numbers]
        last_digits = [int(frac[-1]) for frac in fractions if frac]
        if not last_digits:
            return 0.0, None

        counts = collections.Counter(last_digits)
        total = len(last_digits)

        zero_five = (counts.get(0, 0) + counts.get(5, 0)) / total
        expected = 0.2  # 2 out of 10 digits

        # excess over the expected share, scaled so that 60% -> 1.0
        if zero_five > expected:
            score = max(min((zero_five - expected) / 0.4, 1.0), 0.0)
        else:
            score = 0.0

        if zero_five <= 0.45:
            return score, None

        percent = round(zero_five * 100)
        return score, GranularityFlag(
            flag_type="digit_preference_detected",
            severity="high" if zero_five > 0.6 else "medium",
            description=(
                "Unusual digit preference detected. "
                f"{percent}% of decimal values end in "
                "0 or 5 — expected ~20% in real data. "
                "Suggests manually entered or rounded values."
            ),
            evidence=(
                f"Last-digit analysis: {percent}% "
                "end in 0 or 5 (expected: ~20%) | "
                f"Sample: {last_digits[:10]}"
            ),
            suggestion=(
                "Report raw unrounded values. Verify that "
                "data was not manually entered or post-hoc rounded."
            ),
        )

    def _benford_check(self, numbers: list) -> tuple:
        """
        Benford's Law: first digits of naturally occurring
        numbers follow a logarithmic distribution.
        Violations suggest fabrication.

        Only values >= 1 are considered and at least 10 are required.
        Returns (score, flag) with score in 0-1 and flag a
        GranularityFlag or None.
        """
        valid = [n for n in numbers if n >= 1]
        if len(valid) < 10:
            return 0.0, None

        first_digits = []
        for n in valid:
            lead = str(abs(n)).replace('.', '')[:1]
            # skip anything whose text form does not start with 1-9
            # (e.g. 'inf'), instead of failing on int()
            if lead.isdecimal() and lead != '0':
                first_digits.append(int(lead))
        if not first_digits:
            return 0.0, None

        counts = collections.Counter(first_digits)
        total = len(first_digits)

        # Chi-square-style distance from Benford.
        # NOTE: computed on proportions, not counts, so the value does
        # not grow with sample size; the thresholds below are tuned to
        # this form.
        chi_sq = 0.0
        for digit in range(1, 10):
            observed = counts.get(digit, 0) / total
            expected = self.BENFORD_EXPECTED[digit]
            chi_sq += ((observed - expected) ** 2) / expected

        # normalize to 0-1
        score = min(chi_sq / 15.0, 1.0)

        if score <= 0.4:
            return score, None

        return score, GranularityFlag(
            flag_type="benford_law_violation",
            severity="high" if score > 0.7 else "medium",
            description=(
                "First-digit distribution deviates from Benford's Law. "
                "Naturally occurring datasets follow a predictable "
                "logarithmic distribution — deviation suggests "
                "non-natural or fabricated data."
            ),
            evidence=(
                f"Chi-square deviation: {round(chi_sq, 3)} "
                "(threshold: 6.0) | "
                f"First digits analyzed: {total}"
            ),
            suggestion=(
                "Verify data collection process. Large Benford "
                "violations in financial or count data are a "
                "strong fabrication signal."
            ),
        )

    def _round_number_check(self, numbers: list) -> tuple:
        """
        Too many round numbers (X.0, X.00) suggests
        manual entry or fabrication.

        Returns (ratio, flag): ratio is the share of values within
        0.001 of a whole number; flag is set when it exceeds 0.6.
        Fewer than 5 values yields (0.0, None).
        """
        if len(numbers) < 5:
            return 0.0, None

        round_count = sum(
            1 for n in numbers
            if abs(n - round(n)) < 0.001
        )
        ratio = round_count / len(numbers)

        if ratio <= 0.6:
            return ratio, None

        percent = round(ratio * 100)
        return ratio, GranularityFlag(
            flag_type="excessive_round_numbers",
            severity="medium",
            description=(
                f"{percent}% of reported decimal values "
                "are whole numbers (X.0). Real measurement data "
                "rarely produces this pattern — suggests rounding "
                "or manual data entry."
            ),
            evidence=(
                f"{round_count}/{len(numbers)} values are "
                f"whole numbers ({percent}%)"
            ),
            suggestion=(
                "Report values to appropriate decimal precision. "
                "Avoid post-hoc rounding of raw measurements."
            ),
        )

    def _too_perfect_check(self, numbers: list):
        """
        If all reported values have identical decimal
        precision, this is suspicious — real data has
        natural variation.

        *numbers* holds decimal values, preferably as the original text
        tokens so the reported precision ("0.50" = 2 places) is kept;
        floats are accepted too. Returns a GranularityFlag when at
        least 6 values share one precision, otherwise None.
        """
        if len(numbers) < 6:
            return None

        fractions = [self._fraction_digits(n) for n in numbers]
        precisions = [len(frac) for frac in fractions if frac]
        if not precisions:
            return None

        if len(set(precisions)) != 1 or len(precisions) < 6:
            return None

        p = precisions[0]
        return GranularityFlag(
            flag_type="uniform_decimal_precision",
            severity="medium",
            description=(
                f"All {len(precisions)} decimal values reported to "
                f"exactly {p} decimal place(s). Real measurement "
                "data rarely has perfectly uniform precision — "
                "suggests post-processing or fabrication."
            ),
            evidence=(
                f"All values use exactly {p} decimal place(s) | "
                f"Count: {len(precisions)}"
            ),
            suggestion=(
                "Report values at their natural precision. "
                "Verify that uniform rounding was not applied."
            ),
        )

    def _risk(self, score: float, flag_count: int) -> str:
        """Map the aggregate score and flag count to a risk level."""
        if flag_count >= 3 or score >= 0.6:
            return "critical"
        if flag_count == 2 or score >= 0.4:
            return "high"
        if flag_count == 1 or score >= 0.2:
            return "medium"
        return "low"

    def _build_summary(self, score: float, level: str,
                       flag_count: int, num_count: int) -> str:
        """Build the one-line summary; *num_count* is the decimal count."""
        if num_count < 5:
            return (
                "Granularity Analysis: Insufficient numerical data "
                "for full analysis (minimum 5 decimal values required)."
            )
        pct = round(score * 100)
        return (
            f"Granularity analysis of {num_count} numerical values. "
            f"Anomaly score: {pct}%. "
            f"{flag_count} granularity concern(s) detected. "
            f"Risk level: {level.upper()}."
        )