Spaces:

Abu-Sameer-66
/

SciPeerAI-API

Sleeping

SciPeerAI-API / src /scipeerai /modules /granularity_analyzer.py

Abu-Sameer-66

feat: add Granularity Analyzer - Benford Law + digit preference detector

f2c0631 29 days ago

11.2 kB

	# src/scipeerai/modules/granularity_analyzer.py
	#
	# Statistical Granularity Analyzer
	# Detects: digit preference, too-perfect variance,
	# Benford's Law violations, suspiciously round numbers.
	#
	# Fabricated data tends to look "too clean" —
	# real data has natural messiness. This module
	# catches papers where numbers look manufactured.

	import re
	import math
	import collections
	from dataclasses import dataclass, field


	@dataclass
	class GranularityFlag:
	flag_type: str
	severity: str
	description: str
	evidence: str
	suggestion: str


	@dataclass
	class GranularityResult:
	digit_preference_score: float
	benford_score: float
	round_number_ratio: float
	granularity_score: float
	risk_level: str
	summary: str
	flags: list = field(default_factory=list)
	flags_count: int = 0


	class GranularityAnalyzer:
	"""
	Statistical Granularity Analyzer.
	Real data has natural digit distribution.
	Fabricated data shows digit preference (e.g. too many 0s and 5s)
	and first-digit anomalies (Benford's Law violations).
	"""

	# extract all decimal numbers from text
	NUMBER_PAT = re.compile(r'\b\d+\.\d+\b')
	INTEGER_PAT = re.compile(r'\b\d{2,}\b')

	# Benford's Law expected first-digit distribution
	BENFORD_EXPECTED = {
	1: 0.301, 2: 0.176, 3: 0.125, 4: 0.097,
	5: 0.079, 6: 0.067, 7: 0.058, 8: 0.051, 9: 0.046
	}

	def analyze(self, text: str) -> GranularityResult:
	decimals = [float(x) for x in self.NUMBER_PAT.findall(text)]
	integers = [int(x) for x in self.INTEGER_PAT.findall(text)
	if 10 <= int(x) <= 99999]
	all_nums = decimals + [float(x) for x in integers]

	flags = []

	# ── 1. Digit Preference ───────────────────────────────────
	dp_score, dp_flag = self._digit_preference(decimals)
	if dp_flag:
	flags.append(dp_flag)

	# ── 2. Benford's Law ──────────────────────────────────────
	bf_score, bf_flag = self._benford_check(all_nums)
	if bf_flag:
	flags.append(bf_flag)

	# ── 3. Round Number Ratio ─────────────────────────────────
	rn_ratio, rn_flag = self._round_number_check(decimals)
	if rn_flag:
	flags.append(rn_flag)

	# ── 4. Too-Perfect Variance ───────────────────────────────
	tp_flag = self._too_perfect_check(decimals)
	if tp_flag:
	flags.append(tp_flag)

	# ── Aggregate Score ───────────────────────────────────────
	components = [dp_score, bf_score, rn_ratio]
	score = round(sum(components) / len(components), 4)
	level = self._risk(score, len(flags))
	summary = self._build_summary(score, level, len(flags), len(decimals))

	return GranularityResult(
	digit_preference_score = round(dp_score, 4),
	benford_score = round(bf_score, 4),
	round_number_ratio = round(rn_ratio, 4),
	granularity_score = score,
	risk_level = level,
	summary = summary,
	flags = flags,
	flags_count = len(flags),
	)

	# ── internal helpers ─────────────────────────────────────────

	def _digit_preference(self, numbers: list):
	"""
	Check last digits of decimal numbers.
	Real data: uniform distribution across 0-9.
	Fabricated data: too many 0s and 5s.
	"""
	if len(numbers) < 5:
	return 0.0, None

	last_digits = []
	for n in numbers:
	s = str(n)
	if '.' in s:
	last_digits.append(int(s[-1]))

	if not last_digits:
	return 0.0, None

	counts = collections.Counter(last_digits)
	total = len(last_digits)
	zero_five = (counts.get(0, 0) + counts.get(5, 0)) / total
	expected = 0.2 # 2 out of 10 digits

	score = min((zero_five - expected) / 0.4, 1.0) if zero_five > expected else 0.0
	score = max(score, 0.0)

	if zero_five > 0.45:
	return score, GranularityFlag(
	flag_type = "digit_preference_detected",
	severity = "high" if zero_five > 0.6 else "medium",
	description = (
	f"Unusual digit preference detected. "
	f"{round(zero_five * 100)}% of decimal values end in "
	f"0 or 5 — expected ~20% in real data. "
	f"Suggests manually entered or rounded values."
	),
	evidence = (
	f"Last-digit analysis: {round(zero_five * 100)}% "
	f"end in 0 or 5 (expected: ~20%) \| "
	f"Sample: {last_digits[:10]}"
	),
	suggestion = (
	"Report raw unrounded values. Verify that "
	"data was not manually entered or post-hoc rounded."
	),
	)
	return score, None

	def _benford_check(self, numbers: list):
	"""
	Benford's Law: first digits of naturally occurring
	numbers follow a logarithmic distribution.
	Violations suggest fabrication.
	"""
	valid = [n for n in numbers if n >= 1]
	if len(valid) < 10:
	return 0.0, None

	first_digits = [int(str(abs(n)).replace('.', '')[0])
	for n in valid if str(abs(n)).replace('.', '')[0] != '0']
	if not first_digits:
	return 0.0, None

	counts = collections.Counter(first_digits)
	total = len(first_digits)

	# Chi-square distance from Benford
	chi_sq = 0.0
	for d in range(1, 10):
	observed = counts.get(d, 0) / total
	expected = self.BENFORD_EXPECTED[d]
	chi_sq += ((observed - expected) ** 2) / expected

	# normalize to 0-1
	score = min(chi_sq / 15.0, 1.0)

	if score > 0.4:
	return score, GranularityFlag(
	flag_type = "benford_law_violation",
	severity = "high" if score > 0.7 else "medium",
	description = (
	f"First-digit distribution deviates from Benford's Law. "
	f"Naturally occurring datasets follow a predictable "
	f"logarithmic distribution — deviation suggests "
	f"non-natural or fabricated data."
	),
	evidence = (
	f"Chi-square deviation: {round(chi_sq, 3)} "
	f"(threshold: 6.0) \| "
	f"First digits analyzed: {total}"
	),
	suggestion = (
	"Verify data collection process. Large Benford "
	"violations in financial or count data are a "
	"strong fabrication signal."
	),
	)
	return score, None

	def _round_number_check(self, numbers: list):
	"""
	Too many round numbers (X.0, X.00) suggests
	manual entry or fabrication.
	"""
	if len(numbers) < 5:
	return 0.0, None

	round_count = sum(1 for n in numbers
	if abs(n - round(n)) < 0.001)
	ratio = round_count / len(numbers)

	if ratio > 0.6:
	return ratio, GranularityFlag(
	flag_type = "excessive_round_numbers",
	severity = "medium",
	description = (
	f"{round(ratio * 100)}% of reported decimal values "
	f"are whole numbers (X.0). Real measurement data "
	f"rarely produces this pattern — suggests rounding "
	f"or manual data entry."
	),
	evidence = (
	f"{round_count}/{len(numbers)} values are "
	f"whole numbers ({round(ratio * 100)}%)"
	),
	suggestion = (
	"Report values to appropriate decimal precision. "
	"Avoid post-hoc rounding of raw measurements."
	),
	)
	return ratio, None

	def _too_perfect_check(self, numbers: list):
	"""
	If all reported values have identical decimal precision,
	this is suspicious — real data has natural variation.
	"""
	if len(numbers) < 6:
	return None

	precisions = []
	for n in numbers:
	s = str(n)
	if '.' in s:
	precisions.append(len(s.split('.')[1]))

	if not precisions:
	return None

	unique_precisions = len(set(precisions))
	if unique_precisions == 1 and len(precisions) >= 6:
	p = precisions[0]
	return GranularityFlag(
	flag_type = "uniform_decimal_precision",
	severity = "medium",
	description = (
	f"All {len(precisions)} decimal values reported to "
	f"exactly {p} decimal place(s). Real measurement "
	f"data rarely has perfectly uniform precision — "
	f"suggests post-processing or fabrication."
	),
	evidence = (
	f"All values use exactly {p} decimal place(s) \| "
	f"Count: {len(precisions)}"
	),
	suggestion = (
	"Report values at their natural precision. "
	"Verify that uniform rounding was not applied."
	),
	)
	return None

	def _risk(self, score: float, flag_count: int) -> str:
	if flag_count >= 3 or score >= 0.6:
	return "critical"
	if flag_count == 2 or score >= 0.4:
	return "high"
	if flag_count == 1 or score >= 0.2:
	return "medium"
	return "low"

	def _build_summary(self, score: float, level: str,
	flag_count: int, num_count: int) -> str:
	if num_count < 5:
	return (
	"Granularity Analysis: Insufficient numerical data "
	"for full analysis (minimum 5 decimal values required)."
	)
	pct = round(score * 100)
	return (
	f"Granularity analysis of {num_count} numerical values. "
	f"Anomaly score: {pct}%. "
	f"{flag_count} granularity concern(s) detected. "
	f"Risk level: {level.upper()}."
	)