Spaces:

Abu-Sameer-66
/

SciPeerAI-API

Sleeping

SciPeerAI-API / src /scipeerai /modules /data_fingerprint.py

Abu-Sameer-66

deploy: SciPeerAI v2.2.0

c94f46f 6 days ago

17.7 kB

	# Cross-Paper Data Fingerprinting
	# --------------------------------
	# Independent studies on the same topic will produce
	# similar but never identical numbers.
	# Random sampling variation guarantees this.
	#
	# When two papers report the exact same mean to four
	# decimal places, the same standard deviation, the same
	# sample size, and the same p-value — they are not
	# independent. One copied from the other, or both
	# copied from a shared fabricated source.
	#
	# This module extracts the numerical fingerprint of a
	# single paper: every mean, SD, sample size, percentage,
	# correlation, and p-value it reports.
	#
	# That fingerprint can then be compared against others.
	# But even in isolation, the fingerprint reveals problems:
	# numbers that are suspiciously round, values that are
	# mathematically impossible given each other, and
	# distributions of digits that do not look like real data.
	#
	# A paper's numbers should look like they came from
	# the world. When they look like they came from a
	# spreadsheet cell someone typed by hand — that is a signal.

	import re
	import math
	from dataclasses import dataclass, field
	from collections import Counter


	# ── data structures ────────────────────────────────────────────────────────────

	@dataclass
	class NumericFingerprint:
	    """Numeric values pulled out of one paper's text, grouped by statistic.

	    All fields are plain lists; values appear in the order they were
	    found in the text.
	    """

	    means: list          # floats reported as mean / average / M
	    std_devs: list       # floats reported as SD / S.D. / std / standard deviation
	    sample_sizes: list   # ints reported as N / sample size
	    percentages: list    # floats that were followed by a percent sign
	    correlations: list   # floats reported as r / correlation
	    p_values: list       # floats reported as p-values
	    all_decimals: list   # every decimal number in the text, as floats


	@dataclass
	class DataFingerprintFlag:
	    """One finding raised by a fingerprint check."""

	    flag_type: str     # machine-readable identifier, e.g. "round_number_bias"
	    severity: str      # severity label, e.g. "medium" or "high"
	    description: str   # human-readable statement of what was found
	    evidence: str      # the concrete values or statistics behind the finding
	    suggestion: str    # recommended follow-up for the author or reviewer


	@dataclass
	class DataFingerprintResult:
	    """Complete outcome of a fingerprint analysis of one paper."""

	    fingerprint: "NumericFingerprint"   # the extracted numeric values
	    total_numbers: int                  # count of labelled statistics extracted
	    round_number_ratio: float           # share of values judged suspiciously round
	    terminal_digit_bias: float          # 0..1 score derived from the chi-square test
	    impossible_pairs: list              # strings describing impossible values
	    suspicious_duplicates: list         # values repeated three or more times
	    fingerprint_score: float            # weighted overall score, capped at 1.0
	    risk_level: str                     # "low", "medium", "high" or "critical"
	    summary: str                        # one-paragraph human-readable summary
	    flags: list                         # DataFingerprintFlag items that were raised
	    flags_count: int                    # len(flags), stored for convenience


	# ── main class ────────────────────────────────────────────────────────────────

	class DataFingerprintAnalyzer:
	    """
	    Extracts the complete numerical fingerprint of a paper
	    and tests it for signs of fabrication or cloning.

	    Four detection layers:
	    1. Round number bias — fabricated data rounds too cleanly
	    2. Terminal digit bias — humans avoid certain ending digits
	    3. Impossible values — SD more than twice the mean on positive
	       scales, correlations outside [-1, 1], p-values outside [0, 1]
	    4. Suspicious internal duplicates — same value repeated too often
	    """

	    # Regex patterns for specific statistical values.
	    #
	    # Every label is anchored with a leading word boundary so that a short
	    # label cannot match the tail of a longer word ("mean = 12" must not
	    # be read as "n = 12", nor "error = 0.9" as "r = 0.9").
	    # Whitespace around the separator is optional so that both "M = 3.2"
	    # and "M=3.2" are recognised.
	    _MEAN_PATTERN = re.compile(
	        r'\b(?:mean|average|M)\s*[=:]\s*(-?\d+\.?\d*)', re.IGNORECASE
	    )
	    _SD_PATTERN = re.compile(
	        r'\b(?:SD|S\.D\.|std|standard deviation)\s*[=:]\s*(\d+\.?\d*)',
	        re.IGNORECASE
	    )
	    _N_PATTERN = re.compile(
	        r'\b(?:n|sample size)\s*[=:]\s*(\d+)', re.IGNORECASE
	    )
	    _PCT_PATTERN = re.compile(
	        r'(\d+\.?\d*)\s*%'
	    )
	    _CORR_PATTERN = re.compile(
	        r'\b(?:r|correlation)\s*[=:]\s*(-?\d*\.?\d+)', re.IGNORECASE
	    )
	    # Scientific notation is tried first; otherwise "0.5e-3" would be cut
	    # short and captured as 0.5.
	    _PVAL_PATTERN = re.compile(
	        r'\bp\s*[=<>]\s*(\d+\.\d+[eE][+-]?\d+|0?\.\d+)', re.IGNORECASE
	    )
	    # Any decimal number, used for digit-level analysis.
	    _DECIMAL_PATTERN = re.compile(r'(-?\d+\.\d+)')

	    # Values that legitimately recur (bounds, thresholds) and are therefore
	    # never reported as suspicious duplicates.
	    _COMMON_VALUES = frozenset((0.0, 1.0, 0.5, 100.0, 0.05))

	    def analyze(self, text: str) -> "DataFingerprintResult":
	        """
	        Run every detection layer on *text* and assemble the result.

	        ``None`` or empty text is treated as a paper with no numbers and
	        yields a zero-score result rather than raising.
	        """
	        fp = self._extract_fingerprint(text or "")
	        flags = []

	        # Each check appends its own flags; the call order fixes flag order.
	        round_ratio = self._check_round_number_bias(fp, flags)
	        terminal_bias = self._check_terminal_digit_bias(fp, flags)
	        impossible = self._check_impossible_pairs(fp, flags)
	        duplicates = self._check_suspicious_duplicates(fp, flags)

	        total = self._count_total(fp)
	        score = self._compute_score(
	            round_ratio, terminal_bias, impossible, duplicates, total
	        )
	        level = self._get_risk_level(score)

	        return DataFingerprintResult(
	            fingerprint=fp,
	            total_numbers=total,
	            round_number_ratio=round(round_ratio, 3),
	            terminal_digit_bias=round(terminal_bias, 3),
	            impossible_pairs=impossible,
	            suspicious_duplicates=duplicates,
	            fingerprint_score=round(score, 3),
	            risk_level=level,
	            summary=self._write_summary(flags, level, total),
	            flags=flags,
	            flags_count=len(flags),
	        )

	    # ── extraction ─────────────────────────────────────────────────────────

	    def _extract_fingerprint(self, text: str) -> "NumericFingerprint":
	        """Collect every labelled statistic and every decimal in *text*."""
	        return NumericFingerprint(
	            means=self._parse_floats(self._MEAN_PATTERN, text),
	            std_devs=self._parse_floats(self._SD_PATTERN, text),
	            sample_sizes=self._parse_ints(self._N_PATTERN, text),
	            percentages=self._parse_floats(self._PCT_PATTERN, text),
	            correlations=self._parse_floats(self._CORR_PATTERN, text),
	            p_values=self._parse_floats(self._PVAL_PATTERN, text),
	            # all decimal numbers in the paper for digit-level analysis
	            all_decimals=self._parse_floats(self._DECIMAL_PATTERN, text),
	        )

	    def _parse_floats(self, pattern: re.Pattern, text: str) -> list:
	        """Return group 1 of every match of *pattern* in *text* as floats.

	        Matches that cannot be converted are dropped.
	        """
	        parsed = (
	            self._safe_float(match.group(1))
	            for match in pattern.finditer(text)
	        )
	        return [val for val in parsed if val is not None]

	    def _parse_ints(self, pattern: re.Pattern, text: str) -> list:
	        """Return group 1 of every match of *pattern* in *text* as ints.

	        Only values in 1..1,000,000 are kept; anything else is discarded.
	        """
	        results = []
	        for match in pattern.finditer(text):
	            try:
	                val = int(match.group(1))
	            except (ValueError, IndexError):
	                continue
	            if 1 <= val <= 1_000_000:
	                results.append(val)
	        return results

	    def _safe_float(self, raw: str) -> "float | None":
	        """Convert *raw* to a finite float, or return None if that fails.

	        Overflowing literals parse to ``inf``; those are rejected as well
	        because the digit-level checks cannot work with them.
	        """
	        try:
	            val = float(raw.strip())
	        except (ValueError, AttributeError):
	            return None
	        return val if math.isfinite(val) else None

	    # ── detection checks ───────────────────────────────────────────────────

	    def _check_round_number_bias(
	        self, fp: "NumericFingerprint", flags: list
	    ) -> float:
	        """
	        Real data does not round to whole numbers or .5 steps very often.
	        When 60% or more of reported values are suspiciously round,
	        someone likely typed them rather than computed them.

	        Looks at means, SDs and percentages; needs at least four values.
	        Appends a flag to *flags* when triggered and returns the ratio of
	        round values (0.0 when there is too little data).
	        """
	        all_vals = fp.means + fp.std_devs + fp.percentages
	        if len(all_vals) < 4:
	            return 0.0

	        def is_round(v: float) -> bool:
	            # whole number, or a one-decimal value whose text ends in 0 or 5
	            return v == round(v, 0) or (
	                v == round(v, 1) and str(v).endswith(('0', '5'))
	            )

	        round_count = sum(1 for v in all_vals if is_round(v))
	        ratio = round_count / len(all_vals)

	        if ratio >= 0.60:
	            flags.append(DataFingerprintFlag(
	                flag_type="round_number_bias",
	                severity="medium",
	                description=(
	                    f"{round_count}/{len(all_vals)} reported values "
	                    f"({round(ratio * 100, 1)}%) are suspiciously round. "
	                    f"Real measured data rarely rounds this cleanly."
	                ),
	                evidence=(
	                    f"Round values detected among means, SDs, and percentages. "
	                    f"Round ratio: {round(ratio, 3)}."
	                ),
	                suggestion=(
	                    "Verify that reported values are directly from analysis "
	                    "output, not manually entered approximations."
	                ),
	            ))

	        return ratio

	    def _check_terminal_digit_bias(
	        self, fp: "NumericFingerprint", flags: list
	    ) -> float:
	        """
	        The last digit of a truly random number is uniformly distributed
	        across 0-9. Humans fabricating numbers unconsciously prefer
	        certain digits (0, 5) and avoid others (7, 9).
	        A chi-square test on terminal digits detects this.

	        Uses every decimal in the paper plus the sample sizes; needs at
	        least ten numbers. Returns chi-square / 50 capped at 1.0 (0.0 when
	        there is too little data) and appends a flag to *flags* when the
	        statistic reaches the 5% critical value.
	        """
	        # Sample sizes are rendered as integers: converting them to float
	        # first would make every one of them end in ".0" and fake a
	        # strong bias toward the digit 0.
	        # NOTE: decimals are stored as floats, so trailing zeros that were
	        # printed in the paper ("3.50") are not visible here.
	        rendered = [str(abs(v)) for v in fp.all_decimals]
	        rendered += [str(abs(int(n))) for n in fp.sample_sizes]
	        if len(rendered) < 10:
	            return 0.0

	        # Skip anything whose text does not end in a digit (inf / nan).
	        terminals = [int(s[-1]) for s in rendered if s[-1].isdigit()]
	        if not terminals:
	            return 0.0

	        counter = Counter(terminals)
	        expected = len(terminals) / 10.0
	        chi_sq = sum(
	            ((counter.get(d, 0) - expected) ** 2) / expected
	            for d in range(10)
	        )

	        # chi-square critical value at p=0.05 with 9 df is 16.92
	        bias_score = min(chi_sq / 50.0, 1.0)

	        if chi_sq >= 16.92:
	            dominant_digit = counter.most_common(1)[0]
	            flags.append(DataFingerprintFlag(
	                flag_type="terminal_digit_bias",
	                severity="medium",
	                description=(
	                    f"Terminal digit distribution deviates significantly "
	                    f"from uniform expectation. "
	                    f"Chi-square statistic: {round(chi_sq, 2)} "
	                    f"(critical value: 16.92). "
	                    f"This pattern is consistent with human number fabrication."
	                ),
	                evidence=(
	                    f"Most frequent terminal digit: "
	                    f"'{dominant_digit[0]}' appears {dominant_digit[1]} times. "
	                    f"Expected uniform frequency: {round(expected, 1)} each."
	                ),
	                suggestion=(
	                    "Re-examine raw data files to confirm reported values "
	                    "match analysis output. Terminal digit bias is a "
	                    "well-established fabrication marker."
	                ),
	            ))

	        return round(bias_score, 3)

	    def _check_impossible_pairs(
	        self, fp: "NumericFingerprint", flags: list
	    ) -> list:
	        """
	        Statistical relationships constrain what values can coexist.
	        An SD more than twice a positive mean is implausible on bounded
	        positive scales.
	        Correlation outside [-1, 1] is mathematically impossible.
	        P-value outside [0, 1] cannot exist.

	        Returns a list of strings describing each offending value and
	        appends one flag per category to *flags*.
	        """
	        impossible = []

	        # SD vs mean for positive scales (Likert 1-7, reaction times, etc.)
	        # NOTE(review): means and SDs are extracted independently and are
	        # paired here purely by position; a mean reported without an SD
	        # shifts every later pair. Confirm this is acceptable.
	        for mean, sd in zip(fp.means, fp.std_devs):
	            if mean > 0 and sd > mean * 2:
	                impossible.append(f"M={mean}, SD={sd}")

	        if impossible:
	            flags.append(DataFingerprintFlag(
	                flag_type="impossible_sd_mean_pair",
	                severity="high",
	                description=(
	                    f"{len(impossible)} mean/SD pair(s) where the standard "
	                    f"deviation is implausibly large relative to the mean. "
	                    f"For bounded positive scales, SD > 2*mean is suspicious."
	                ),
	                evidence=f"Impossible pairs: {impossible[:3]}.",
	                suggestion=(
	                    "Verify these values against the original analysis output. "
	                    "Large SDs relative to means may indicate data entry error "
	                    "or scale confusion."
	                ),
	            ))

	        # correlation outside valid range
	        bad_corr = [r for r in fp.correlations if abs(r) > 1.0]
	        if bad_corr:
	            impossible.extend([f"r={r}" for r in bad_corr])
	            flags.append(DataFingerprintFlag(
	                flag_type="impossible_correlation",
	                severity="high",
	                description=(
	                    f"{len(bad_corr)} correlation value(s) outside [-1, 1]. "
	                    f"These values are mathematically impossible."
	                ),
	                evidence=f"Invalid correlations: {bad_corr}.",
	                suggestion="Correct these values before submission.",
	            ))

	        # p-value outside [0, 1]
	        bad_p = [p for p in fp.p_values if p < 0 or p > 1]
	        if bad_p:
	            impossible.extend([f"p={p}" for p in bad_p])
	            flags.append(DataFingerprintFlag(
	                flag_type="impossible_p_value",
	                severity="high",
	                description=(
	                    f"{len(bad_p)} p-value(s) outside [0, 1]. "
	                    f"These values cannot exist."
	                ),
	                evidence=f"Invalid p-values: {bad_p}.",
	                suggestion="Check analysis code for unit or scale errors.",
	            ))

	        return impossible

	    def _check_suspicious_duplicates(
	        self, fp: "NumericFingerprint", flags: list
	    ) -> list:
	        """
	        The same specific decimal value appearing 3+ times in a paper
	        is unusual unless it is a threshold or constant.
	        In fabricated data, a single invented number gets reused.

	        Looks at means, SDs, percentages and correlations; needs at least
	        six values. Returns the repeated values and appends a flag to
	        *flags* when any are found.
	        """
	        all_vals = fp.means + fp.std_devs + fp.percentages + fp.correlations
	        if len(all_vals) < 6:
	            return []

	        duplicates = [
	            v for v, count in Counter(all_vals).items()
	            if count >= 3 and v not in self._COMMON_VALUES
	        ]

	        if duplicates:
	            flags.append(DataFingerprintFlag(
	                flag_type="suspicious_value_repetition",
	                severity="medium",
	                description=(
	                    f"{len(duplicates)} specific value(s) appear 3 or more "
	                    f"times across different reported statistics. "
	                    f"Genuine independent measurements rarely share "
	                    f"exact decimal values."
	                ),
	                evidence=(
	                    f"Repeated values: "
	                    f"{[round(v, 4) for v in duplicates[:5]]}."
	                ),
	                suggestion=(
	                    "Verify that repeated values reflect genuinely "
	                    "identical measurements and are not copy-paste artifacts."
	                ),
	            ))

	        return duplicates

	    # ── helpers ────────────────────────────────────────────────────────────

	    def _count_total(self, fp: "NumericFingerprint") -> int:
	        """Count the labelled statistics (``all_decimals`` is not included)."""
	        return (
	            len(fp.means) + len(fp.std_devs) + len(fp.sample_sizes) +
	            len(fp.percentages) + len(fp.correlations) + len(fp.p_values)
	        )

	    # ── scoring ────────────────────────────────────────────────────────────

	    def _compute_score(
	        self,
	        round_ratio: float,
	        terminal_bias: float,
	        impossible: list,
	        duplicates: list,
	        total: int,
	    ) -> float:
	        """
	        Combine the four layer outputs into one score in [0, 1].

	        Weights: round-number 0.25, terminal-digit 0.25, impossible values
	        0.35 (0.25 per item, capped), duplicates 0.15 (0.15 per item,
	        capped). Returns 0.0 when no statistics were extracted.
	        """
	        if total == 0:
	            return 0.0

	        impossible_score = min(len(impossible) * 0.25, 1.0)
	        duplicate_score = min(len(duplicates) * 0.15, 1.0)

	        score = (
	            round_ratio * 0.25 +
	            terminal_bias * 0.25 +
	            impossible_score * 0.35 +
	            duplicate_score * 0.15
	        )
	        return min(score, 1.0)

	    def _get_risk_level(self, score: float) -> str:
	        """Map a score to "critical" / "high" / "medium" / "low"."""
	        if score >= 0.70:
	            return "critical"
	        if score >= 0.45:
	            return "high"
	        if score >= 0.25:
	            return "medium"
	        return "low"

	    def _write_summary(
	        self, flags: list, risk_level: str, total: int
	    ) -> str:
	        """Build the one-paragraph summary shown to the user."""
	        if total == 0:
	            return (
	                "Data Fingerprint Analysis: No statistical values extracted. "
	                "Include explicit M=, SD=, N=, r=, and p= reporting "
	                f"for full analysis. Risk level: {risk_level.upper()}."
	            )

	        if not flags:
	            return (
	                f"Data Fingerprint Analysis: {total} statistical value(s) "
	                f"analyzed. No fabrication signals detected. "
	                f"Numerical patterns appear consistent with genuine data. "
	                f"Risk level: {risk_level.upper()}."
	            )

	        # Counts are per flag (one flag per category), not per value.
	        high = sum(1 for f in flags if f.severity == "high")
	        medium = sum(1 for f in flags if f.severity == "medium")
	        parts = []
	        if high:
	            parts.append(
	                f"{high} impossible value{'s' if high > 1 else ''} detected"
	            )
	        if medium:
	            parts.append(
	                f"{medium} fabrication signal{'s' if medium > 1 else ''} found"
	            )

	        return (
	            f"Data Fingerprint Analysis: {total} value(s) analyzed. "
	            f"{'; '.join(parts)}. "
	            f"Risk level: {risk_level.upper()}."
	        )