Spaces:
Sleeping
Sleeping
| # Cross-Paper Data Fingerprinting | |
| # -------------------------------- | |
| # Independent studies on the same topic will produce | |
| # similar but never identical numbers. | |
| # Random sampling variation guarantees this. | |
| # | |
| # When two papers report the exact same mean to four | |
| # decimal places, the same standard deviation, the same | |
| # sample size, and the same p-value — they are not | |
| # independent. One copied from the other, or both | |
| # copied from a shared fabricated source. | |
| # | |
| # This module extracts the numerical fingerprint of a | |
| # single paper: every mean, SD, sample size, percentage, | |
| # correlation, and p-value it reports. | |
| # | |
| # That fingerprint can then be compared against others. | |
| # But even in isolation, the fingerprint reveals problems: | |
| # numbers that are suspiciously round, values that are | |
| # mathematically impossible given each other, and | |
| # distributions of digits that do not look like real data. | |
| # | |
| # A paper's numbers should look like they came from | |
| # the world. When they look like they came from a | |
| # spreadsheet cell someone typed by hand — that is a signal. | |
import math
import re
from collections import Counter
from dataclasses import dataclass, field
from typing import Optional
| # ── data structures ──────────────────────────────────────────────────────── | |
@dataclass
class NumericFingerprint:
    """Every statistical value extracted from one paper, grouped by kind.

    The analyzer builds this with keyword arguments, so the class must be a
    dataclass: without the decorator the annotations below do not create an
    ``__init__`` and construction raises ``TypeError``.
    """

    # Floats captured after "mean" / "average" / "M" labels.
    means: list
    # Floats captured after "SD" / "S.D." / "std" / "standard deviation".
    std_devs: list
    # Integers captured after "N" / "sample size" labels.
    sample_sizes: list
    # Floats that are immediately followed by a percent sign.
    percentages: list
    # Floats captured after "r" / "correlation" labels.
    correlations: list
    # Floats captured after "p =", "p <" or "p >".
    p_values: list
    # Every decimal number found in the text, used for digit-level analysis.
    all_decimals: list
@dataclass
class DataFingerprintFlag:
    """One fabrication signal raised by a detection check.

    Flags are created with keyword arguments, which requires the dataclass
    decorator to generate ``__init__``.
    """

    # Machine-readable identifier of the check, e.g. "round_number_bias".
    flag_type: str
    # "medium" or "high" in the checks that create flags.
    severity: str
    # Human-readable statement of what was detected.
    description: str
    # The concrete values or statistics that triggered the flag.
    evidence: str
    # Recommended follow-up action for the author or reviewer.
    suggestion: str
@dataclass
class DataFingerprintResult:
    """Complete outcome of one fingerprint analysis run.

    The analyzer constructs this with keyword arguments, which requires the
    dataclass decorator to generate ``__init__``.
    """

    # The extracted values the analysis was based on.
    fingerprint: "NumericFingerprint"
    # Count of labelled statistics (means, SDs, Ns, %, r, p).
    total_numbers: int
    # Share of means/SDs/percentages that are whole or .5 values.
    round_number_ratio: float
    # Scaled chi-square statistic of the terminal digits, capped at 1.0.
    terminal_digit_bias: float
    # Descriptions of value pairs/values that cannot be valid.
    impossible_pairs: list
    # Values repeated three or more times across reported statistics.
    suspicious_duplicates: list
    # Weighted combination of the four checks, capped at 1.0.
    fingerprint_score: float
    # "low", "medium", "high" or "critical".
    risk_level: str
    # One-paragraph human-readable summary.
    summary: str
    # Every DataFingerprintFlag raised during the run.
    flags: list
    # Number of entries in ``flags``.
    flags_count: int
| # ── main class ───────────────────────────────────────────────────────────── | |
class DataFingerprintAnalyzer:
    """
    Extracts the complete numerical fingerprint of a paper
    and tests it for signs of fabrication or cloning.

    Four detection layers:
    1. Round number bias — fabricated data rounds too cleanly
    2. Terminal digit bias — humans avoid certain ending digits
    3. Impossible value pairs — SD implausibly large relative to the mean,
       correlations outside [-1, 1], p-values outside [0, 1]
    4. Suspicious internal duplicates — same value repeated too often
    """

    # Regex patterns for specific statistical values.
    #
    # Every label is anchored with \b. Without it the single-letter labels
    # match the tail of ordinary words: "mean = 5" was read as "n = 5",
    # "OR = 2.5" / "error = 0.5" as a correlation, "sum = 3" as a mean.
    _MEAN_PATTERN = re.compile(
        r'\b(?:mean|average|M)\s*[=:]\s*(-?\d+\.?\d*)', re.IGNORECASE
    )
    _SD_PATTERN = re.compile(
        r'\b(?:SD|S\.D\.|std|standard deviation)\s*[=:]\s*(\d+\.?\d*)',
        re.IGNORECASE
    )
    # Accepts thousands separators ("1,200") and refuses to take the integer
    # part of a decimal ("n = 12.5" is not a sample size of 12).
    _N_PATTERN = re.compile(
        r'\b(?:N|sample size)\s*[=:]\s*'
        r'(\d{1,3}(?:,\d{3})+|\d+)(?!\d|\.\d)',
        re.IGNORECASE
    )
    _PCT_PATTERN = re.compile(
        r'(\d+\.?\d*)\s*%'
    )
    _CORR_PATTERN = re.compile(
        r'\b(?:r|correlation)\s*[=:]\s*(-?\d*\.?\d+)', re.IGNORECASE
    )
    # One alternative with an optional exponent: the previous two-branch form
    # truncated "0.5e-3" to 0.5 and could never capture a plain value above 1,
    # which left the impossible-p-value check with nothing to find.
    _PVAL_PATTERN = re.compile(
        r'\bp\s*[=<>]\s*(\d*\.\d+(?:[eE][+-]?\d+)?)', re.IGNORECASE
    )
    # Any decimal number, for digit-level analysis.
    _DECIMAL_PATTERN = re.compile(r'-?\d+\.\d+')

    def analyze(self, text: str) -> "DataFingerprintResult":
        """Run all four checks on ``text`` and return the combined result.

        ``None`` is treated as empty text so the caller gets a "no values
        extracted" result instead of a ``TypeError`` from the regex engine.
        """
        text = text or ""
        fp = self._extract_fingerprint(text)
        flags = []

        # Each check appends its own flags and returns its raw measurement.
        round_ratio = self._check_round_number_bias(fp, flags)
        terminal_bias = self._check_terminal_digit_bias(fp, flags)
        impossible = self._check_impossible_pairs(fp, flags)
        duplicates = self._check_suspicious_duplicates(fp, flags)

        total = self._count_total(fp)
        score = self._compute_score(
            round_ratio, terminal_bias, impossible, duplicates, total
        )
        level = self._get_risk_level(score)

        return DataFingerprintResult(
            fingerprint=fp,
            total_numbers=total,
            round_number_ratio=round(round_ratio, 3),
            terminal_digit_bias=round(terminal_bias, 3),
            impossible_pairs=impossible,
            suspicious_duplicates=duplicates,
            fingerprint_score=round(score, 3),
            risk_level=level,
            summary=self._write_summary(flags, level, total),
            flags=flags,
            flags_count=len(flags),
        )

    # ── extraction ──────────────────────────────────────────────────────────

    def _extract_fingerprint(self, text: str) -> "NumericFingerprint":
        """Collect every labelled statistic and every decimal in ``text``."""
        # The decimal pattern only matches digits '.' digits, which float()
        # always accepts, so no per-match validation is needed.
        all_decimals = [
            float(m.group()) for m in self._DECIMAL_PATTERN.finditer(text)
        ]
        return NumericFingerprint(
            means=self._parse_floats(self._MEAN_PATTERN, text),
            std_devs=self._parse_floats(self._SD_PATTERN, text),
            sample_sizes=self._parse_ints(self._N_PATTERN, text),
            percentages=self._parse_floats(self._PCT_PATTERN, text),
            correlations=self._parse_floats(self._CORR_PATTERN, text),
            p_values=self._parse_floats(self._PVAL_PATTERN, text),
            all_decimals=all_decimals,
        )

    def _parse_floats(self, pattern: re.Pattern, text: str) -> list:
        """Return group 1 of every ``pattern`` match in ``text`` as a float.

        Matches whose captured text cannot be parsed are skipped.
        """
        parsed = (
            self._safe_float(match.group(1)) for match in pattern.finditer(text)
        )
        return [value for value in parsed if value is not None]

    def _parse_ints(self, pattern: re.Pattern, text: str) -> list:
        """Return group 1 of every ``pattern`` match in ``text`` as an int.

        Thousands separators are removed before conversion. Only values in
        1..1,000,000 are kept; anything else is dropped as implausible for a
        sample size.
        """
        results = []
        for match in pattern.finditer(text):
            try:
                val = int(match.group(1).replace(',', ''))
            except (ValueError, IndexError):
                continue
            if 1 <= val <= 1_000_000:
                results.append(val)
        return results

    def _safe_float(self, raw: str) -> Optional[float]:
        """Parse ``raw`` as a float; return ``None`` when it is not a number."""
        try:
            return float(raw.strip())
        except (ValueError, AttributeError):
            return None

    @staticmethod
    def _terminal_digit(value: float) -> Optional[int]:
        """Return the last significant digit of ``value`` as written by repr.

        Only the mantissa is inspected, so a value Python prints in scientific
        notation (``1e-05``) yields its real final digit (1) rather than a
        digit of the exponent. Non-finite values return ``None``.
        """
        number = abs(float(value))
        if not math.isfinite(number):
            return None
        mantissa = repr(number).split('e')[0]
        return int(mantissa.replace('.', '')[-1])

    # ── detection checks ────────────────────────────────────────────────────

    def _check_round_number_bias(
        self, fp: "NumericFingerprint", flags: list
    ) -> float:
        """
        Real data does not round to whole numbers or .5 steps very often.
        When 60% or more of reported values are suspiciously round,
        someone likely typed them rather than computed them.

        Returns the share of round values among means, SDs and percentages
        (0.0 when fewer than four values are available) and appends a
        "round_number_bias" flag when the threshold is reached.
        """
        all_vals = fp.means + fp.std_devs + fp.percentages
        if len(all_vals) < 4:
            return 0.0

        # A value is "round" when it is a whole number or an exact .5 step,
        # i.e. when doubling it gives an integer.
        round_count = sum(1 for v in all_vals if (2 * v).is_integer())
        ratio = round_count / len(all_vals)

        if ratio >= 0.60:
            flags.append(DataFingerprintFlag(
                flag_type="round_number_bias",
                severity="medium",
                description=(
                    f"{round_count}/{len(all_vals)} reported values "
                    f"({round(ratio * 100, 1)}%) are suspiciously round. "
                    f"Real measured data rarely rounds this cleanly."
                ),
                evidence=(
                    f"Round values detected among means, SDs, and percentages. "
                    f"Round ratio: {round(ratio, 3)}."
                ),
                suggestion=(
                    "Verify that reported values are directly from analysis "
                    "output, not manually entered approximations."
                ),
            ))
        return ratio

    def _check_terminal_digit_bias(
        self, fp: "NumericFingerprint", flags: list
    ) -> float:
        """
        The last digit of a truly random number is uniformly distributed
        across 0-9. Humans fabricating numbers unconsciously prefer
        certain digits (0, 5) and avoid others (7, 9).
        A chi-square test on terminal digits detects this.

        Returns the chi-square statistic divided by 50 and capped at 1.0
        (0.0 when fewer than ten digits are available) and appends a
        "terminal_digit_bias" flag when the statistic reaches 16.92.
        """
        terminals = []
        for value in fp.all_decimals:
            digit = self._terminal_digit(value)
            if digit is not None:
                terminals.append(digit)
        # Sample sizes stay integers: converting them to float made every one
        # of them end in ".0" and contribute a spurious terminal digit of 0.
        terminals.extend(abs(int(n)) % 10 for n in fp.sample_sizes)

        if len(terminals) < 10:
            return 0.0

        counter = Counter(terminals)
        expected = len(terminals) / 10.0
        chi_sq = sum(
            ((counter.get(d, 0) - expected) ** 2) / expected
            for d in range(10)
        )

        # chi-square critical value at p=0.05 with 9 df is 16.92
        bias_score = min(chi_sq / 50.0, 1.0)
        if chi_sq >= 16.92:
            dominant_digit = counter.most_common(1)[0]
            flags.append(DataFingerprintFlag(
                flag_type="terminal_digit_bias",
                severity="medium",
                description=(
                    f"Terminal digit distribution deviates significantly "
                    f"from uniform expectation. "
                    f"Chi-square statistic: {round(chi_sq, 2)} "
                    f"(critical value: 16.92). "
                    f"This pattern is consistent with human number fabrication."
                ),
                evidence=(
                    f"Most frequent terminal digit: "
                    f"'{dominant_digit[0]}' appears {dominant_digit[1]} times. "
                    f"Expected uniform frequency: {round(expected, 1)} each."
                ),
                suggestion=(
                    "Re-examine raw data files to confirm reported values "
                    "match analysis output. Terminal digit bias is a "
                    "well-established fabrication marker."
                ),
            ))
        return round(bias_score, 3)

    def _check_impossible_pairs(
        self, fp: "NumericFingerprint", flags: list
    ) -> list:
        """
        Statistical relationships constrain what values can coexist.
        An SD more than twice a positive mean is treated as implausible.
        Correlation outside [-1, 1] is mathematically impossible.
        P-value outside [0, 1] cannot exist.

        Means and SDs are paired by position, i.e. the i-th mean found in the
        text is compared with the i-th SD. Returns the list of offending
        values as display strings and appends one "high" flag per category.
        """
        impossible = []

        # SD far larger than the mean on positive scales.
        for mean, sd in zip(fp.means, fp.std_devs):
            if mean > 0 and sd > mean * 2:
                impossible.append(f"M={mean}, SD={sd}")
        if impossible:
            flags.append(DataFingerprintFlag(
                flag_type="impossible_sd_mean_pair",
                severity="high",
                description=(
                    f"{len(impossible)} mean/SD pair(s) where the standard "
                    f"deviation is implausibly large relative to the mean. "
                    f"For bounded positive scales, SD > 2*mean is suspicious."
                ),
                evidence=f"Impossible pairs: {impossible[:3]}.",
                suggestion=(
                    "Verify these values against the original analysis output. "
                    "Large SDs relative to means may indicate data entry error "
                    "or scale confusion."
                ),
            ))

        # correlation outside valid range
        bad_corr = [r for r in fp.correlations if abs(r) > 1.0]
        if bad_corr:
            impossible.extend(f"r={r}" for r in bad_corr)
            flags.append(DataFingerprintFlag(
                flag_type="impossible_correlation",
                severity="high",
                description=(
                    f"{len(bad_corr)} correlation value(s) outside [-1, 1]. "
                    f"These values are mathematically impossible."
                ),
                evidence=f"Invalid correlations: {bad_corr}.",
                suggestion="Correct these values before submission.",
            ))

        # p-value outside [0, 1]
        bad_p = [p for p in fp.p_values if p < 0 or p > 1]
        if bad_p:
            impossible.extend(f"p={p}" for p in bad_p)
            flags.append(DataFingerprintFlag(
                flag_type="impossible_p_value",
                severity="high",
                description=(
                    f"{len(bad_p)} p-value(s) outside [0, 1]. "
                    f"These values cannot exist."
                ),
                evidence=f"Invalid p-values: {bad_p}.",
                suggestion="Check analysis code for unit or scale errors.",
            ))
        return impossible

    def _check_suspicious_duplicates(
        self, fp: "NumericFingerprint", flags: list
    ) -> list:
        """
        The same specific decimal value appearing 3+ times in a paper
        is unusual unless it is a threshold or constant.
        In fabricated data, a single invented number gets reused.

        Returns the repeated values (empty when fewer than six values are
        available) and appends a "suspicious_value_repetition" flag when any
        are found. Common constants (0, 1, 0.5, 100, 0.05) are ignored.
        """
        all_vals = fp.means + fp.std_devs + fp.percentages + fp.correlations
        if len(all_vals) < 6:
            return []

        benign_constants = (0.0, 1.0, 0.5, 100.0, 0.05)
        duplicates = [
            v for v, count in Counter(all_vals).items()
            if count >= 3 and v not in benign_constants
        ]
        if duplicates:
            flags.append(DataFingerprintFlag(
                flag_type="suspicious_value_repetition",
                severity="medium",
                description=(
                    f"{len(duplicates)} specific value(s) appear 3 or more "
                    f"times across different reported statistics. "
                    f"Genuine independent measurements rarely share "
                    f"exact decimal values."
                ),
                evidence=(
                    f"Repeated values: "
                    f"{[round(v, 4) for v in duplicates[:5]]}."
                ),
                suggestion=(
                    "Verify that repeated values reflect genuinely "
                    "identical measurements and are not copy-paste artifacts."
                ),
            ))
        return duplicates

    # ── helpers ─────────────────────────────────────────────────────────────

    def _count_total(self, fp: "NumericFingerprint") -> int:
        """Count the labelled statistics; ``all_decimals`` is not included."""
        return (
            len(fp.means) + len(fp.std_devs) + len(fp.sample_sizes) +
            len(fp.percentages) + len(fp.correlations) + len(fp.p_values)
        )

    # ── scoring ─────────────────────────────────────────────────────────────

    def _compute_score(
        self,
        round_ratio: float,
        terminal_bias: float,
        impossible: list,
        duplicates: list,
        total: int,
    ) -> float:
        """Combine the four check results into one score in [0, 1].

        Weights: round numbers 0.25, terminal digits 0.25, impossible values
        0.35 (each one worth 0.25, capped at 1.0), duplicates 0.15 (each one
        worth 0.15, capped at 1.0). Returns 0.0 when nothing was extracted.
        """
        if total == 0:
            return 0.0
        impossible_score = min(len(impossible) * 0.25, 1.0)
        duplicate_score = min(len(duplicates) * 0.15, 1.0)
        score = (
            round_ratio * 0.25 +
            terminal_bias * 0.25 +
            impossible_score * 0.35 +
            duplicate_score * 0.15
        )
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        """Map a score to "critical", "high", "medium" or "low"."""
        if score >= 0.70:
            return "critical"
        if score >= 0.45:
            return "high"
        if score >= 0.25:
            return "medium"
        return "low"

    def _write_summary(
        self, flags: list, risk_level: str, total: int
    ) -> str:
        """Build the one-paragraph summary shown to the user.

        Three shapes: nothing extracted, nothing flagged, or a count of
        "high" and "medium" severity flags.
        """
        if total == 0:
            return (
                "Data Fingerprint Analysis: No statistical values extracted. "
                "Include explicit M=, SD=, N=, r=, and p= reporting "
                f"for full analysis. Risk level: {risk_level.upper()}."
            )
        if not flags:
            return (
                f"Data Fingerprint Analysis: {total} statistical value(s) "
                f"analyzed. No fabrication signals detected. "
                f"Numerical patterns appear consistent with genuine data. "
                f"Risk level: {risk_level.upper()}."
            )

        high = sum(1 for f in flags if f.severity == "high")
        medium = sum(1 for f in flags if f.severity == "medium")
        parts = []
        if high:
            parts.append(
                f"{high} impossible value{'s' if high > 1 else ''} detected"
            )
        if medium:
            parts.append(
                f"{medium} fabrication signal{'s' if medium > 1 else ''} found"
            )
        return (
            f"Data Fingerprint Analysis: {total} value(s) analyzed. "
            f"{'; '.join(parts)}. "
            f"Risk level: {risk_level.upper()}."
        )