File size: 17,701 Bytes
a53c25d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
# Cross-Paper Data Fingerprinting
# --------------------------------
# Independent studies on the same topic will produce
# similar but never identical numbers.
# Random sampling variation guarantees this.
#
# When two papers report the exact same mean to four
# decimal places, the same standard deviation, the same
# sample size, and the same p-value — they are not
# independent. One copied from the other, or both
# copied from a shared fabricated source.
#
# This module extracts the numerical fingerprint of a
# single paper: every mean, SD, sample size, percentage,
# correlation, and p-value it reports.
#
# That fingerprint can then be compared against others.
# But even in isolation, the fingerprint reveals problems:
# numbers that are suspiciously round, values that are
# mathematically impossible given each other, and
# distributions of digits that do not look like real data.
#
# A paper's numbers should look like they came from
# the world. When they look like they came from a
# spreadsheet cell someone typed by hand — that is a signal.

import re
import math
from dataclasses import dataclass, field
from collections import Counter


# ── data structures ────────────────────────────────────────────────────────────

@dataclass
class NumericFingerprint:
    """
    The numbers pulled out of one paper, grouped by the kind of
    statistic each one was reported as.
    """

    # values captured after a mean / average / M label
    means: list
    # values captured after an SD / std / standard deviation label
    std_devs: list
    # integers captured after an N / sample size label
    sample_sizes: list
    # numbers that were immediately followed by a percent sign
    percentages: list
    # values captured after an r / correlation label
    correlations: list
    # values captured after a "p =", "p <" or "p >" label
    p_values: list
    # every decimal literal found in the text, labelled or not
    all_decimals: list


@dataclass
class DataFingerprintFlag:
    """
    One finding raised by a detection check.
    """

    # machine-readable identifier of the check that fired
    flag_type: str
    # severity label; the checks in this module use "medium" and "high"
    severity: str
    # human-readable statement of what was found
    description: str
    # the concrete values or statistics behind the finding
    evidence: str
    # recommended follow-up for the author or reviewer
    suggestion: str


@dataclass
class DataFingerprintResult:
    """
    Everything a single analysis run reports back: the extracted
    fingerprint, the per-check outputs, the combined score and the flags.
    """

    # the numbers extracted from the text
    fingerprint: NumericFingerprint
    # how many labelled statistics were extracted in total
    total_numbers: int
    # share of means / SDs / percentages judged "round"
    round_number_ratio: float
    # terminal-digit bias score, capped at 1.0
    terminal_digit_bias: float
    # string descriptions of values that cannot coexist or cannot exist
    impossible_pairs: list
    # values that were repeated three or more times
    suspicious_duplicates: list
    # weighted combination of the four checks, capped at 1.0
    fingerprint_score: float
    # "low", "medium", "high" or "critical"
    risk_level: str
    # one-paragraph human-readable summary
    summary: str
    # the DataFingerprintFlag objects raised by the checks
    flags: list
    # number of entries in flags
    flags_count: int


# ── main class ────────────────────────────────────────────────────────────────

class DataFingerprintAnalyzer:
    """
    Extracts the complete numerical fingerprint of a paper
    and tests it for signs of fabrication or cloning.

    Four detection layers:
    1. Round number bias — fabricated data rounds too cleanly
    2. Terminal digit bias — humans avoid certain ending digits
    3. Impossible value pairs — SD larger than mean for positive scales
    4. Suspicious internal duplicates — same value repeated too often
    """

    # regex patterns for specific statistical values
    #
    # Every label is guarded by (?<![A-Za-z]) so that it only matches when
    # it is not the tail of a longer word. Without the guard the one-letter
    # labels fire inside ordinary words: "mean = 5" is also read as "n = 5",
    # "year: 2020" as "r: 2020", "step = 0.5" as "p = 0.5", and "USD: 100"
    # as "SD: 100".
    _MEAN_PATTERN   = re.compile(
        r'(?<![A-Za-z])(?:mean|average|M)\s*[=:]\s*(-?\d+\.?\d*)',
        re.IGNORECASE
    )
    _SD_PATTERN     = re.compile(
        r'(?<![A-Za-z])(?:SD|S\.D\.|std|standard deviation)'
        r'\s*[=:]\s*(\d+\.?\d*)',
        re.IGNORECASE
    )
    _N_PATTERN      = re.compile(
        r'(?<![A-Za-z])(?:N|sample size)\s*[=:]\s*(\d+)', re.IGNORECASE
    )
    _PCT_PATTERN    = re.compile(
        r'(\d+\.?\d*)\s*%'
    )
    _CORR_PATTERN   = re.compile(
        r'(?<![A-Za-z])(?:r|correlation)\s*[=:]\s*(-?\d*\.?\d+)',
        re.IGNORECASE
    )
    # Captures any decimal (optionally signed, optionally with an exponent)
    # rather than only values shaped like 0.xx, so that an out-of-range
    # report such as "p = 1.5" is actually extracted and can be flagged.
    _PVAL_PATTERN   = re.compile(
        r'(?<![A-Za-z])p\s*[=<>]\s*(-?\d*\.\d+(?:[eE][+-]?\d+)?)',
        re.IGNORECASE
    )

    def analyze(self, text: str) -> DataFingerprintResult:
        """
        Run the full pipeline on one text: extract the fingerprint,
        apply the four checks, score the outcome and package the result.
        """
        fingerprint  = self._extract_fingerprint(text)
        raised_flags = []

        # Each check appends its own flag(s) to raised_flags, so the call
        # order here determines the order of flags in the result.
        round_share       = self._check_round_number_bias(fingerprint, raised_flags)
        digit_bias        = self._check_terminal_digit_bias(fingerprint, raised_flags)
        impossible_values = self._check_impossible_pairs(fingerprint, raised_flags)
        repeated_values   = self._check_suspicious_duplicates(fingerprint, raised_flags)

        value_count = self._count_total(fingerprint)
        raw_score   = self._compute_score(
            round_share,
            digit_bias,
            impossible_values,
            repeated_values,
            value_count,
        )
        # The risk level is derived from the unrounded score.
        risk         = self._get_risk_level(raw_score)
        summary_text = self._write_summary(raised_flags, risk, value_count)

        return DataFingerprintResult(
            fingerprint=fingerprint,
            total_numbers=value_count,
            round_number_ratio=round(round_share, 3),
            terminal_digit_bias=round(digit_bias, 3),
            impossible_pairs=impossible_values,
            suspicious_duplicates=repeated_values,
            fingerprint_score=round(raw_score, 3),
            risk_level=risk,
            summary=summary_text,
            flags=raised_flags,
            flags_count=len(raised_flags),
        )

    # ── extraction ─────────────────────────────────────────────────────────────

    def _extract_fingerprint(self, text: str) -> NumericFingerprint:
        """
        Collect every labelled statistic in the text, plus every bare
        decimal literal, into a NumericFingerprint.
        """
        # all decimal numbers in the paper for digit-level analysis
        decimal_values = []
        for token in re.findall(r'-?\d+\.\d+', text):
            if self._safe_float(token) is not None:
                decimal_values.append(float(token))

        return NumericFingerprint(
            means=self._parse_floats(self._MEAN_PATTERN, text),
            std_devs=self._parse_floats(self._SD_PATTERN, text),
            sample_sizes=self._parse_ints(self._N_PATTERN, text),
            percentages=self._parse_floats(self._PCT_PATTERN, text),
            correlations=self._parse_floats(self._CORR_PATTERN, text),
            p_values=self._parse_floats(self._PVAL_PATTERN, text),
            all_decimals=decimal_values,
        )

    def _parse_floats(self, pattern: re.Pattern, text: str) -> list:
        results = []
        for match in pattern.finditer(text):
            val = self._safe_float(match.group(1))
            if val is not None:
                results.append(val)
        return results

    def _parse_ints(self, pattern: re.Pattern, text: str) -> list:
        results = []
        for match in pattern.finditer(text):
            try:
                val = int(match.group(1))
                if 1 <= val <= 1_000_000:
                    results.append(val)
            except (ValueError, IndexError):
                pass
        return results

    def _safe_float(self, raw: str) -> float:
        try:
            return float(raw.strip())
        except (ValueError, AttributeError):
            return None

    # ── detection checks ───────────────────────────────────────────────────────

    def _check_round_number_bias(
        self, fp: NumericFingerprint, flags: list
    ) -> float:
        """
        Real data does not round to whole numbers or .5 steps very often.
        When more than 60% of reported values are suspiciously round,
        someone likely typed them rather than computed them.
        """
        all_vals = fp.means + fp.std_devs + fp.percentages
        if len(all_vals) < 4:
            return 0.0

        round_count = sum(
            1 for v in all_vals
            if v == round(v, 0) or v == round(v, 1) and str(v).endswith(('0', '5'))
        )
        ratio = round_count / len(all_vals)

        if ratio >= 0.60:
            flags.append(DataFingerprintFlag(
                flag_type   = "round_number_bias",
                severity    = "medium",
                description = (
                    f"{round_count}/{len(all_vals)} reported values "
                    f"({round(ratio * 100, 1)}%) are suspiciously round. "
                    f"Real measured data rarely rounds this cleanly."
                ),
                evidence    = (
                    f"Round values detected among means, SDs, and percentages. "
                    f"Round ratio: {round(ratio, 3)}."
                ),
                suggestion  = (
                    "Verify that reported values are directly from analysis "
                    "output, not manually entered approximations."
                ),
            ))

        return ratio

    def _check_terminal_digit_bias(
        self, fp: NumericFingerprint, flags: list
    ) -> float:
        """
        The last digit of a truly random number is uniformly distributed
        across 0-9. Humans fabricating numbers unconsciously prefer
        certain digits (0, 5) and avoid others (7, 9).
        A chi-square test on terminal digits detects this.
        """
        all_vals = fp.all_decimals + [float(n) for n in fp.sample_sizes]
        if len(all_vals) < 10:
            return 0.0

        terminals = []
        for v in all_vals:
            parts = str(abs(v)).replace('.', '')
            if parts:
                terminals.append(int(parts[-1]))

        if not terminals:
            return 0.0

        counter  = Counter(terminals)
        expected = len(terminals) / 10.0
        chi_sq   = sum(
            ((counter.get(d, 0) - expected) ** 2) / expected
            for d in range(10)
        )

        # chi-square critical value at p=0.05 with 9 df is 16.92
        bias_score = min(chi_sq / 50.0, 1.0)

        if chi_sq >= 16.92:
            dominant_digit = counter.most_common(1)[0]
            flags.append(DataFingerprintFlag(
                flag_type   = "terminal_digit_bias",
                severity    = "medium",
                description = (
                    f"Terminal digit distribution deviates significantly "
                    f"from uniform expectation. "
                    f"Chi-square statistic: {round(chi_sq, 2)} "
                    f"(critical value: 16.92). "
                    f"This pattern is consistent with human number fabrication."
                ),
                evidence    = (
                    f"Most frequent terminal digit: "
                    f"'{dominant_digit[0]}' appears {dominant_digit[1]} times. "
                    f"Expected uniform frequency: {round(expected, 1)} each."
                ),
                suggestion  = (
                    "Re-examine raw data files to confirm reported values "
                    "match analysis output. Terminal digit bias is a "
                    "well-established fabrication marker."
                ),
            ))

        return round(bias_score, 3)

    def _check_impossible_pairs(
        self, fp: NumericFingerprint, flags: list
    ) -> list:
        """
        Statistical relationships constrain what values can coexist.
        SD > mean is impossible for strictly positive Likert-scale data.
        Correlation outside [-1, 1] is mathematically impossible.
        P-value outside [0, 1] cannot exist.
        """
        impossible = []

        # SD > mean for positive scales (Likert 1-7, reaction times, etc.)
        for mean, sd in zip(fp.means, fp.std_devs):
            if mean > 0 and sd > mean * 2:
                pair = f"M={mean}, SD={sd}"
                impossible.append(pair)

        if impossible:
            flags.append(DataFingerprintFlag(
                flag_type   = "impossible_sd_mean_pair",
                severity    = "high",
                description = (
                    f"{len(impossible)} mean/SD pair(s) where the standard "
                    f"deviation is implausibly large relative to the mean. "
                    f"For bounded positive scales, SD > 2*mean is suspicious."
                ),
                evidence    = f"Impossible pairs: {impossible[:3]}.",
                suggestion  = (
                    "Verify these values against the original analysis output. "
                    "Large SDs relative to means may indicate data entry error "
                    "or scale confusion."
                ),
            ))

        # correlation outside valid range
        bad_corr = [r for r in fp.correlations if abs(r) > 1.0]
        if bad_corr:
            impossible.extend([f"r={r}" for r in bad_corr])
            flags.append(DataFingerprintFlag(
                flag_type   = "impossible_correlation",
                severity    = "high",
                description = (
                    f"{len(bad_corr)} correlation value(s) outside [-1, 1]. "
                    f"These values are mathematically impossible."
                ),
                evidence    = f"Invalid correlations: {bad_corr}.",
                suggestion  = "Correct these values before submission.",
            ))

        # p-value outside [0, 1]
        bad_p = [p for p in fp.p_values if p < 0 or p > 1]
        if bad_p:
            impossible.extend([f"p={p}" for p in bad_p])
            flags.append(DataFingerprintFlag(
                flag_type   = "impossible_p_value",
                severity    = "high",
                description = (
                    f"{len(bad_p)} p-value(s) outside [0, 1]. "
                    f"These values cannot exist."
                ),
                evidence    = f"Invalid p-values: {bad_p}.",
                suggestion  = "Check analysis code for unit or scale errors.",
            ))

        return impossible

    def _check_suspicious_duplicates(
        self, fp: NumericFingerprint, flags: list
    ) -> list:
        """
        The same specific decimal value appearing 3+ times in a paper
        is unusual unless it is a threshold or constant.
        In fabricated data, a single invented number gets reused.
        """
        all_vals = fp.means + fp.std_devs + fp.percentages + fp.correlations
        if len(all_vals) < 6:
            return []

        counter    = Counter(all_vals)
        duplicates = [
            v for v, count in counter.items()
            if count >= 3 and v not in (0.0, 1.0, 0.5, 100.0, 0.05)
        ]

        if duplicates:
            flags.append(DataFingerprintFlag(
                flag_type   = "suspicious_value_repetition",
                severity    = "medium",
                description = (
                    f"{len(duplicates)} specific value(s) appear 3 or more "
                    f"times across different reported statistics. "
                    f"Genuine independent measurements rarely share "
                    f"exact decimal values."
                ),
                evidence    = (
                    f"Repeated values: "
                    f"{[round(v, 4) for v in duplicates[:5]]}."
                ),
                suggestion  = (
                    "Verify that repeated values reflect genuinely "
                    "identical measurements and are not copy-paste artifacts."
                ),
            ))

        return duplicates

    # ── helpers ────────────────────────────────────────────────────────────────

    def _count_total(self, fp: NumericFingerprint) -> int:
        return (
            len(fp.means) + len(fp.std_devs) + len(fp.sample_sizes) +
            len(fp.percentages) + len(fp.correlations) + len(fp.p_values)
        )

    # ── scoring ────────────────────────────────────────────────────────────────

    def _compute_score(
        self,
        round_ratio:   float,
        terminal_bias: float,
        impossible:    list,
        duplicates:    list,
        total:         int,
    ) -> float:
        if total == 0:
            return 0.0

        impossible_score = min(len(impossible) * 0.25, 1.0)
        duplicate_score  = min(len(duplicates) * 0.15, 1.0)

        score = (
            round_ratio      * 0.25 +
            terminal_bias    * 0.25 +
            impossible_score * 0.35 +
            duplicate_score  * 0.15
        )
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        if score >= 0.70:   return "critical"
        if score >= 0.45:   return "high"
        if score >= 0.25:   return "medium"
        return "low"

    def _write_summary(
        self, flags: list, risk_level: str, total: int
    ) -> str:
        if total == 0:
            return (
                "Data Fingerprint Analysis: No statistical values extracted. "
                "Include explicit M=, SD=, N=, r=, and p= reporting "
                f"for full analysis. Risk level: {risk_level.upper()}."
            )

        if not flags:
            return (
                f"Data Fingerprint Analysis: {total} statistical value(s) "
                f"analyzed. No fabrication signals detected. "
                f"Numerical patterns appear consistent with genuine data. "
                f"Risk level: {risk_level.upper()}."
            )

        high   = sum(1 for f in flags if f.severity == "high")
        medium = sum(1 for f in flags if f.severity == "medium")
        parts  = []
        if high:
            parts.append(
                f"{high} impossible value{'s' if high > 1 else ''} detected"
            )
        if medium:
            parts.append(
                f"{medium} fabrication signal{'s' if medium > 1 else ''} found"
            )

        return (
            f"Data Fingerprint Analysis: {total} value(s) analyzed. "
            f"{'; '.join(parts)}. "
            f"Risk level: {risk_level.upper()}."
        )