File size: 9,340 Bytes
b5c2bb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# src/scipeerai/modules/retraction_checker.py
#
# Retraction Watch Checker
# Checks if paper cites retracted studies using
# Retraction Watch database + CrossRef API.
#
# This is the most impactful module — catches papers
# that build on fraudulent or retracted foundations.

import re
import time
import urllib.request
import urllib.parse
import json
from dataclasses import dataclass, field


@dataclass
class RetractionFlag:
    """One finding produced by the retraction checker.

    Attributes:
        flag_type: Machine-readable identifier of the finding.
        severity: Severity label attached to the finding.
        description: Human-readable explanation of the finding.
        evidence: Text backing the finding (e.g. the offending DOI).
        suggestion: Recommended follow-up action.
    """

    flag_type: str
    severity: str
    description: str
    evidence: str
    suggestion: str


@dataclass
class RetractionResult:
    """Aggregate outcome of a retraction analysis.

    Attributes:
        dois_found: DOIs extracted from the analysed text.
        retracted_found: DOIs identified as retracted.
        checked_count: Number of DOIs considered.
        retraction_score: Numeric risk score.
        risk_level: Risk label derived from the score.
        summary: One-paragraph human-readable summary.
        flags: Individual findings; defaults to a fresh empty list.
        flags_count: Number of findings; defaults to 0.
    """

    dois_found: list
    retracted_found: list
    checked_count: int
    retraction_score: float
    risk_level: str
    summary: str
    flags: list = field(default_factory=list)
    flags_count: int = 0


class RetractionChecker:
    """
    Retraction Watch Checker.

    Extracts DOIs from paper text and checks each one against a
    built-in table of well-known retractions and, for a limited
    number of remaining DOIs, against the CrossRef works API.
    Retraction-related wording in the text is reported as well.
    """

    # Matches a DOI introduced by "doi.org/", "doi:" or "DOI".
    # Parentheses are allowed inside the DOI (older Elsevier DOIs
    # contain them); unbalanced trailing ones are trimmed later.
    DOI_PATTERN = re.compile(
        r'(?:doi\.org/|doi:|DOI:?\s*)'
        r'(10\.\d{4,9}/[^\s\],;"\']+)',
        re.IGNORECASE
    )

    RETRACTION_SIGNALS = re.compile(
        r'\b(?:retract(?:ed|ion)|withdrawn|'
        r'erratum|correction|expression\s+of\s+concern|'
        r'fraud|fabricat(?:ed|ion)|misconduct)\b',
        re.IGNORECASE
    )

    # Keys are lower-cased DOIs.
    KNOWN_RETRACTED = {
        "10.1016/s0140-6736(97)11096-0": {
            "title":  "Wakefield MMR vaccine-autism study",
            "year":   1998,
            "reason": "Data fabrication — Wakefield et al.",
        },
        "10.1126/science.1254166": {
            "title":  "LaCour political persuasion study",
            "year":   2014,
            "reason": "Fabricated data — LaCour & Green",
        },
        "10.1038/nature13187": {
            "title":  "STAP cell study",
            "year":   2014,
            "reason": "Image manipulation — Obokata et al.",
        },
        "10.1097/00007632-200207150-00020": {
            "title":  "Spine surgery outcomes study",
            "year":   2002,
            "reason": "Data fabrication — Schön et al.",
        },
        "10.1016/j.cell.2009.01.043": {
            "title":  "Anversa cardiac stem cell study",
            "year":   2009,
            "reason": "Data fabrication — Anversa lab",
        },
    }

    CROSSREF_API = "https://api.crossref.org/works/{doi}"

    # Tunable limits (previously hard-coded inside the methods).
    MAX_DOIS = 20              # DOIs kept per analysed text
    MAX_CROSSREF_LOOKUPS = 5   # live CrossRef requests per analysis
    CROSSREF_TIMEOUT = 3       # seconds per CrossRef request
    CROSSREF_DELAY = 0.2       # seconds between consecutive requests

    def analyze(self, text: str) -> "RetractionResult":
        """Run the full retraction check on ``text``.

        Args:
            text: Paper text, ideally including the references section.

        Returns:
            A ``RetractionResult`` with the DOIs found, the retracted
            DOIs, the score, the risk level, a summary and the flags.

        Note:
            Performs up to ``MAX_CROSSREF_LOOKUPS`` network requests
            for DOIs that are not in ``KNOWN_RETRACTED``.
        """
        dois = self._extract_dois(text)
        signals = self._check_signals(text)
        flags = []
        retracted = []
        unchecked = []

        for doi in dois:
            doi_clean = self._normalize_doi(doi)
            info = self.KNOWN_RETRACTED.get(doi_clean)
            if info is None:
                # Not in the offline table: candidate for a live lookup.
                unchecked.append(doi)
                continue
            retracted.append(doi_clean)
            flags.append(RetractionFlag(
                flag_type   = "retracted_citation",
                severity    = "high",
                description = (
                    f"Paper cites a RETRACTED study: "
                    f"'{info['title']}' ({info['year']}). "
                    f"Reason: {info['reason']}. "
                    f"Building on retracted work undermines "
                    f"the validity of this paper's conclusions."
                ),
                evidence    = (
                    f"DOI: {doi_clean} | "
                    f"Retraction reason: {info['reason']}"
                ),
                suggestion  = (
                    "Remove or replace citations to retracted work. "
                    "Check all citations against Retraction Watch "
                    "database at retractionwatch.com."
                ),
            ))

        api_retracted = self._check_crossref(
            unchecked[:self.MAX_CROSSREF_LOOKUPS]
        )
        for doi, reason in api_retracted:
            retracted.append(doi)
            flags.append(RetractionFlag(
                flag_type   = "retracted_citation_live",
                severity    = "high",
                description = (
                    f"CrossRef database confirms this DOI "
                    f"is associated with a retracted or "
                    f"corrected publication: {reason}"
                ),
                evidence    = f"DOI: {doi} | Source: CrossRef API",
                suggestion  = (
                    "Verify this citation on Retraction Watch. "
                    "Replace with non-retracted alternative if available."
                ),
            ))

        if signals:
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # so the flag text is identical from run to run (a plain set
            # of strings iterates in hash-randomised order).
            described = list(dict.fromkeys(signals[:5]))
            evidenced = list(dict.fromkeys(signals[:8]))
            flags.append(RetractionFlag(
                flag_type   = "retraction_language_detected",
                severity    = "medium",
                description = (
                    f"Text contains {len(signals)} retraction-related "
                    f"term(s): {', '.join(described)}. "
                    f"This may indicate the paper discusses or "
                    f"references retracted work."
                ),
                evidence    = f"Terms found: {', '.join(evidenced)}",
                suggestion  = (
                    "Review all references containing retraction "
                    "language. Verify each citation is still valid."
                ),
            ))

        if not dois:
            flags.append(RetractionFlag(
                flag_type   = "no_dois_found",
                severity    = "low",
                description = (
                    "No DOIs detected in paper text. "
                    "Retraction checking requires DOIs "
                    "(format: 10.XXXX/...). "
                    "Paste references section for full analysis."
                ),
                evidence    = "No DOI patterns found in text",
                suggestion  = (
                    "Include full references with DOIs. "
                    "Check citations manually at retractionwatch.com."
                ),
            ))

        score   = self._aggregate_score(retracted, dois, signals)
        level   = self._risk(score, len(retracted))
        summary = self._build_summary(dois, retracted, score, level)

        return RetractionResult(
            dois_found       = dois,
            retracted_found  = retracted,
            checked_count    = len(dois),
            retraction_score = round(score, 4),
            risk_level       = level,
            summary          = summary,
            flags            = flags,
            flags_count      = len(flags),
        )

    @staticmethod
    def _normalize_doi(doi: str) -> str:
        """Return the lookup form of ``doi``: lower-cased, no trailing dots."""
        return doi.lower().rstrip('.')

    @staticmethod
    def _trim_doi(raw: str) -> str:
        """Strip sentence punctuation and unbalanced ``)`` from a DOI's end.

        A closing parenthesis is removed only while the DOI has more
        ``)`` than ``(``, so DOIs with legitimate parentheses such as
        ``10.1016/S0140-6736(97)11096-0`` stay intact, while a DOI cited
        as ``(doi:10.1038/nature13187)`` loses the enclosing bracket.
        """
        doi = raw
        while doi:
            last = doi[-1]
            if last in '.,;':
                doi = doi[:-1]
            elif last == ')' and doi.count(')') > doi.count('('):
                doi = doi[:-1]
            else:
                break
        return doi

    def _extract_dois(self, text: str) -> list:
        """Return the distinct DOIs in ``text``, in order of appearance.

        DOIs are compared case-insensitively (the first spelling seen is
        kept), and at most ``MAX_DOIS`` entries are returned. Empty or
        ``None`` text yields an empty list.
        """
        dois = []
        seen = set()
        for m in self.DOI_PATTERN.finditer(text or ""):
            doi = self._trim_doi(m.group(1))
            key = doi.lower()
            if doi and key not in seen:
                seen.add(key)
                dois.append(doi)
        return dois[:self.MAX_DOIS]

    def _check_signals(self, text: str) -> list:
        """Return every retraction-related term matched in ``text``."""
        return self.RETRACTION_SIGNALS.findall(text or "")

    @staticmethod
    def _crossref_retraction_reason(data):
        """Return a reason string if a CrossRef payload looks retracted.

        Args:
            data: Decoded JSON body of a CrossRef ``/works/{doi}`` reply.

        Returns:
            A short reason string when the record's title contains
            "retract" or its ``subtype`` equals "retraction"; otherwise
            ``None``. Missing, ``null`` or oddly-typed fields are treated
            as empty rather than raising.
        """
        msg = data.get('message') if isinstance(data, dict) else None
        if not isinstance(msg, dict):
            return None
        title_parts = msg.get('title') or []
        if isinstance(title_parts, str):
            title_parts = [title_parts]
        title = ' '.join(str(part) for part in title_parts).lower()
        subtype = str(msg.get('subtype') or '').lower()
        if 'retract' in title or subtype == 'retraction':
            if subtype:
                return f"Type: {subtype}"
            # Matched on the title only; avoid an empty "Type: " reason.
            return "Title indicates retraction"
        return None

    def _check_crossref(self, dois: list) -> list:
        """Query CrossRef for each DOI and collect the retracted ones.

        Lookups are best-effort: any failure for one DOI (network error,
        timeout, HTTP error, malformed JSON) is logged at debug level and
        that DOI is skipped.

        Args:
            dois: DOIs to look up.

        Returns:
            A list of ``(doi, reason)`` tuples for DOIs flagged as
            retracted.
        """
        import logging  # local import: keeps this block self-contained

        log = logging.getLogger(__name__)
        retracted = []
        for index, doi in enumerate(dois):
            if index:
                # Pause between consecutive requests, whether or not the
                # previous one succeeded, and never after the last one.
                time.sleep(self.CROSSREF_DELAY)
            url = self.CROSSREF_API.format(
                doi=urllib.parse.quote(doi, safe='')
            )
            req = urllib.request.Request(
                url,
                headers={"User-Agent": "SciPeerAI/1.0"}
            )
            try:
                with urllib.request.urlopen(
                    req, timeout=self.CROSSREF_TIMEOUT
                ) as resp:
                    data = json.loads(resp.read())
            except Exception as exc:
                # Deliberately broad: a failed lookup must never abort
                # the analysis, but it should not vanish without trace.
                log.debug("CrossRef lookup failed for %s: %s", doi, exc)
                continue
            reason = self._crossref_retraction_reason(data)
            if reason is not None:
                retracted.append((doi, reason))
        return retracted

    def _aggregate_score(self, retracted, dois, signals) -> float:
        """Combine retraction hits and signal terms into a 0..1 score.

        Retracted citations contribute up to 0.6 (saturating at three),
        signal terms up to 0.2 (saturating at five). With neither DOIs
        nor signal terms the score is forced to 0.
        """
        score = 0.0
        if retracted:
            score += 0.6 * min(len(retracted), 3) / 3
        if signals:
            score += 0.2 * min(len(signals), 5) / 5
        if not dois and not signals:
            score = 0.0
        return min(round(score, 4), 1.0)

    def _risk(self, score: float, n_retracted: int) -> str:
        """Map a score and retraction count to a risk label.

        Any retracted citation is "critical" regardless of the score.
        """
        if n_retracted >= 1 or score >= 0.6:
            return "critical"
        if score >= 0.3:
            return "high"
        if score >= 0.1:
            return "medium"
        return "low"

    def _build_summary(self, dois, retracted, score, level) -> str:
        """Return the one-paragraph human-readable summary."""
        if not dois:
            return (
                "Retraction Check: No DOIs found in text. "
                "Paste full references section with DOIs "
                "for retraction database matching. "
                "Risk level: LOW."
            )
        pct = round(score * 100)
        return (
            f"Retraction Check analyzed {len(dois)} DOI(s). "
            f"{len(retracted)} retracted citation(s) detected. "
            f"Risk score: {pct}%. "
            f"Risk level: {level.upper()}."
        )