SciPeerAI-API / src /scipeerai /modules /retraction_checker.py
Abu-Sameer-66
fix: add requests dependency — v2.3.0 hotfix
b625b53
# src/scipeerai/modules/retraction_checker.py
#
# Retraction Watch Checker
# Checks if paper cites retracted studies using
# Retraction Watch database + CrossRef API.
#
# This is the most impactful module — catches papers
# that build on fraudulent or retracted foundations.
import re
import time
import urllib.request
import urllib.parse
import json
from dataclasses import dataclass, field
@dataclass
class RetractionFlag:
flag_type: str
severity: str
description: str
evidence: str
suggestion: str
@dataclass
class RetractionResult:
dois_found: list
retracted_found: list
checked_count: int
retraction_score: float
risk_level: str
summary: str
flags: list = field(default_factory=list)
flags_count: int = 0
class RetractionChecker:
"""
Retraction Watch Checker.
Extracts DOIs from paper text and checks each
against the Retraction Watch / CrossRef database.
"""
# ✅ FIX: parentheses allowed in DOI — needed for Wakefield etc.
DOI_PATTERN = re.compile(
r'(?:doi\.org/|doi:|DOI:?\s*)'
r'(10\.\d{4,9}/[^\s\],;"\']+)',
re.IGNORECASE
)
RETRACTION_SIGNALS = re.compile(
r'\b(?:retract(?:ed|ion)|withdrawn|'
r'erratum|correction|expression\s+of\s+concern|'
r'fraud|fabricat(?:ed|ion)|misconduct)\b',
re.IGNORECASE
)
KNOWN_RETRACTED = {
"10.1016/s0140-6736(97)11096-0": {
"title": "Wakefield MMR vaccine-autism study",
"year": 1998,
"reason": "Data fabrication — Wakefield et al.",
},
"10.1126/science.1254166": {
"title": "LaCour political persuasion study",
"year": 2014,
"reason": "Fabricated data — LaCour & Green",
},
"10.1038/nature13187": {
"title": "STAP cell study",
"year": 2014,
"reason": "Image manipulation — Obokata et al.",
},
"10.1097/00007632-200207150-00020": {
"title": "Spine surgery outcomes study",
"year": 2002,
"reason": "Data fabrication — Schön et al.",
},
"10.1016/j.cell.2009.01.043": {
"title": "Anversa cardiac stem cell study",
"year": 2009,
"reason": "Data fabrication — Anversa lab",
},
}
CROSSREF_API = "https://api.crossref.org/works/{doi}"
def analyze(self, text: str) -> RetractionResult:
dois = self._extract_dois(text)
signals = self._check_signals(text)
flags = []
retracted = []
for doi in dois:
doi_clean = doi.lower().rstrip('.')
if doi_clean in self.KNOWN_RETRACTED:
info = self.KNOWN_RETRACTED[doi_clean]
retracted.append(doi_clean)
flags.append(RetractionFlag(
flag_type = "retracted_citation",
severity = "high",
description = (
f"Paper cites a RETRACTED study: "
f"'{info['title']}' ({info['year']}). "
f"Reason: {info['reason']}. "
f"Building on retracted work undermines "
f"the validity of this paper's conclusions."
),
evidence = (
f"DOI: {doi_clean} | "
f"Retraction reason: {info['reason']}"
),
suggestion = (
"Remove or replace citations to retracted work. "
"Check all citations against Retraction Watch "
"database at retractionwatch.com."
),
))
unchecked = [d for d in dois
if d.lower().rstrip('.') not in self.KNOWN_RETRACTED]
api_retracted = self._check_crossref(unchecked[:5])
for doi, reason in api_retracted:
retracted.append(doi)
flags.append(RetractionFlag(
flag_type = "retracted_citation_live",
severity = "high",
description = (
f"CrossRef database confirms this DOI "
f"is associated with a retracted or "
f"corrected publication: {reason}"
),
evidence = f"DOI: {doi} | Source: CrossRef API",
suggestion = (
"Verify this citation on Retraction Watch. "
"Replace with non-retracted alternative if available."
),
))
if signals:
flags.append(RetractionFlag(
flag_type = "retraction_language_detected",
severity = "medium",
description = (
f"Text contains {len(signals)} retraction-related "
f"term(s): {', '.join(set(signals[:5]))}. "
f"This may indicate the paper discusses or "
f"references retracted work."
),
evidence = f"Terms found: {', '.join(set(signals[:8]))}",
suggestion = (
"Review all references containing retraction "
"language. Verify each citation is still valid."
),
))
if len(dois) == 0:
flags.append(RetractionFlag(
flag_type = "no_dois_found",
severity = "low",
description = (
"No DOIs detected in paper text. "
"Retraction checking requires DOIs "
"(format: 10.XXXX/...). "
"Paste references section for full analysis."
),
evidence = "No DOI patterns found in text",
suggestion = (
"Include full references with DOIs. "
"Check citations manually at retractionwatch.com."
),
))
score = self._aggregate_score(retracted, dois, signals)
level = self._risk(score, len(retracted))
summary = self._build_summary(dois, retracted, score, level)
return RetractionResult(
dois_found = dois,
retracted_found = retracted,
checked_count = len(dois),
retraction_score = round(score, 4),
risk_level = level,
summary = summary,
flags = flags,
flags_count = len(flags),
)
def _extract_dois(self, text: str) -> list:
dois = []
for m in self.DOI_PATTERN.finditer(text):
# ✅ FIX: only strip . and , — NOT ) so Wakefield DOI intact
doi = m.group(1).rstrip('.,;')
if doi not in dois:
dois.append(doi)
return dois[:20]
def _check_signals(self, text: str) -> list:
return self.RETRACTION_SIGNALS.findall(text)
def _check_crossref(self, dois: list) -> list:
retracted = []
for doi in dois:
try:
url = self.CROSSREF_API.format(
doi=urllib.parse.quote(doi, safe='')
)
req = urllib.request.Request(
url,
headers={"User-Agent": "SciPeerAI/1.0"}
)
with urllib.request.urlopen(req, timeout=3) as resp:
data = json.loads(resp.read())
msg = data.get('message', {})
title = ' '.join(msg.get('title', [])).lower()
subtype = msg.get('subtype', '').lower()
if 'retract' in title or subtype == 'retraction':
retracted.append((doi, f"Type: {subtype}"))
time.sleep(0.2)
except Exception:
pass
return retracted
def _aggregate_score(self, retracted, dois, signals) -> float:
score = 0.0
if retracted:
score += 0.6 * min(len(retracted), 3) / 3
if signals:
score += 0.2 * min(len(signals), 5) / 5
if not dois and not signals:
score = 0.0
return min(round(score, 4), 1.0)
def _risk(self, score: float, n_retracted: int) -> str:
if n_retracted >= 1 or score >= 0.6:
return "critical"
if score >= 0.3:
return "high"
if score >= 0.1:
return "medium"
return "low"
def _build_summary(self, dois, retracted, score, level) -> str:
if not dois:
return (
"Retraction Check: No DOIs found in text. "
"Paste full references section with DOIs "
"for retraction database matching. "
"Risk level: LOW."
)
pct = round(score * 100)
return (
f"Retraction Check analyzed {len(dois)} DOI(s). "
f"{len(retracted)} retracted citation(s) detected. "
f"Risk score: {pct}%. "
f"Risk level: {level.upper()}."
)