Spaces:

Abu-Sameer-66
/

SciPeerAI-API

Sleeping

SciPeerAI-API / src /scipeerai /modules /retraction_checker.py

Abu-Sameer-66

fix: add requests dependency — v2.3.0 hotfix

b625b53 1 day ago

9.34 kB

	# src/scipeerai/modules/retraction_checker.py
	#
	# Retraction Watch Checker
	# Checks whether a paper cites retracted studies by matching
	# the DOIs found in its text against a built-in table of known
	# retractions and against the CrossRef API.
	#
	# This is the most impactful module — catches papers
	# that build on fraudulent or retracted foundations.

	import re
	import time
	import urllib.request
	import urllib.parse
	import json
	from dataclasses import dataclass, field


	@dataclass
	class RetractionFlag:
	    """A single finding reported by the retraction checker."""

	    # Machine-readable category of the finding, e.g.
	    # "retracted_citation" or "no_dois_found".
	    flag_type: str
	    # Importance of the finding; the checker emits "high",
	    # "medium" or "low".
	    severity: str
	    # Human-readable explanation of what was found.
	    description: str
	    # The concrete data (DOI, matched terms, ...) backing the finding.
	    evidence: str
	    # Recommended follow-up action for the author or reviewer.
	    suggestion: str


	@dataclass
	class RetractionResult:
	    """Aggregate outcome of one retraction check over a paper's text."""

	    # DOIs extracted from the text.
	    dois_found: list
	    # DOIs that were identified as retracted.
	    retracted_found: list
	    # Number of DOIs that were extracted from the text.
	    checked_count: int
	    # Risk score; the checker caps it at 1.0 and rounds to 4 places.
	    retraction_score: float
	    # Risk bucket: "critical", "high", "medium" or "low".
	    risk_level: str
	    # One-paragraph, human-readable summary of the check.
	    summary: str
	    # Individual findings; a fresh list per instance.
	    flags: list = field(default_factory=list)
	    # Number of entries in ``flags`` (set by the producer, not derived).
	    flags_count: int = 0


	class RetractionChecker:
	    """
	    Retraction Watch Checker.

	    Extracts DOIs from paper text and checks each one against a
	    built-in table of known retracted papers and, for a limited
	    number of the remaining DOIs, against the CrossRef REST API.

	    Entry point: ``analyze(text)``.
	    """

	    # A DOI is recognised only when it is introduced by "doi.org/",
	    # "doi:" or "DOI" (case-insensitive). Parentheses are allowed
	    # inside the captured DOI because some real DOIs contain them
	    # (e.g. the Wakefield Lancet paper in KNOWN_RETRACTED).
	    DOI_PATTERN = re.compile(
	        r'(?:doi\.org/|doi:|DOI:?\s*)'
	        r'(10\.\d{4,9}/[^\s\],;"\']+)',
	        re.IGNORECASE
	    )

	    # Whole-word vocabulary suggesting the text discusses retracted
	    # or corrected work.
	    RETRACTION_SIGNALS = re.compile(
	        r'\b(?:retract(?:ed|ion)|withdrawn|'
	        r'erratum|correction|expression\s+of\s+concern|'
	        r'fraud|fabricat(?:ed|ion)|misconduct)\b',
	        re.IGNORECASE
	    )

	    # Offline table of retracted papers, keyed by lower-case DOI.
	    KNOWN_RETRACTED = {
	        "10.1016/s0140-6736(97)11096-0": {
	            "title": "Wakefield MMR vaccine-autism study",
	            "year": 1998,
	            "reason": "Data fabrication — Wakefield et al.",
	        },
	        "10.1126/science.1254166": {
	            "title": "LaCour political persuasion study",
	            "year": 2014,
	            "reason": "Fabricated data — LaCour & Green",
	        },
	        "10.1038/nature13187": {
	            "title": "STAP cell study",
	            "year": 2014,
	            "reason": "Image manipulation — Obokata et al.",
	        },
	        "10.1097/00007632-200207150-00020": {
	            "title": "Spine surgery outcomes study",
	            "year": 2002,
	            "reason": "Data fabrication — Schön et al.",
	        },
	        "10.1016/j.cell.2009.01.043": {
	            "title": "Anversa cardiac stem cell study",
	            "year": 2009,
	            "reason": "Data fabrication — Anversa lab",
	        },
	    }

	    CROSSREF_API = "https://api.crossref.org/works/{doi}"

	    # Upper bound on the number of distinct DOIs kept per text.
	    MAX_DOIS = 20
	    # Upper bound on live CrossRef lookups per analysis (keeps the
	    # worst-case latency of analyze() bounded).
	    MAX_LIVE_LOOKUPS = 5

	    def analyze(self, text: str) -> "RetractionResult":
	        """
	        Run the full retraction check on ``text``.

	        Steps: extract DOIs, match them against KNOWN_RETRACTED, query
	        CrossRef for up to MAX_LIVE_LOOKUPS of the remaining DOIs,
	        scan the text for retraction vocabulary, then score the result.

	        Args:
	            text: Paper text (ideally including the references section).

	        Returns:
	            A RetractionResult with the DOIs found, the retracted DOIs,
	            the individual flags, a score, a risk level and a summary.
	        """
	        dois = self._extract_dois(text)
	        signals = self._check_signals(text)
	        flags = []
	        retracted = []
	        unchecked = []

	        # Split DOIs into "known retracted" and "needs a live lookup".
	        for doi in dois:
	            key = doi.lower()
	            info = self.KNOWN_RETRACTED.get(key)
	            if info is None:
	                unchecked.append(doi)
	                continue
	            retracted.append(key)
	            flags.append(self._known_retraction_flag(key, info))

	        live_hits = self._check_crossref(unchecked[:self.MAX_LIVE_LOOKUPS])
	        for doi, reason in live_hits:
	            retracted.append(doi)
	            flags.append(self._live_retraction_flag(doi, reason))

	        if signals:
	            flags.append(self._signal_flag(signals))

	        if not dois:
	            flags.append(self._no_dois_flag())

	        score = self._aggregate_score(retracted, dois, signals)
	        level = self._risk(score, len(retracted))
	        summary = self._build_summary(dois, retracted, score, level)

	        return RetractionResult(
	            dois_found=dois,
	            retracted_found=retracted,
	            checked_count=len(dois),
	            retraction_score=round(score, 4),
	            risk_level=level,
	            summary=summary,
	            flags=flags,
	            flags_count=len(flags),
	        )

	    def _known_retraction_flag(self, doi: str, info: dict) -> "RetractionFlag":
	        """Build the flag for a DOI present in KNOWN_RETRACTED."""
	        return RetractionFlag(
	            flag_type="retracted_citation",
	            severity="high",
	            description=(
	                f"Paper cites a RETRACTED study: "
	                f"'{info['title']}' ({info['year']}). "
	                f"Reason: {info['reason']}. "
	                f"Building on retracted work undermines "
	                f"the validity of this paper's conclusions."
	            ),
	            evidence=(
	                f"DOI: {doi} | "
	                f"Retraction reason: {info['reason']}"
	            ),
	            suggestion=(
	                "Remove or replace citations to retracted work. "
	                "Check all citations against Retraction Watch "
	                "database at retractionwatch.com."
	            ),
	        )

	    def _live_retraction_flag(self, doi: str, reason: str) -> "RetractionFlag":
	        """Build the flag for a DOI that CrossRef reports as retracted."""
	        return RetractionFlag(
	            flag_type="retracted_citation_live",
	            severity="high",
	            description=(
	                f"CrossRef database confirms this DOI "
	                f"is associated with a retracted or "
	                f"corrected publication: {reason}"
	            ),
	            evidence=f"DOI: {doi} | Source: CrossRef API",
	            suggestion=(
	                "Verify this citation on Retraction Watch. "
	                "Replace with non-retracted alternative if available."
	            ),
	        )

	    def _signal_flag(self, signals: list) -> "RetractionFlag":
	        """Build the flag reporting retraction vocabulary in the text."""
	        # Ordered, case-insensitive de-duplication so the reported
	        # term list is deterministic from run to run.
	        terms = list(dict.fromkeys(term.lower() for term in signals))
	        return RetractionFlag(
	            flag_type="retraction_language_detected",
	            severity="medium",
	            description=(
	                f"Text contains {len(signals)} retraction-related "
	                f"term(s): {', '.join(terms[:5])}. "
	                f"This may indicate the paper discusses or "
	                f"references retracted work."
	            ),
	            evidence=f"Terms found: {', '.join(terms[:8])}",
	            suggestion=(
	                "Review all references containing retraction "
	                "language. Verify each citation is still valid."
	            ),
	        )

	    def _no_dois_flag(self) -> "RetractionFlag":
	        """Build the low-severity flag used when the text has no DOIs."""
	        return RetractionFlag(
	            flag_type="no_dois_found",
	            severity="low",
	            description=(
	                "No DOIs detected in paper text. "
	                "Retraction checking requires DOIs "
	                "(format: 10.XXXX/...). "
	                "Paste references section for full analysis."
	            ),
	            evidence="No DOI patterns found in text",
	            suggestion=(
	                "Include full references with DOIs. "
	                "Check citations manually at retractionwatch.com."
	            ),
	        )

	    @staticmethod
	    def _trim_doi(doi: str) -> str:
	        """
	        Remove trailing sentence punctuation from a captured DOI.

	        A trailing ")" is removed only when it is unbalanced, so DOIs
	        that legitimately contain parentheses stay intact while a DOI
	        quoted inside "( ... )" loses the stray closing paren.
	        """
	        while doi:
	            last = doi[-1]
	            if last in '.,;':
	                doi = doi[:-1]
	            elif last == ')' and doi.count(')') > doi.count('('):
	                doi = doi[:-1]
	            else:
	                break
	        return doi

	    def _extract_dois(self, text: str) -> list:
	        """
	        Return the distinct DOIs found in ``text``, in order of appearance.

	        DOIs are compared case-insensitively when de-duplicating (the
	        first spelling seen is kept) and at most MAX_DOIS are returned.
	        """
	        dois = []
	        seen = set()
	        for match in self.DOI_PATTERN.finditer(text):
	            doi = self._trim_doi(match.group(1))
	            key = doi.lower()
	            if key in seen:
	                continue
	            seen.add(key)
	            dois.append(doi)
	        return dois[:self.MAX_DOIS]

	    def _check_signals(self, text: str) -> list:
	        """Return every retraction-related term occurrence in ``text``."""
	        return self.RETRACTION_SIGNALS.findall(text)

	    def _check_crossref(self, dois: list) -> list:
	        """
	        Look each DOI up on CrossRef and report the retracted ones.

	        A work is reported when its title contains "retract" or its
	        ``subtype`` is "retraction". The lookup is best-effort: any
	        network, HTTP or decoding failure skips that DOI silently so a
	        CrossRef outage never breaks the analysis.

	        Args:
	            dois: DOIs to look up.

	        Returns:
	            A list of ``(doi, reason)`` tuples for the retracted DOIs.
	        """
	        retracted = []
	        for doi in dois:
	            try:
	                url = self.CROSSREF_API.format(
	                    doi=urllib.parse.quote(doi, safe='')
	                )
	                req = urllib.request.Request(
	                    url,
	                    headers={"User-Agent": "SciPeerAI/1.0"}
	                )
	                with urllib.request.urlopen(req, timeout=3) as resp:
	                    data = json.loads(resp.read())
	            except Exception:
	                # Deliberate best-effort: unreachable API, 404 for an
	                # unknown DOI, timeout or malformed JSON all mean
	                # "no live verdict for this DOI".
	                continue

	            msg = data.get('message') if isinstance(data, dict) else None
	            if isinstance(msg, dict):
	                # Either field may be missing or null in the response.
	                titles = msg.get('title') or []
	                if isinstance(titles, str):
	                    titles = [titles]
	                title = ' '.join(str(t) for t in titles).lower()
	                subtype = str(msg.get('subtype') or '').lower()
	                if 'retract' in title or subtype == 'retraction':
	                    reason = subtype or 'retraction notice in title'
	                    retracted.append((doi, f"Type: {reason}"))

	            # Small pause between successful requests to stay polite
	            # towards the public API.
	            time.sleep(0.2)
	        return retracted

	    def _aggregate_score(self, retracted, dois, signals) -> float:
	        """
	        Combine retraction hits and signal words into one score.

	        Retracted citations contribute up to 0.6 (saturating at three)
	        and signal words up to 0.2 (saturating at five). The result is
	        rounded to 4 places and capped at 1.0.
	        """
	        score = 0.0
	        if retracted:
	            score += 0.6 * min(len(retracted), 3) / 3
	        if signals:
	            score += 0.2 * min(len(signals), 5) / 5
	        # With neither DOIs nor signal words there is nothing to score.
	        if not dois and not signals:
	            score = 0.0
	        return min(round(score, 4), 1.0)

	    def _risk(self, score: float, n_retracted: int) -> str:
	        """
	        Map a score and retraction count to a risk level.

	        Any retracted citation is "critical" regardless of the score;
	        otherwise the thresholds are 0.6 / 0.3 / 0.1 for
	        "critical" / "high" / "medium", and "low" below that.
	        """
	        if n_retracted >= 1 or score >= 0.6:
	            return "critical"
	        if score >= 0.3:
	            return "high"
	        if score >= 0.1:
	            return "medium"
	        return "low"

	    def _build_summary(self, dois, retracted, score, level) -> str:
	        """
	        Build the one-paragraph, human-readable summary.

	        The reported risk level always comes from ``level`` so the
	        summary cannot contradict the result's ``risk_level`` field.
	        """
	        if not dois:
	            return (
	                "Retraction Check: No DOIs found in text. "
	                "Paste full references section with DOIs "
	                "for retraction database matching. "
	                f"Risk level: {level.upper()}."
	            )
	        pct = round(score * 100)
	        return (
	            f"Retraction Check analyzed {len(dois)} DOI(s). "
	            f"{len(retracted)} retracted citation(s) detected. "
	            f"Risk score: {pct}%. "
	            f"Risk level: {level.upper()}."
	        )