Spaces:
Sleeping
Sleeping
| # src/scipeerai/modules/retraction_checker.py | |
| # | |
| # Retraction Watch Checker | |
| # Checks if paper cites retracted studies using | |
| # Retraction Watch database + CrossRef API. | |
| # | |
| # This is the most impactful module — catches papers | |
| # that build on fraudulent or retracted foundations. | |
| import re | |
| import time | |
| import urllib.request | |
| import urllib.parse | |
| import json | |
| from dataclasses import dataclass, field | |
| class RetractionFlag: | |
| flag_type: str | |
| severity: str | |
| description: str | |
| evidence: str | |
| suggestion: str | |
| class RetractionResult: | |
| dois_found: list | |
| retracted_found: list | |
| checked_count: int | |
| retraction_score: float | |
| risk_level: str | |
| summary: str | |
| flags: list = field(default_factory=list) | |
| flags_count: int = 0 | |
| class RetractionChecker: | |
| """ | |
| Retraction Watch Checker. | |
| Extracts DOIs from paper text and checks each | |
| against the Retraction Watch / CrossRef database. | |
| """ | |
| # ✅ FIX: parentheses allowed in DOI — needed for Wakefield etc. | |
| DOI_PATTERN = re.compile( | |
| r'(?:doi\.org/|doi:|DOI:?\s*)' | |
| r'(10\.\d{4,9}/[^\s\],;"\']+)', | |
| re.IGNORECASE | |
| ) | |
| RETRACTION_SIGNALS = re.compile( | |
| r'\b(?:retract(?:ed|ion)|withdrawn|' | |
| r'erratum|correction|expression\s+of\s+concern|' | |
| r'fraud|fabricat(?:ed|ion)|misconduct)\b', | |
| re.IGNORECASE | |
| ) | |
| KNOWN_RETRACTED = { | |
| "10.1016/s0140-6736(97)11096-0": { | |
| "title": "Wakefield MMR vaccine-autism study", | |
| "year": 1998, | |
| "reason": "Data fabrication — Wakefield et al.", | |
| }, | |
| "10.1126/science.1254166": { | |
| "title": "LaCour political persuasion study", | |
| "year": 2014, | |
| "reason": "Fabricated data — LaCour & Green", | |
| }, | |
| "10.1038/nature13187": { | |
| "title": "STAP cell study", | |
| "year": 2014, | |
| "reason": "Image manipulation — Obokata et al.", | |
| }, | |
| "10.1097/00007632-200207150-00020": { | |
| "title": "Spine surgery outcomes study", | |
| "year": 2002, | |
| "reason": "Data fabrication — Schön et al.", | |
| }, | |
| "10.1016/j.cell.2009.01.043": { | |
| "title": "Anversa cardiac stem cell study", | |
| "year": 2009, | |
| "reason": "Data fabrication — Anversa lab", | |
| }, | |
| } | |
| CROSSREF_API = "https://api.crossref.org/works/{doi}" | |
| def analyze(self, text: str) -> RetractionResult: | |
| dois = self._extract_dois(text) | |
| signals = self._check_signals(text) | |
| flags = [] | |
| retracted = [] | |
| for doi in dois: | |
| doi_clean = doi.lower().rstrip('.') | |
| if doi_clean in self.KNOWN_RETRACTED: | |
| info = self.KNOWN_RETRACTED[doi_clean] | |
| retracted.append(doi_clean) | |
| flags.append(RetractionFlag( | |
| flag_type = "retracted_citation", | |
| severity = "high", | |
| description = ( | |
| f"Paper cites a RETRACTED study: " | |
| f"'{info['title']}' ({info['year']}). " | |
| f"Reason: {info['reason']}. " | |
| f"Building on retracted work undermines " | |
| f"the validity of this paper's conclusions." | |
| ), | |
| evidence = ( | |
| f"DOI: {doi_clean} | " | |
| f"Retraction reason: {info['reason']}" | |
| ), | |
| suggestion = ( | |
| "Remove or replace citations to retracted work. " | |
| "Check all citations against Retraction Watch " | |
| "database at retractionwatch.com." | |
| ), | |
| )) | |
| unchecked = [d for d in dois | |
| if d.lower().rstrip('.') not in self.KNOWN_RETRACTED] | |
| api_retracted = self._check_crossref(unchecked[:5]) | |
| for doi, reason in api_retracted: | |
| retracted.append(doi) | |
| flags.append(RetractionFlag( | |
| flag_type = "retracted_citation_live", | |
| severity = "high", | |
| description = ( | |
| f"CrossRef database confirms this DOI " | |
| f"is associated with a retracted or " | |
| f"corrected publication: {reason}" | |
| ), | |
| evidence = f"DOI: {doi} | Source: CrossRef API", | |
| suggestion = ( | |
| "Verify this citation on Retraction Watch. " | |
| "Replace with non-retracted alternative if available." | |
| ), | |
| )) | |
| if signals: | |
| flags.append(RetractionFlag( | |
| flag_type = "retraction_language_detected", | |
| severity = "medium", | |
| description = ( | |
| f"Text contains {len(signals)} retraction-related " | |
| f"term(s): {', '.join(set(signals[:5]))}. " | |
| f"This may indicate the paper discusses or " | |
| f"references retracted work." | |
| ), | |
| evidence = f"Terms found: {', '.join(set(signals[:8]))}", | |
| suggestion = ( | |
| "Review all references containing retraction " | |
| "language. Verify each citation is still valid." | |
| ), | |
| )) | |
| if len(dois) == 0: | |
| flags.append(RetractionFlag( | |
| flag_type = "no_dois_found", | |
| severity = "low", | |
| description = ( | |
| "No DOIs detected in paper text. " | |
| "Retraction checking requires DOIs " | |
| "(format: 10.XXXX/...). " | |
| "Paste references section for full analysis." | |
| ), | |
| evidence = "No DOI patterns found in text", | |
| suggestion = ( | |
| "Include full references with DOIs. " | |
| "Check citations manually at retractionwatch.com." | |
| ), | |
| )) | |
| score = self._aggregate_score(retracted, dois, signals) | |
| level = self._risk(score, len(retracted)) | |
| summary = self._build_summary(dois, retracted, score, level) | |
| return RetractionResult( | |
| dois_found = dois, | |
| retracted_found = retracted, | |
| checked_count = len(dois), | |
| retraction_score = round(score, 4), | |
| risk_level = level, | |
| summary = summary, | |
| flags = flags, | |
| flags_count = len(flags), | |
| ) | |
| def _extract_dois(self, text: str) -> list: | |
| dois = [] | |
| for m in self.DOI_PATTERN.finditer(text): | |
| # ✅ FIX: only strip . and , — NOT ) so Wakefield DOI intact | |
| doi = m.group(1).rstrip('.,;') | |
| if doi not in dois: | |
| dois.append(doi) | |
| return dois[:20] | |
| def _check_signals(self, text: str) -> list: | |
| return self.RETRACTION_SIGNALS.findall(text) | |
| def _check_crossref(self, dois: list) -> list: | |
| retracted = [] | |
| for doi in dois: | |
| try: | |
| url = self.CROSSREF_API.format( | |
| doi=urllib.parse.quote(doi, safe='') | |
| ) | |
| req = urllib.request.Request( | |
| url, | |
| headers={"User-Agent": "SciPeerAI/1.0"} | |
| ) | |
| with urllib.request.urlopen(req, timeout=3) as resp: | |
| data = json.loads(resp.read()) | |
| msg = data.get('message', {}) | |
| title = ' '.join(msg.get('title', [])).lower() | |
| subtype = msg.get('subtype', '').lower() | |
| if 'retract' in title or subtype == 'retraction': | |
| retracted.append((doi, f"Type: {subtype}")) | |
| time.sleep(0.2) | |
| except Exception: | |
| pass | |
| return retracted | |
| def _aggregate_score(self, retracted, dois, signals) -> float: | |
| score = 0.0 | |
| if retracted: | |
| score += 0.6 * min(len(retracted), 3) / 3 | |
| if signals: | |
| score += 0.2 * min(len(signals), 5) / 5 | |
| if not dois and not signals: | |
| score = 0.0 | |
| return min(round(score, 4), 1.0) | |
| def _risk(self, score: float, n_retracted: int) -> str: | |
| if n_retracted >= 1 or score >= 0.6: | |
| return "critical" | |
| if score >= 0.3: | |
| return "high" | |
| if score >= 0.1: | |
| return "medium" | |
| return "low" | |
| def _build_summary(self, dois, retracted, score, level) -> str: | |
| if not dois: | |
| return ( | |
| "Retraction Check: No DOIs found in text. " | |
| "Paste full references section with DOIs " | |
| "for retraction database matching. " | |
| "Risk level: LOW." | |
| ) | |
| pct = round(score * 100) | |
| return ( | |
| f"Retraction Check analyzed {len(dois)} DOI(s). " | |
| f"{len(retracted)} retracted citation(s) detected. " | |
| f"Risk score: {pct}%. " | |
| f"Risk level: {level.upper()}." | |
| ) |