Spaces:

Abu-Sameer-66
/

SciPeerAI-API

Sleeping

File size: 9,340 Bytes

b5c2bb1

# src/scipeerai/modules/retraction_checker.py
#
# Retraction Watch Checker
# Checks whether a paper cites retracted studies, using a built-in
# list of well-known retractions plus the CrossRef API.
#
# This is the most impactful module — catches papers
# that build on fraudulent or retracted foundations.

import re
import time
import urllib.request
import urllib.parse
import json
from dataclasses import dataclass, field


@dataclass
class RetractionFlag:
    """A single finding reported by the retraction checker."""

    # Category of the finding, e.g. "retracted_citation" or "no_dois_found".
    flag_type: str
    # Severity label; the checker emits "high", "medium" or "low".
    severity: str
    # Human-readable explanation of what was found.
    description: str
    # Supporting detail (DOI, matched terms, data source).
    evidence: str
    # Recommended follow-up action for the author or reviewer.
    suggestion: str


@dataclass
class RetractionResult:
    """Aggregated outcome of one retraction analysis run."""

    # DOIs extracted from the analysed text.
    dois_found: list
    # DOIs identified as retracted.
    retracted_found: list
    # Number of DOIs considered by the analysis.
    checked_count: int
    # Aggregate risk score as a float.
    retraction_score: float
    # Risk label: "critical", "high", "medium" or "low".
    risk_level: str
    # One-paragraph, human-readable summary.
    summary: str
    # RetractionFlag entries; each instance gets its own fresh list.
    flags: list = field(default_factory=list)
    # Number of flags; not derived automatically from `flags`.
    flags_count: int = 0


class RetractionChecker:
    """
    Retraction checker for cited DOIs.

    Extracts DOIs from paper text, matches them against a small
    built-in table of known retractions (KNOWN_RETRACTED), asks the
    CrossRef REST API about a limited number of the remaining DOIs,
    and scans the text for retraction-related vocabulary. The
    findings are combined into a RetractionResult.
    """

    # A DOI must be introduced by "doi.org/", "doi:" or "DOI".
    # Parentheses are allowed inside the captured DOI because some
    # real DOIs contain them (e.g. the Wakefield DOI below).
    DOI_PATTERN = re.compile(
        r'(?:doi\.org/|doi:|DOI:?\s*)'
        r'(10\.\d{4,9}/[^\s\],;"\']+)',
        re.IGNORECASE
    )

    RETRACTION_SIGNALS = re.compile(
        r'\b(?:retract(?:ed|ion)|withdrawn|'
        r'erratum|correction|expression\s+of\s+concern|'
        r'fraud|fabricat(?:ed|ion)|misconduct)\b',
        re.IGNORECASE
    )

    # Keys are lower-case DOIs; lookups normalise with _normalize_doi().
    KNOWN_RETRACTED = {
        "10.1016/s0140-6736(97)11096-0": {
            "title":  "Wakefield MMR vaccine-autism study",
            "year":   1998,
            "reason": "Data fabrication — Wakefield et al.",
        },
        "10.1126/science.1254166": {
            "title":  "LaCour political persuasion study",
            "year":   2014,
            "reason": "Fabricated data — LaCour & Green",
        },
        "10.1038/nature13187": {
            "title":  "STAP cell study",
            "year":   2014,
            "reason": "Image manipulation — Obokata et al.",
        },
        "10.1097/00007632-200207150-00020": {
            "title":  "Spine surgery outcomes study",
            "year":   2002,
            "reason": "Data fabrication — Schön et al.",
        },
        "10.1016/j.cell.2009.01.043": {
            "title":  "Anversa cardiac stem cell study",
            "year":   2009,
            "reason": "Data fabrication — Anversa lab",
        },
    }

    CROSSREF_API = "https://api.crossref.org/works/{doi}"

    # Upper bound on DOIs returned by _extract_dois().
    MAX_DOIS = 20
    # Upper bound on DOIs sent to CrossRef per analyze() call.
    MAX_API_CHECKS = 5

    def analyze(self, text: str) -> "RetractionResult":
        """
        Run the full retraction check on `text`.

        Args:
            text: Paper text, ideally including the references section.

        Returns:
            RetractionResult with the DOIs found, the DOIs judged
            retracted, the aggregate score, the risk level, a summary
            and one RetractionFlag per finding.
        """
        dois      = self._extract_dois(text)
        signals   = self._check_signals(text)
        flags     = []
        retracted = []
        unchecked = []

        # Single pass: known retractions are flagged immediately,
        # everything else is queued for the CrossRef lookup.
        for doi in dois:
            doi_clean = self._normalize_doi(doi)
            info = self.KNOWN_RETRACTED.get(doi_clean)
            if info is None:
                unchecked.append(doi)
                continue
            retracted.append(doi_clean)
            flags.append(RetractionFlag(
                flag_type   = "retracted_citation",
                severity    = "high",
                description = (
                    "Paper cites a RETRACTED study: "
                    f"'{info['title']}' ({info['year']}). "
                    f"Reason: {info['reason']}. "
                    "Building on retracted work undermines "
                    "the validity of this paper's conclusions."
                ),
                evidence    = (
                    f"DOI: {doi_clean} | "
                    f"Retraction reason: {info['reason']}"
                ),
                suggestion  = (
                    "Remove or replace citations to retracted work. "
                    "Check all citations against Retraction Watch "
                    "database at retractionwatch.com."
                ),
            ))

        api_retracted = self._check_crossref(unchecked[:self.MAX_API_CHECKS])
        for doi, reason in api_retracted:
            retracted.append(doi)
            flags.append(RetractionFlag(
                flag_type   = "retracted_citation_live",
                severity    = "high",
                description = (
                    "CrossRef database confirms this DOI "
                    "is associated with a retracted or "
                    f"corrected publication: {reason}"
                ),
                evidence    = f"DOI: {doi} | Source: CrossRef API",
                suggestion  = (
                    "Verify this citation on Retraction Watch. "
                    "Replace with non-retracted alternative if available."
                ),
            ))

        if signals:
            # dict.fromkeys de-duplicates while keeping first-seen order,
            # so the flag text is identical from run to run.
            shown_terms    = ', '.join(dict.fromkeys(signals[:5]))
            evidence_terms = ', '.join(dict.fromkeys(signals[:8]))
            flags.append(RetractionFlag(
                flag_type   = "retraction_language_detected",
                severity    = "medium",
                description = (
                    f"Text contains {len(signals)} retraction-related "
                    f"term(s): {shown_terms}. "
                    "This may indicate the paper discusses or "
                    "references retracted work."
                ),
                evidence    = f"Terms found: {evidence_terms}",
                suggestion  = (
                    "Review all references containing retraction "
                    "language. Verify each citation is still valid."
                ),
            ))

        if not dois:
            flags.append(RetractionFlag(
                flag_type   = "no_dois_found",
                severity    = "low",
                description = (
                    "No DOIs detected in paper text. "
                    "Retraction checking requires DOIs "
                    "(format: 10.XXXX/...). "
                    "Paste references section for full analysis."
                ),
                evidence    = "No DOI patterns found in text",
                suggestion  = (
                    "Include full references with DOIs. "
                    "Check citations manually at retractionwatch.com."
                ),
            ))

        score   = self._aggregate_score(retracted, dois, signals)
        level   = self._risk(score, len(retracted))
        summary = self._build_summary(dois, retracted, score, level)

        return RetractionResult(
            dois_found       = dois,
            retracted_found  = retracted,
            checked_count    = len(dois),
            retraction_score = round(score, 4),
            risk_level       = level,
            summary          = summary,
            flags            = flags,
            flags_count      = len(flags),
        )

    @staticmethod
    def _normalize_doi(doi: str) -> str:
        """Return the lower-case lookup key used for KNOWN_RETRACTED."""
        return doi.lower().rstrip('.')

    @staticmethod
    def _trim_doi(raw: str) -> str:
        """
        Strip sentence punctuation and unbalanced ')' from a DOI match.

        A closing parenthesis is removed only when the match holds more
        ')' than '(', i.e. it belongs to the surrounding prose, as in
        "(doi:10.1038/nature13187)". Parentheses that are part of the
        DOI itself stay intact.
        """
        doi = raw
        while True:
            trimmed = doi.rstrip('.,;')
            if trimmed.endswith(')') and trimmed.count(')') > trimmed.count('('):
                trimmed = trimmed[:-1]
            if trimmed == doi:
                return doi
            doi = trimmed

    def _extract_dois(self, text: str) -> list:
        """
        Return up to MAX_DOIS distinct DOIs in order of first appearance.

        Duplicates are detected case-insensitively so the same DOI
        written in two casings is reported (and later scored) once.
        The first-seen spelling is the one returned.
        """
        dois = []
        seen = set()
        for m in self.DOI_PATTERN.finditer(text):
            doi = self._trim_doi(m.group(1))
            key = doi.lower()
            if key in seen:
                continue
            seen.add(key)
            dois.append(doi)
            if len(dois) >= self.MAX_DOIS:
                break
        return dois

    def _check_signals(self, text: str) -> list:
        """Return every retraction-related term occurrence found in `text`."""
        return self.RETRACTION_SIGNALS.findall(text)

    def _check_crossref(self, dois: list) -> list:
        """
        Query CrossRef for each DOI and collect those that look retracted.

        A work counts as retracted when its title contains "retract" or
        its `subtype` field equals "retraction". The lookup is
        best-effort: any failure for a DOI is ignored and the DOI is
        treated as not retracted.

        Returns:
            List of `(doi, reason)` tuples.
        """
        retracted = []
        for doi in dois:
            try:
                url = self.CROSSREF_API.format(
                    doi=urllib.parse.quote(doi, safe='')
                )
                req = urllib.request.Request(
                    url,
                    headers={"User-Agent": "SciPeerAI/1.0"}
                )
                with urllib.request.urlopen(req, timeout=3) as resp:
                    data    = json.loads(resp.read())
                    msg     = data.get('message', {})
                    # `or` guards against JSON null for either field.
                    title   = ' '.join(msg.get('title') or []).lower()
                    subtype = (msg.get('subtype') or '').lower()
                    if 'retract' in title or subtype == 'retraction':
                        retracted.append((doi, f"Type: {subtype}"))
                # Short pause between successful requests.
                time.sleep(0.2)
            except Exception:
                # Deliberately broad: network, HTTP and parsing errors
                # must never abort the overall analysis.
                continue
        return retracted

    def _aggregate_score(self, retracted, dois, signals) -> float:
        """
        Combine retraction hits and language signals into one score.

        Retracted citations contribute up to 0.6 (saturating at three),
        signal terms contribute up to 0.2 (saturating at five).
        """
        score = 0.0
        if retracted:
            score += 0.6 * min(len(retracted), 3) / 3
        if signals:
            score += 0.2 * min(len(signals), 5) / 5
        if not dois and not signals:
            # Nothing to base a score on.
            score = 0.0
        return min(round(score, 4), 1.0)

    def _risk(self, score: float, n_retracted: int) -> str:
        """
        Map a score and retraction count to a risk label.

        Any retracted citation is "critical" regardless of the score.
        """
        if n_retracted >= 1 or score >= 0.6:
            return "critical"
        if score >= 0.3:
            return "high"
        if score >= 0.1:
            return "medium"
        return "low"

    def _build_summary(self, dois, retracted, score, level) -> str:
        """
        Build the human-readable summary line.

        The reported risk level always reflects `level`, including the
        no-DOI case, where signal terms alone can raise it above "low".
        """
        if not dois:
            return (
                "Retraction Check: No DOIs found in text. "
                "Paste full references section with DOIs "
                "for retraction database matching. "
                f"Risk level: {level.upper()}."
            )
        pct = round(score * 100)
        return (
            f"Retraction Check analyzed {len(dois)} DOI(s). "
            f"{len(retracted)} retracted citation(s) detected. "
            f"Risk score: {pct}%. "
            f"Risk level: {level.upper()}."
        )