Spaces:
Running
Running
| # Citation Integrity Analyzer | |
| # --------------------------- | |
| # Citations are the backbone of science. | |
| # When they are manipulated — through self-citation | |
| # abuse, retracted sources, or citation cartels — | |
| # the entire knowledge chain gets corrupted. | |
| # | |
| # This module audits citation patterns in paper text | |
| # and checks references against retraction databases. | |
| import re | |
| import json | |
| import urllib.request | |
| import urllib.error | |
| from dataclasses import dataclass | |
| # ── data structures ─────────────────────────────────────────── | |
@dataclass
class CitationFlag:
    """A single citation-integrity finding raised by the analyzer.

    The class must be a dataclass: the analyzer constructs it with keyword
    arguments (``CitationFlag(flag_type=..., severity=..., ...)``), which
    requires the generated ``__init__``.
    """

    # machine-readable identifier of the finding, e.g. "excessive_self_citation"
    flag_type: str
    # "high", "medium" or "low" — the values the risk scoring assigns weights to
    severity: str
    # human-readable explanation of what was detected
    description: str
    # the concrete numbers or text excerpt that triggered the flag
    evidence: str
    # recommended remediation for the author
    suggestion: str
@dataclass
class CitationResult:
    """Aggregate outcome of one citation-integrity analysis run.

    The class must be a dataclass: the analyzer builds it with keyword
    arguments, which requires the generated ``__init__``.
    """

    # number of distinct citation markers found in the text
    total_citations: int
    # number of citations attributed to the paper's own author
    self_citations: int
    # self_citations / total_citations (0.0 when there are no citations)
    self_citation_ratio: float
    # number of broad claims found without a nearby citation
    unsupported_claims: int
    # list of CitationFlag findings
    flags: list
    # weighted sum of flag severities
    risk_score: float
    # "low", "medium", "high" or "critical"
    risk_level: str
    # one-paragraph human-readable summary of the audit
    summary: str
| # ── main class ──────────────────────────────────────────────── | |
class CitationAnalyzer:
    """
    Two-layer citation analysis.

    Layer 1 — Pattern analysis: self-citation ratio,
    unsupported claims, citation density problems.
    Layer 2 — External validation: queries Semantic Scholar with the
    author's surname. The response is not yet turned into flags; see
    ``_check_retraction_signals``.
    """

    # ratio above this is suspicious self-citation
    SELF_CITE_THRESHOLD = 0.30

    # claims that need citations but often don't have them
    CLAIM_MARKERS = [
        "studies show", "research shows", "evidence suggests",
        "it is well known", "it has been shown", "it is established",
        "previous work shows", "literature suggests",
        "experts agree", "scientists believe"
    ]

    def __init__(self):
        self._semantic_scholar_url = (
            "https://api.semanticscholar.org/graph/v1/paper/search"
        )

    # ── public method ─────────────────────────────────────────

    def analyze(self, text: str, author_name: str = "") -> "CitationResult":
        """
        Run the full citation integrity analysis.

        Args:
            text: Full paper text.
            author_name: Primary author — used for self-citation detection
                and for the external lookup. When empty, both are skipped.

        Returns:
            A CitationResult holding the counts, the raised flags, the
            risk score/level and a textual summary.
        """
        citations = self._extract_citations(text)
        unsupported = self._find_unsupported_claims(text)
        total = len(citations)

        # Self-citations are counted per occurrence while ``total`` counts
        # distinct markers, so the raw count can exceed the total. Clamp it
        # so the reported ratio always stays within [0, 1].
        self_cites = min(
            self._count_self_citations(text, author_name), total
        )
        self_ratio = (self_cites / total) if total > 0 else 0.0

        flags = []
        flags.extend(self._check_self_citation_ratio(
            self_cites, total, self_ratio
        ))
        flags.extend(self._check_unsupported_claims(unsupported))
        flags.extend(self._check_citation_density(text, total))
        flags.extend(self._check_citation_patterns(text, citations))

        # try live retraction check — graceful fallback
        if author_name:
            flags.extend(
                self._check_retraction_signals(citations, author_name)
            )

        risk_score = self._calculate_risk(flags)
        risk_level = self._get_risk_level(risk_score)
        return CitationResult(
            total_citations=total,
            self_citations=self_cites,
            self_citation_ratio=round(self_ratio, 3),
            unsupported_claims=len(unsupported),
            flags=flags,
            risk_score=round(risk_score, 3),
            risk_level=risk_level,
            summary=self._write_summary(flags, risk_level, total),
        )

    # ── extraction helpers ────────────────────────────────────

    def _extract_citations(self, text: str) -> list:
        """
        Extract the distinct citation markers found in ``text``.

        Handles: [1], [1,2], [1-3], (Smith, 2020), (Smith et al., 2019),
        (Smith & Jones, 2020).

        Returns:
            Unique markers. Results are grouped by pattern (in the order
            the patterns are listed) and, within a pattern, kept in order
            of first appearance.
        """
        patterns = [
            r'\[\d+(?:,\s*\d+)*\]',                            # [1] or [1,2,3]
            r'\[\d+\-\d+\]',                                   # [1-3]
            r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s+\d{4}\)',    # (Smith, 2020)
            r'\([A-Z][a-z]+\s+&\s+[A-Z][a-z]+,?\s+\d{4}\)',    # (Smith & Jones, 2020)
        ]
        citations = []
        for pattern in patterns:
            citations.extend(re.findall(pattern, text))
        # dict preserves insertion order, so this de-duplicates stably
        return list(dict.fromkeys(citations))

    def _count_self_citations(self, text: str, author_name: str) -> int:
        """
        Count occurrences of the author's surname in citation context,
        i.e. followed within a few characters by a year or "et al".

        Args:
            text: Full paper text.
            author_name: Full author name; the last whitespace-separated
                token is taken as the surname.

        Returns:
            Number of matches (0 when no usable surname is given).
        """
        parts = author_name.split() if author_name else []
        if not parts:
            # empty or whitespace-only name: nothing to search for
            return 0
        # extract surname — "Sameer Nadeem" -> "Nadeem"
        surname = parts[-1]
        # The lookbehind stops a short surname from matching inside a longer
        # one (surname "Li" must not match "Ali, 2020").
        pattern = (
            rf'(?<!\w){re.escape(surname)}[,\s]{{0,10}}(?:\d{{4}}|et al)'
        )
        return len(re.findall(pattern, text, re.IGNORECASE))

    def _find_unsupported_claims(self, text: str) -> list:
        """
        Find sentences that contain a broad-claim marker (see
        CLAIM_MARKERS) but no citation in the same sentence.

        "Studies show that X" with no [1] or (Author, year) in the sentence.

        Returns:
            At most five offending sentences (capped for report clarity).
        """
        unsupported = []
        # Do not treat the period of "et al." as a sentence end, otherwise
        # "(Smith et al., 2020)" is cut in half and the claim looks uncited.
        sentences = re.split(r'(?<!et al)[.!?]', text)
        # Accept the same bracket forms _extract_citations recognises:
        # [1], [1,2], [1-3], plus any "(Author ... 2020)" parenthetical.
        citation_re = re.compile(
            r'\[\d+(?:\s*[,\-]\s*\d+)*\]|\([A-Z][a-z]+.*?\d{4}\)'
        )
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 15:
                # too short to be a meaningful claim
                continue
            s_lower = sentence.lower()
            has_claim_marker = any(
                marker in s_lower for marker in self.CLAIM_MARKERS
            )
            if has_claim_marker and not citation_re.search(sentence):
                unsupported.append(sentence)
        return unsupported[:5]  # cap at 5 for report clarity

    # ── flag checks ───────────────────────────────────────────

    def _check_self_citation_ratio(
        self, self_cites: int, total: int, ratio: float
    ) -> list:
        """
        Flag a high self-citation ratio, which inflates the author's
        citation metrics without adding scientific value.

        Returns:
            A list with one "high" flag when ratio >= 0.5, one "medium"
            flag when ratio >= SELF_CITE_THRESHOLD, otherwise empty.
            Always empty when there are fewer than five citations.
        """
        flags = []
        if total < 5:
            return flags  # too few citations to judge pattern
        if ratio >= 0.5:
            flags.append(CitationFlag(
                flag_type="excessive_self_citation",
                severity="high",
                description=(
                    f"{self_cites} out of {total} citations "
                    f"({round(ratio*100)}%) appear to be self-citations. "
                    f"Threshold: {round(self.SELF_CITE_THRESHOLD*100)}%."
                ),
                evidence=f"Self-citation ratio: {round(ratio, 3)}",
                suggestion=(
                    "Review whether all self-citations are necessary. "
                    "Journals typically flag ratios above 20-30%."
                ),
            ))
        elif ratio >= self.SELF_CITE_THRESHOLD:
            flags.append(CitationFlag(
                flag_type="high_self_citation_ratio",
                severity="medium",
                description=(
                    f"Self-citation ratio of {round(ratio*100)}% "
                    f"is above the recommended threshold."
                ),
                evidence=f"Self-citation ratio: {round(ratio, 3)}",
                suggestion=(
                    "Consider whether additional independent sources "
                    "could support the same claims."
                ),
            ))
        return flags

    def _check_unsupported_claims(self, unsupported: list) -> list:
        """
        Flag broad claims that lack any citation support.

        Returns:
            One "high" flag for three or more unsupported claims, one
            "medium" flag for one or two, otherwise an empty list.
        """
        flags = []
        if len(unsupported) >= 3:
            flags.append(CitationFlag(
                flag_type="unsupported_broad_claims",
                severity="high",
                description=(
                    f"{len(unsupported)} broad claim(s) found without "
                    f"supporting citations. These cannot be independently verified."
                ),
                evidence=" | ".join(unsupported[:2]),
                suggestion=(
                    "Each claim beginning with 'studies show' or "
                    "'it is well known' must be backed by specific citations."
                ),
            ))
        elif len(unsupported) >= 1:
            flags.append(CitationFlag(
                flag_type="unsupported_claims",
                severity="medium",
                description=(
                    f"{len(unsupported)} claim(s) make broad assertions "
                    f"without citation support."
                ),
                evidence=unsupported[0] if unsupported else "",
                suggestion="Add specific citations for each broad claim.",
            ))
        return flags

    def _check_citation_density(self, text: str, total: int) -> list:
        """
        Flag long papers that cite very little.

        The page count is estimated at 250 words per page. Only low
        density is checked: a flag is raised when the paper is at least
        five estimated pages long and has fewer than 1.5 citations/page.
        """
        flags = []
        words = len(text.split())
        # rough pages estimate
        pages = max(1, words // 250)
        density = total / pages
        if pages >= 5 and density < 1.5:
            flags.append(CitationFlag(
                flag_type="low_citation_density",
                severity="medium",
                description=(
                    f"Only {total} citations across approximately "
                    f"{pages} pages (density: {round(density, 1)}/page). "
                    f"Well-supported papers typically cite 3-5 sources per page."
                ),
                evidence=f"{total} total citations, ~{pages} pages",
                suggestion=(
                    "Review whether all major claims have adequate "
                    "citation support from independent sources."
                ),
            ))
        return flags

    def _check_citation_patterns(self, text: str, citations: list) -> list:
        """
        Detect 'et al.' overuse, which hides the actual authors being cited.

        The share is computed over the extracted citation markers
        themselves, so the reported "N out of M citations" is always
        consistent (N <= M). ``text`` is accepted for interface
        compatibility but not consulted.

        Returns:
            One "low" flag when more than five citations exist and over
            70% of them use 'et al.', otherwise an empty list.
        """
        flags = []
        total = len(citations)
        if total <= 5:
            return flags  # too few citations to judge pattern
        # et al. overuse — hides who is actually being cited
        et_al_count = sum(
            1 for c in citations
            if re.search(r'et\s+al', c, re.IGNORECASE)
        )
        et_al_ratio = et_al_count / total
        if et_al_ratio > 0.7:
            flags.append(CitationFlag(
                flag_type="et_al_overuse",
                severity="low",
                description=(
                    f"{et_al_count} out of {len(citations)} citations "
                    f"use 'et al.' ({round(et_al_ratio*100)}%). "
                    f"This obscures the actual authorship of cited works."
                ),
                evidence=f"et al. ratio: {round(et_al_ratio, 2)}",
                suggestion=(
                    "For papers with 3 or fewer authors, "
                    "list all names. Reserve et al. for 4+ authors."
                ),
            ))
        return flags

    def _check_retraction_signals(
        self, citations: list, author_name: str
    ) -> list:
        """
        Query Semantic Scholar for papers matching the author's surname.

        This is a placeholder: the response is fetched and parsed but not
        yet evaluated, so the returned list is currently always empty.
        Full retraction database integration is a roadmap item.

        Any failure of the external call (network, HTTP, malformed JSON)
        is deliberately swallowed — the check is best-effort and must not
        abort the analysis.
        """
        # Imported locally so the method does not depend on where (or
        # whether) the module imports urllib.parse.
        from urllib.parse import quote

        flags = []
        if not author_name or not citations:
            return flags
        name_parts = author_name.split()
        if not name_parts:
            return flags  # whitespace-only name: nothing to query
        url = (
            f"{self._semantic_scholar_url}"
            f"?query={quote(name_parts[-1])}&fields=title,year,authors&limit=5"
        )
        try:
            req = urllib.request.Request(
                url,
                headers={"User-Agent": "SciPeerAI/0.1 Research Tool"}
            )
            with urllib.request.urlopen(req, timeout=10) as resp:
                json.loads(resp.read().decode())
            # A successful parse only shows the API is reachable.
            # Full retraction checking needs Retraction Watch API
            # which requires institutional access (Phase D).
        except Exception:
            pass  # external API down — silent fail, not critical
        return flags

    # ── scoring ───────────────────────────────────────────────

    def _calculate_risk(self, flags: list) -> float:
        """Sum per-severity weights over all flags, capped at 1.0."""
        weights = {"high": 0.35, "medium": 0.20, "low": 0.08}
        score = sum(weights.get(f.severity, 0) for f in flags)
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        """Map a risk score to "critical", "high", "medium" or "low"."""
        if score >= 0.7:
            return "critical"
        if score >= 0.4:
            return "high"
        if score >= 0.2:
            return "medium"
        return "low"

    def _write_summary(
        self, flags: list, risk_level: str, total: int
    ) -> str:
        """
        Build the one-paragraph human-readable audit summary.

        Low-severity flags are listed too, so a report that contains only
        low-severity findings no longer reads "flagged ." with nothing
        after it.
        """
        if not flags:
            return (
                f"Analyzed {total} citation(s). "
                f"No citation integrity issues detected."
            )
        high = sum(1 for f in flags if f.severity == "high")
        med = sum(1 for f in flags if f.severity == "medium")
        low = sum(1 for f in flags if f.severity == "low")
        parts = []
        if high:
            parts.append(
                f"{high} high-severity issue{'s' if high > 1 else ''}"
            )
        if med:
            parts.append(
                f"{med} medium-severity concern{'s' if med > 1 else ''}"
            )
        if low:
            parts.append(
                f"{low} low-severity note{'s' if low > 1 else ''}"
            )
        if not parts:
            # flags with an unrecognised severity: still report a count
            count = len(flags)
            parts.append(f"{count} issue{'s' if count > 1 else ''}")
        return (
            f"Analyzed {total} citation(s). "
            f"Citation audit flagged {', '.join(parts)}. "
            f"Risk level: {risk_level.upper()}."
        )
| # ── fix missing import ──────────────────────────────────────── | |
| import urllib.parse |