# SciPeerAI-API / src / scipeerai / modules / citation_analyzer.py
# Abu-Sameer-66
# fix: add requests dependency — v2.3.0 hotfix
# b625b53
# Citation Integrity Analyzer
# ---------------------------
# Citations are the backbone of science.
# When they are manipulated — through self-citation
# abuse, retracted sources, or citation cartels —
# the entire knowledge chain gets corrupted.
#
# This module audits citation patterns in paper text
# and checks references against retraction databases.
import re
import json
import urllib.request
import urllib.error
from dataclasses import dataclass
# ── data structures ───────────────────────────────────────────
@dataclass
class CitationFlag:
    """A single citation-integrity finding raised by one analyzer check."""

    # machine-readable identifier of the check that fired
    flag_type: str
    # "high", "medium" or "low" — drives the risk weighting
    severity: str
    # human-readable explanation of the problem
    description: str
    # the concrete text or number that triggered the flag
    evidence: str
    # recommended remediation for the author
    suggestion: str
@dataclass
class CitationResult:
    """Aggregate outcome of one citation analysis run."""

    # number of distinct citation markers found in the text
    total_citations: int
    # number of citation-like mentions of the author's surname
    self_citations: int
    # self_citations relative to total_citations
    self_citation_ratio: float
    # number of broad claims found without a citation
    unsupported_claims: int
    # CitationFlag objects raised by the individual checks
    flags: list
    # weighted severity score between 0.0 and 1.0
    risk_score: float
    # "low", "medium", "high" or "critical"
    risk_level: str
    # one-paragraph human-readable digest of the findings
    summary: str
# ── main class ────────────────────────────────────────────────
class CitationAnalyzer:
    """
    Two-layer citation analysis.

    Layer 1 — Pattern analysis: self-citation ratio, unsupported
        broad claims, low citation density and 'et al.' overuse.
    Layer 2 — External lookup: a best-effort query to the Semantic
        Scholar search API using the author's surname. The response
        is not interpreted yet, so this layer currently raises no
        flags. Free API — no key required for basic usage.
    """

    # ratio at or above this is suspicious self-citation
    SELF_CITE_THRESHOLD = 0.30

    # claims that need citations but often don't have them
    CLAIM_MARKERS = [
        "studies show", "research shows", "evidence suggests",
        "it is well known", "it has been shown", "it is established",
        "previous work shows", "literature suggests",
        "experts agree", "scientists believe"
    ]

    # numeric markers: [1], [1,2,3], [1-3], [1, 3-5]
    _NUMERIC_CITE = r'\[\d+(?:\s*[,\-\u2013]\s*\d+)*\]'

    # every marker shape counted as a citation
    _CITATION_PATTERNS = (
        _NUMERIC_CITE,
        r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s+\d{4}\)',    # (Smith, 2020)
        r'\([A-Z][a-z]+\s+&\s+[A-Z][a-z]+,?\s+\d{4}\)',    # (Smith & Jones, 2020)
    )

    # "does this sentence carry any citation at all?" — shares the
    # numeric form with _CITATION_PATTERNS so both agree on [1,2] / [1-3]
    _INLINE_CITATION_RE = re.compile(
        _NUMERIC_CITE + r'|\([A-Z][a-z]+.*?\d{4}\)'
    )

    # Sentence terminator: punctuation followed by whitespace or end of
    # text. The period of "et al." and decimal points ("0.05") are not
    # terminators, otherwise a sentence is cut off before its citation.
    _SENTENCE_END_RE = re.compile(r'(?<!et al)[.!?](?=\s|$)', re.IGNORECASE)

    def __init__(self):
        self._semantic_scholar_url = (
            "https://api.semanticscholar.org/graph/v1/paper/search"
        )

    # ── public method ─────────────────────────────────────────

    def analyze(self, text: str, author_name: str = "") -> "CitationResult":
        """
        Full citation integrity analysis.

        Args:
            text: Full paper text
            author_name: Primary author — used for self-citation
                detection and the external lookup. When empty, both
                are skipped.

        Returns:
            CitationResult with counts, flags, risk score and summary.
        """
        citations = self._extract_citations(text)
        self_cites = self._count_self_citations(text, author_name)
        unsupported = self._find_unsupported_claims(text)

        total = len(citations)
        # self_cites counts every mention while total counts distinct
        # markers, so the raw quotient can exceed 1; clamp it so the
        # reported ratio stays a valid proportion.
        self_ratio = min(self_cites / total, 1.0) if total > 0 else 0.0

        flags = []
        flags.extend(self._check_self_citation_ratio(
            self_cites, total, self_ratio
        ))
        flags.extend(self._check_unsupported_claims(unsupported))
        flags.extend(self._check_citation_density(text, total))
        flags.extend(self._check_citation_patterns(text, citations))

        # try live lookup — graceful fallback inside the helper
        if author_name:
            flags.extend(
                self._check_retraction_signals(citations, author_name)
            )

        risk_score = self._calculate_risk(flags)
        risk_level = self._get_risk_level(risk_score)

        return CitationResult(
            total_citations=total,
            self_citations=self_cites,
            self_citation_ratio=round(self_ratio, 3),
            unsupported_claims=len(unsupported),
            flags=flags,
            risk_score=round(risk_score, 3),
            risk_level=risk_level,
            summary=self._write_summary(flags, risk_level, total),
        )

    # ── extraction helpers ────────────────────────────────────

    def _extract_citations(self, text: str) -> list:
        """
        Extract distinct citation markers from text.

        Handles: [1], [1,2], [1-3], (Smith, 2020),
        (Smith et al., 2019), (Smith & Jones, 2020).

        Returns:
            Unique marker strings, first occurrence order per pattern.
        """
        found = []
        for pattern in self._CITATION_PATTERNS:
            found.extend(re.findall(pattern, text))
        # dict keeps insertion order, so this de-duplicates stably
        return list(dict.fromkeys(found))

    def _count_self_citations(self, text: str, author_name: str) -> int:
        """
        Count citation-like mentions of the author's surname:
        the surname followed (within 10 commas/spaces) by a
        four-digit year or by 'et al'. Case-insensitive.

        Returns 0 when no usable author name is given.
        """
        # extract surname — "Sameer Nadeem" → "Nadeem"
        parts = author_name.split() if author_name else []
        if not parts:
            # empty or whitespace-only name: nothing to look for
            return 0
        surname = parts[-1]

        # (?<!\w) stops a short surname from matching the tail of a
        # longer one, e.g. "Li" inside "(Castelli, 2020)".
        pattern = (
            rf'(?<!\w){re.escape(surname)}'
            rf'[,\s]{{0,10}}(?:\d{{4}}|et al)'
        )
        return len(re.findall(pattern, text, re.IGNORECASE))

    def _find_unsupported_claims(self, text: str) -> list:
        """
        Find sentences that make broad claims without a citation
        in the same sentence.
        "Studies show that X" with no [1] or (Author, year) nearby.

        Returns:
            Up to 5 offending sentences (capped for report clarity),
            stripped and without their terminating punctuation.
        """
        unsupported = []
        for raw in self._SENTENCE_END_RE.split(text):
            sentence = raw.strip()
            if len(sentence) < 15:
                continue  # fragment too short to be a real claim
            lowered = sentence.lower()
            if not any(marker in lowered for marker in self.CLAIM_MARKERS):
                continue
            if self._INLINE_CITATION_RE.search(sentence):
                continue
            unsupported.append(sentence)
            if len(unsupported) == 5:
                break  # cap at 5 for report clarity
        return unsupported

    # ── flag checks ───────────────────────────────────────────

    def _check_self_citation_ratio(
        self, self_cites: int, total: int, ratio: float
    ) -> list:
        """
        High self-citation ratio inflates the author's
        citation metrics without adding scientific value.

        Returns at most one flag: "high" at ratio >= 0.5, "medium"
        at ratio >= SELF_CITE_THRESHOLD. Papers with fewer than 5
        citations are never flagged.
        """
        flags = []
        if total < 5:
            return flags  # too few citations to judge pattern

        if ratio >= 0.5:
            flags.append(CitationFlag(
                flag_type="excessive_self_citation",
                severity="high",
                description=(
                    f"{self_cites} out of {total} citations "
                    f"({round(ratio*100)}%) appear to be self-citations. "
                    f"Threshold: {round(self.SELF_CITE_THRESHOLD*100)}%."
                ),
                evidence=f"Self-citation ratio: {round(ratio, 3)}",
                suggestion=(
                    "Review whether all self-citations are necessary. "
                    "Journals typically flag ratios above 20-30%."
                ),
            ))
        elif ratio >= self.SELF_CITE_THRESHOLD:
            flags.append(CitationFlag(
                flag_type="high_self_citation_ratio",
                severity="medium",
                description=(
                    f"Self-citation ratio of {round(ratio*100)}% "
                    f"is above the recommended threshold."
                ),
                evidence=f"Self-citation ratio: {round(ratio, 3)}",
                suggestion=(
                    "Consider whether additional independent sources "
                    "could support the same claims."
                ),
            ))
        return flags

    def _check_unsupported_claims(self, unsupported: list) -> list:
        """
        Flag broad claims that lack any citation support.

        Three or more claims raise a "high" flag, one or two a
        "medium" flag, none returns an empty list.
        """
        flags = []
        count = len(unsupported)

        if count >= 3:
            flags.append(CitationFlag(
                flag_type="unsupported_broad_claims",
                severity="high",
                description=(
                    f"{count} broad claim(s) found without "
                    f"supporting citations. These cannot be independently verified."
                ),
                evidence=" | ".join(unsupported[:2]),
                suggestion=(
                    "Each claim beginning with 'studies show' or "
                    "'it is well known' must be backed by specific citations."
                ),
            ))
        elif count >= 1:
            flags.append(CitationFlag(
                flag_type="unsupported_claims",
                severity="medium",
                description=(
                    f"{count} claim(s) make broad assertions "
                    f"without citation support."
                ),
                evidence=unsupported[0],
                suggestion="Add specific citations for each broad claim.",
            ))
        return flags

    def _check_citation_density(self, text: str, total: int) -> list:
        """
        Very few citations in a long paper = claims without backing.

        Only the low-density case is checked: texts of roughly five
        pages or more (250 words per page) with fewer than 1.5
        citations per page.
        """
        flags = []
        words = len(text.split())
        # rough pages estimate
        pages = max(1, words // 250)
        density = total / pages

        if pages >= 5 and density < 1.5:
            flags.append(CitationFlag(
                flag_type="low_citation_density",
                severity="medium",
                description=(
                    f"Only {total} citations across approximately "
                    f"{pages} pages (density: {round(density, 1)}/page). "
                    f"Well-supported papers typically cite 3-5 sources per page."
                ),
                evidence=f"{total} total citations, ~{pages} pages",
                suggestion=(
                    "Review whether all major claims have adequate "
                    "citation support from independent sources."
                ),
            ))
        return flags

    def _check_citation_patterns(self, text: str, citations: list) -> list:
        """
        Detect 'et al.' overuse, which hides the actual
        authors being cited.

        The share is computed over the extracted citations
        themselves, so it can never exceed 100%. A "low" flag is
        raised when more than 70% of more than 5 citations use
        'et al.'. `text` is accepted for interface compatibility
        and is not inspected.
        """
        flags = []
        if len(citations) <= 5:
            return flags  # too few citations for a meaningful share

        et_al_count = sum(
            1 for cite in citations
            if re.search(r'et\s+al', cite, re.IGNORECASE)
        )
        et_al_ratio = et_al_count / len(citations)

        if et_al_ratio > 0.7:
            flags.append(CitationFlag(
                flag_type="et_al_overuse",
                severity="low",
                description=(
                    f"{et_al_count} out of {len(citations)} citations "
                    f"use 'et al.' ({round(et_al_ratio*100)}%). "
                    f"This obscures the actual authorship of cited works."
                ),
                evidence=f"et al. ratio: {round(et_al_ratio, 2)}",
                suggestion=(
                    "For papers with 3 or fewer authors, "
                    "list all names. Reserve et al. for 4+ authors."
                ),
            ))
        return flags

    def _check_retraction_signals(
        self, citations: list, author_name: str
    ) -> list:
        """
        Query Semantic Scholar for papers matching the author's surname.

        This is a lightweight reachability probe — not a retraction
        check. The response is parsed but not interpreted, so the
        returned list is always empty for now. Full retraction
        database integration is a roadmap item.

        Never raises: any failure of the external call is ignored.
        """
        flags = []
        if not author_name or not citations:
            return flags

        try:
            surname = author_name.strip().split()[-1]
            query = urllib.parse.quote(surname)
            url = (
                f"{self._semantic_scholar_url}"
                f"?query={query}&fields=title,year,authors&limit=5"
            )
            req = urllib.request.Request(
                url,
                headers={"User-Agent": "SciPeerAI/0.1 Research Tool"}
            )
            with urllib.request.urlopen(req, timeout=10) as resp:
                # Parsed only to confirm the API answered with JSON.
                # Real retraction checking needs the Retraction Watch
                # API, which requires institutional access (Phase D).
                json.loads(resp.read().decode())
        except Exception:
            # Deliberate best-effort: the external API being down,
            # slow or malformed must not fail the analysis.
            pass

        return flags

    # ── scoring ───────────────────────────────────────────────

    def _calculate_risk(self, flags: list) -> float:
        """
        Sum per-severity weights over all flags, capped at 1.0.
        Flags with an unknown severity contribute nothing.
        """
        weights = {"high": 0.35, "medium": 0.20, "low": 0.08}
        score = sum(weights.get(flag.severity, 0) for flag in flags)
        return min(score, 1.0)

    def _get_risk_level(self, score: float) -> str:
        """Map a risk score onto "critical", "high", "medium" or "low"."""
        if score >= 0.7:
            return "critical"
        if score >= 0.4:
            return "high"
        if score >= 0.2:
            return "medium"
        return "low"

    def _write_summary(
        self, flags: list, risk_level: str, total: int
    ) -> str:
        """
        Build the one-paragraph summary for the result.

        Lists the number of findings per severity. Low-severity
        findings are included so the sentence is never left without
        a subject when they are the only flags.
        """
        if not flags:
            return (
                f"Analyzed {total} citation(s). "
                f"No citation integrity issues detected."
            )

        wording = (
            ("high", "high-severity issue"),
            ("medium", "medium-severity concern"),
            ("low", "low-severity note"),
        )
        parts = []
        for severity, noun in wording:
            count = sum(1 for flag in flags if flag.severity == severity)
            if count:
                parts.append(f"{count} {noun}{'s' if count > 1 else ''}")

        if not parts:
            # flags exist but none carries a known severity
            parts.append(
                f"{len(flags)} issue{'s' if len(flags) > 1 else ''}"
            )

        return (
            f"Analyzed {total} citation(s). "
            f"Citation audit flagged {', '.join(parts)}. "
            f"Risk level: {risk_level.upper()}."
        )
# ── fix missing import ────────────────────────────────────────
import urllib.parse