BibGuard / src /checkers /citation_quality_checker.py
thinkwee
init
46df5f0
"""
Citation quality checker.
Validates:
- Old citations (30 or more years old) that might need updating
- Citation formatting patterns (et al., hardcoded citations, etc.)
"""
import re
from typing import List, Dict
from datetime import datetime
from collections import defaultdict
from .base import BaseChecker, CheckResult, CheckSeverity
class CitationQualityChecker(BaseChecker):
    """Flag old in-text citation years and common citation formatting issues.

    The checker only looks at LaTeX source text, one line at a time, and
    skips comment lines. It reports:

    - years visible in the text that are at least ``OLD_CITATION_YEARS`` old,
    - "et al" written without the trailing period,
    - numeric ``[1]``-style citations outside command definitions/arguments,
    - hand-written "(Author, 2010)" citations on lines without ``\\cite``.
    """

    name = "citation_quality"
    display_name = "Citation Quality"
    description = "Check citation age, balance, and formatting"

    # Thresholds
    OLD_CITATION_YEARS = 30  # Citations at least this many years old get flagged
    CURRENT_YEAR = datetime.now().year  # Evaluated once, when the class is defined

    # Any year from 1900 to 2099. The range is deliberately wider than the
    # years that are currently "old" so that pre-1980 citations are caught and
    # the pattern does not silently stop working as time passes.
    _YEAR = r'(?:19|20)\d{2}'
    # A place where a year may appear: either a whole parenthetical containing
    # a year, e.g. "(Smith, 2010)", or a standalone year token.
    _YEAR_CONTEXT_RE = re.compile(rf'\([^)]*{_YEAR}[^)]*\)|\b{_YEAR}\b')
    # A year that is not part of a longer digit run (so "19000" is not 1900).
    _YEAR_RE = re.compile(rf'(?<!\d){_YEAR}(?!\d)')

    # "et al" not followed by a period.
    _ET_AL_NO_PERIOD_RE = re.compile(r'\bet al\b(?!\.)')
    # "[1]"-style numeric reference.
    _NUMERIC_CITE_RE = re.compile(r'\[\d+\]')
    # "[1]" used as an argument directly after a command, e.g. \foo[1].
    _COMMAND_OPT_ARG_RE = re.compile(r'\\[a-zA-Z]+\[\d+\]')
    # A command with a brace argument, e.g. "\section{".
    _COMMAND_WITH_BRACE_RE = re.compile(r'\\[a-zA-Z]+\{')
    # "(Smith, 2010)", "(Smith et al., 2010)", "(Smith 2010)", ...
    _HARDCODED_CITE_RE = re.compile(
        r'\([A-Z][a-z]+(?:\s+et\s+al\.?)?,?\s*\d{4}\)'
    )
    # Substrings that mark a line as a macro definition.
    _COMMAND_DEFINITION_MARKERS = ('\\newcommand', '\\renewcommand', '\\def')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all citation-quality checks on *tex_content*.

        Args:
            tex_content: LaTeX source to analyse.
            config: Accepted for interface compatibility; not read here.

        Returns:
            Old-citation results followed by formatting results.
        """
        # This checker works best with bib content, but some analysis is
        # possible on the tex file alone by looking at citation patterns.
        results = []
        results.extend(self._check_old_citations_in_text(tex_content))
        results.extend(self._check_citation_formatting(tex_content))
        return results

    def _check_old_citations_in_text(self, content: str) -> List[CheckResult]:
        """Report years visible in the text that are old enough to flag.

        Every year found in a parenthetical is examined, not just the first,
        so "(Smith, 2015; Jones, 1985)" still reports 1985. Each distinct
        year is reported once, at the first line where it appears.

        Args:
            content: LaTeX source to scan.

        Returns:
            One INFO result per distinct old year.
        """
        results = []
        reported_years = set()

        for line_num, line in enumerate(content.split('\n'), 1):
            # Skip comments using base class method
            if self._is_comment_line(line):
                continue

            for context in self._YEAR_CONTEXT_RE.finditer(line):
                for year_text in self._YEAR_RE.findall(context.group()):
                    year = int(year_text)
                    age = self.CURRENT_YEAR - year
                    if age < self.OLD_CITATION_YEARS or year in reported_years:
                        continue
                    reported_years.add(year)
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Citation from {year} ({age} years old)",
                        line_number=line_num,
                        suggestion="Consider if there's more recent work on this topic"
                    ))

        return results

    def _looks_like_numeric_citation(self, line: str) -> bool:
        """Return True if *line* seems to contain a "[1]"-style citation."""
        if not self._NUMERIC_CITE_RE.search(line):
            return False
        # "[1]" inside a macro definition is an argument count, not a citation.
        if any(marker in line for marker in self._COMMAND_DEFINITION_MARKERS):
            return False
        # "[1]" directly after a command name is a command argument.
        if self._COMMAND_OPT_ARG_RE.search(line):
            return False
        # Only flag if it looks like an actual citation in running text: no
        # \cite on the line and no command with a brace argument near the
        # start of the line (first 20 characters).
        if '\\cite' in line:
            return False
        return not self._COMMAND_WITH_BRACE_RE.search(line[:20])

    def _check_citation_formatting(self, content: str) -> List[CheckResult]:
        """Check each non-comment line for common citation formatting issues.

        The three checks are independent: a line that is exempt from the
        numeric-citation check is still examined for hardcoded citations.

        Args:
            content: LaTeX source to scan.

        Returns:
            Results in line order; a line may produce several results.
        """
        results = []

        for line_num, line in enumerate(content.split('\n'), 1):
            # Same comment-skipping rule as the old-citation check.
            if self._is_comment_line(line):
                continue

            # Check for "et al" without period
            if self._ET_AL_NO_PERIOD_RE.search(line):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="'et al' should be 'et al.'",
                    line_number=line_num,
                    suggestion="Add period after 'et al.'"
                ))

            # Check for "[1]" style citations (might want natbib style)
            if self._looks_like_numeric_citation(line):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Numeric citation style detected",
                    line_number=line_num,
                    suggestion="Consider author-year style for better readability"
                ))

            # Check for hardcoded citations instead of \cite
            if self._HARDCODED_CITE_RE.search(line) and '\\cite' not in line:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Appears to be hardcoded citation instead of \\cite",
                    line_number=line_num,
                    line_content=line.strip()[:80],
                    suggestion="Use \\cite{} for proper bibliography management"
                ))

        return results