BibGuard / src /checkers /acronym_checker.py
thinkwee
Fix: Enhance acronym detection by supporting `~` in definitions and reducing false positives for undefined or pre-defined usages on the same line.
4a7a641
"""
Acronym and abbreviation checker.
Validates that:
- Acronyms found in text have corresponding full forms defined
- Acronyms are used after their definition
- Only checks acronyms that have matching full forms in the document
"""
import re
from typing import List, Dict, Set, Tuple
from collections import defaultdict
from .base import BaseChecker, CheckResult, CheckSeverity
class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    An acronym is only reported when the document also contains a phrase
    whose word initials spell it; all-caps tokens without such a phrase
    are ignored. Two problems are reported, both as warnings:

    - the acronym is never defined with a ``Full Name (ACRONYM)`` or
      ``(ACRONYM; Full Name)`` construct;
    - the acronym is used before its definition (uses that share a line
      with the definition are tolerated).
    """

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support.
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)",
    # "Full Name (\textbf{ACRONYM})", etc.
    # Group 2 holds the acronym for the first alternative, group 3 for the second.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)[\s~]*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC}) with optional ~
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters, optional plural "s")
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Comprehensive list of common acronyms that don't need definition.
    # A few entries are repeated across categories; the set literal
    # collapses them, so the repetition is harmless.
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',
        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',
        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',
        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',
        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'SSD', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',
        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',
        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',
        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'MAP', 'EM', 'GMM', 'HMM', 'CRF', 'MRF',
        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AAAI', 'IJCAI', 'AISTATS', 'UAI', 'COLT', 'ALT',
        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',
        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'GPU', 'NPU', 'ASIC', 'FPGA',
    }

    # Optional connector word between two initial-bearing words of a full
    # form. The group must NOT consume the whitespace after the connector:
    # the mandatory ``\s+`` that follows it in the assembled pattern does.
    _CONNECTOR_PATTERN = r'(?:\s+(?:of|the|and|for|in|on|with|to))?'

    # Words indicating that a candidate phrase is ordinary sentence text
    # rather than the expansion of an acronym.
    _SENTENCE_WORDS = {
        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
        'can', 'could', 'may', 'might', 'must', 'shall',
        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
        'more', 'most', 'less', 'few', 'several', 'other', 'another',
    }

    # Words allowed inside a full form that do not contribute an initial.
    _FILLER_WORDS = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Report acronyms that are undefined or used before their definition.

        Args:
            tex_content: Raw LaTeX source to analyse.
            config: Accepted for interface compatibility; not read here.

        Returns:
            One warning result per offending acronym (at most one per
            acronym). The list is empty when nothing is found.
        """
        results = []

        # Comment stripping, line lookup and result construction are
        # inherited helpers (defined on the base class, outside this file).
        content = self._remove_comments(tex_content)

        defined_acronyms = self._find_definitions(content)
        all_usages = self._find_all_usages(content)
        acronym_full_forms = self._find_potential_full_forms(content, set(all_usages))

        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Only acronyms with a plausible expansion somewhere in the
            # document are checked.
            full_form = acronym_full_forms.get(acronym)
            if full_form is None:
                continue

            if acronym not in defined_acronyms:
                # First usage should define it.
                line_num = self._find_line_number(content, positions[0])

                # Loose definition check: if the full form sits on the same
                # line as the first use (e.g. "UML (Unified Modeling
                # Language)"), treat it as an informal definition that
                # DEFINITION_PATTERN missed. Whitespace is collapsed so
                # tabs or double spaces in the line do not defeat the
                # comparison against the normalised full form.
                line_content = self._get_line_content(content, line_num)
                normalized_line = ' '.join(line_content.split()).lower()
                if full_form.lower() in normalized_line:
                    continue

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
                continue

            # Defined: look for a usage that precedes the definition.
            def_pos = defined_acronyms[acronym]
            def_line_num = self._find_line_number(content, def_pos)

            for pos in positions:
                if pos >= def_pos:
                    # Positions are collected in ascending order, so no
                    # later entry can precede the definition either.
                    break

                line_num = self._find_line_number(content, pos)

                # A usage on the definition's own line is most likely part
                # of the defining sentence; do not flag it.
                if line_num == def_line_num:
                    continue

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used before definition",
                    line_number=line_num,
                    suggestion="Move definition before first use"
                ))
                break  # One report per acronym is enough.

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching word initials.

        For each acronym (minus a trailing plural ``s``) the text is searched,
        case-insensitively, for consecutive words whose first letters spell
        it, optionally separated by a connector such as "of" or "for".

        Args:
            content: Text to search.
            acronyms: Candidate acronyms; entries in ``COMMON_ACRONYMS`` and
                entries shorter than three letters are skipped.

        Returns:
            Mapping from acronym (as given) to the first accepted phrase,
            with internal whitespace collapsed to single spaces. Acronyms
            without an accepted phrase are absent.
        """
        full_forms = {}

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            acronym_clean = acronym.rstrip('s')  # Remove plural
            if len(acronym_clean) < 3:
                continue

            # One word per letter. Letters are escaped so that any input
            # yields a valid pattern.
            first, rest = acronym_clean[0], acronym_clean[1:]
            full_pattern = (
                r'\b' + re.escape(first) + '[a-z]+'
                + ''.join(
                    self._CONNECTOR_PATTERN + r'\s+' + re.escape(letter) + '[a-z]+'
                    for letter in rest
                )
                + r'\b'
            )

            for match in re.finditer(full_pattern, content, re.IGNORECASE):
                candidate = match.group(0)

                # Reject phrases that read like running sentence text.
                candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                if any(word in self._SENTENCE_WORDS for word in candidate_words):
                    continue

                # Verify: the initials of the non-filler words must spell
                # the acronym exactly.
                words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                meaningful_words = [w for w in words if w.lower() not in self._FILLER_WORDS]
                if len(meaningful_words) < len(acronym_clean):
                    continue

                first_letters = ''.join(
                    w[0].upper() for w in meaningful_words[:len(acronym_clean)]
                )
                if first_letters == acronym_clean:
                    # The match may span line breaks; collapse whitespace so
                    # the phrase can be embedded in a one-line message.
                    full_forms[acronym] = ' '.join(candidate.split())
                    break  # Found a match, use the first one

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their positions.

        Args:
            content: Text to search.

        Returns:
            Mapping from acronym (plural ``s`` removed) to the start offset
            of its FIRST defining match. Keeping the first occurrence
            matters: a repeated definition later in the text must not make
            the uses in between look like uses before definition.
        """
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # Group 2 belongs to "Full Name (ABC)", group 3 to "(ABC; Full Name)".
            acronym = match.group(2) or match.group(3)
            if not acronym:
                continue
            # finditer yields matches in ascending position, so setdefault
            # retains the earliest definition.
            definitions.setdefault(acronym.rstrip('s'), match.start())

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts.

        Args:
            content: Text to search.

        Returns:
            Mapping from acronym (plural ``s`` removed) to the ascending
            list of offsets where it occurs outside the contexts rejected
            by ``_is_in_special_context``.
        """
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            token = match.group(1)
            pos = match.start()

            # Pass the token exactly as written (including a plural "s") so
            # the context check looks at the text right after the token;
            # otherwise "(LLMs)" would not be recognised as parenthesised.
            if self._is_in_special_context(content, pos, token):
                continue

            usages[token.rstrip('s')].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a special context that should be ignored.

        Only a window of up to 50 characters on each side of the token is
        examined, so every test below is a local heuristic.

        Args:
            content: Full text.
            pos: Offset in ``content`` where the token starts.
            acronym: The token text at ``pos``; only its length is used, to
                locate the text that follows the token.

        Returns:
            True when the occurrence should not count as a usage.
        """
        # Get surrounding context
        start = max(0, pos - 50)
        end = min(len(content), pos + len(acronym) + 50)
        before = content[start:pos]
        after = content[pos + len(acronym):end]

        # Skip if inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True

        # Skip if it is a LaTeX command name: \ACRONYM
        if before.rstrip().endswith('\\'):
            return True

        # Skip if inside label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Skip if inside ref: \ref{...:ACRONYM...}
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Skip if inside URL: \url{...ACRONYM...} or http://...ACRONYM...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Skip if inside math mode (simple heuristic): an odd number of
        # unescaped $ signs inside the window means an open inline formula.
        dollar_count = before.count('$') - before.count(r'\$')
        if dollar_count % 2 == 1:
            return True

        # Skip if inside \begin{equation} or similar with no matching \end
        # in the inspected text.
        math_envs = r'\{(?:equation|align|gather|math|displaymath)\*?\}'
        if re.search(r'\\begin' + math_envs, before[-100:]):
            if not re.search(r'\\end' + math_envs, before[-100:]):
                return True

        # Skip if it looks like a LaTeX command argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # Skip if part of a file path or extension. Only the whitespace-free
        # run directly attached to the token is inspected, so a sentence
        # ending ("... done. ABC is") or "and/or ABC" does not hide a real
        # usage.
        attached = re.search(r'\S*$', before).group(0)
        if '.' in attached[-5:] or '/' in attached[-10:]:
            return True

        return False