Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /checkers /acronym_checker.py

thinkwee

Fix: Enhance acronym detection by supporting `~` in definitions and reducing false positives for undefined or pre-defined usages on the same line.

4a7a641 24 days ago

raw

history blame contribute delete

14.7 kB

	"""
	Acronym and abbreviation checker.

	Validates that:
	- Acronyms found in text have corresponding full forms defined
	- Acronyms are used after their definition
	- Only checks acronyms that have matching full forms in the document
	"""
	import re
	from typing import List, Dict, Set, Tuple
	from collections import defaultdict

	from .base import BaseChecker, CheckResult, CheckSeverity


	class AcronymChecker(BaseChecker):
	"""
	Check acronym definitions and consistent usage.

	Reports acronyms (three or more capital letters) that are used without a
	"Full Name (ACRONYM)" style definition, or that are used before that
	definition. Acronyms listed in COMMON_ACRONYMS, and acronyms for which no
	candidate full form is found in the document, are not reported.
	"""

	# Identity strings for this checker; presumably consumed by the
	# BaseChecker framework -- TODO confirm against the base class.
	name = "acronym"
	display_name = "Acronyms"
	description = "Check acronym definitions and consistent usage"

	# Matches acronym definitions, tolerating LaTeX formatting around the acronym:
	#   "Full Name (ABC)", "Full Name~(\textbf{ABC})" -> group 1 = full name, group 2 = acronym
	#   "(ABC; Full Name)", "(\emph{ABC}; Full Name)" -> group 3 = acronym, group 4 = full name
	# The parentheses are escaped literals and the formatting commands are a real
	# alternation; written as `$` / `\|` the pattern can never match a definition.
	DEFINITION_PATTERN = re.compile(
	    r'([A-Z][a-zA-Z\s\-]+)[\s~]*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'
	    r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',
	    re.MULTILINE
	)

	# Standalone acronyms: 3+ capital letters, optionally followed by a plural "s"
	ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

	# Well-known acronyms that are accepted without a definition in the document.
	# Entries are sorted within each category and each literal appears once; an
	# acronym that fits several categories is listed under the first one only.
	COMMON_ACRONYMS = {
	    # Hardware, networking and general computing
	    'API', 'BIOS', 'CLI', 'CPU', 'DNS', 'FTP', 'GPU', 'GUI', 'HDD', 'HTTP',
	    'HTTPS', 'IDE', 'IP', 'LAN', 'OS', 'RAM', 'ROM', 'SDK', 'SSD', 'SSH',
	    'SSL', 'TCP', 'TLS', 'TPU', 'URI', 'URL', 'USB', 'VPN', 'WAN',

	    # File formats and standards
	    'ASCII', 'CSS', 'CSV', 'GIF', 'HTML', 'JPEG', 'JSON', 'MP3', 'MP4',
	    'PDF', 'PNG', 'SQL', 'SVG', 'TSV', 'UTF', 'XML', 'YAML', 'ZIP',

	    # General AI / machine learning
	    'ADAGRAD', 'ADAM', 'AI', 'ANN', 'CNN', 'DL', 'DNN', 'EM', 'GAN', 'GRU',
	    'ICA', 'KNN', 'LBFGS', 'LDA', 'LSTM', 'ML', 'MLP', 'NN', 'PCA',
	    'RMSPROP', 'RNN', 'SGD', 'SVM', 'VAE',

	    # NLP and language models
	    'ASR', 'BART', 'BERT', 'BLEU', 'CER', 'CIDEr', 'ELECTRA', 'GPT', 'LLM',
	    'METEOR', 'MT', 'NER', 'NLP', 'NMT', 'POS', 'QA', 'ROBERTA', 'ROUGE',
	    'SMT', 'SPICE', 'T5', 'TTS', 'WER',

	    # Computer vision
	    'CV', 'FCN', 'HOG', 'HSV', 'OCR', 'ORB', 'PSNR', 'RCNN', 'RESNET',
	    'RGB', 'SIFT', 'SSIM', 'SURF', 'UNET', 'VGG', 'YOLO', 'YUV',

	    # Reinforcement learning
	    'A3C', 'DDPG', 'DQN', 'MDP', 'POMDP', 'PPO', 'RL', 'RLAIF', 'RLHF',
	    'SAC', 'TD3', 'TRPO',

	    # Metrics and evaluation
	    'AP', 'AUC', 'F1', 'FLOPS', 'FLOPs', 'FPS', 'IoU', 'MAE', 'MAP', 'MAPE',
	    'MRR', 'MSE', 'NDCG', 'PR', 'R2', 'RMSE', 'ROC', 'mAP',

	    # Data and statistics
	    'CRF', 'ELBO', 'EMD', 'GMM', 'HMM', 'IID', 'JS', 'KL', 'MCMC', 'MLE',
	    'MMD', 'MRF', 'OOD', 'VI',

	    # Academic venues and organizations
	    'AAAI', 'ACL', 'ACM', 'AISTATS', 'ALT', 'CIKM', 'COLING', 'COLT',
	    'CVPR', 'EACL', 'ECCV', 'EMNLP', 'ICCV', 'ICDE', 'ICLR', 'ICML', 'IEEE',
	    'IJCAI', 'KDD', 'NAACL', 'NEURIPS', 'NIPS', 'SIGIR', 'SIGMOD', 'UAI',
	    'VLDB', 'WSDM', 'WWW',

	    # Methods and techniques common in ML papers
	    'CoT', 'DPO', 'E2E', 'FSL', 'GoT', 'ICL', 'LIMA', 'LORA', 'ORPO',
	    'PEFT', 'QLORA', 'RAFT', 'RAG', 'ReST', 'RRHF', 'SFT', 'SOTA', 'SPELL',
	    'SPIN', 'STaR', 'ToT', 'URPO', 'ZSL',

	    # Miscellaneous
	    'AES', 'AJAX', 'ASIC', 'AWS', 'CD', 'CI', 'CRUD', 'DevOps', 'DOM', 'EU',
	    'EUR', 'FPGA', 'GBP', 'GCP', 'ID', 'JWT', 'MD5', 'MVC', 'NASA', 'NATO',
	    'NPU', 'OOP', 'REST', 'RPC', 'RSA', 'SHA', 'SOAP', 'UID', 'UK', 'UN',
	    'US', 'USD', 'UUID',
	}

	def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
	    """
	    Report acronyms that are used without, or ahead of, their definition.

	    Args:
	        tex_content: LaTeX source text to analyse.
	        config: Accepted for interface compatibility; not read by this method.

	    Returns:
	        A list of WARNING results, at most one per acronym. Acronyms in
	        COMMON_ACRONYMS and acronyms without a candidate full form in the
	        document are never reported.
	    """
	    content = self._remove_comments(tex_content)
	    definitions = self._find_definitions(content)
	    usages = self._find_all_usages(content)
	    full_forms = self._find_potential_full_forms(content, usages.keys())

	    findings = []
	    for acronym, positions in usages.items():
	        # Only acronyms that are uncommon AND have a plausible expansion
	        # somewhere in the document are worth checking.
	        if acronym in self.COMMON_ACRONYMS or acronym not in full_forms:
	            continue

	        if acronym in definitions:
	            def_pos = definitions[acronym]
	            def_line = self._find_line_number(content, def_pos)

	            for pos in positions:
	                if pos >= def_pos:
	                    continue
	                use_line = self._find_line_number(content, pos)
	                # A usage sharing the definition's line is treated as part
	                # of the definition itself rather than a premature use.
	                if use_line == def_line:
	                    continue
	                findings.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.WARNING,
	                    message=f"Acronym '{acronym}' used before definition",
	                    line_number=use_line,
	                    suggestion="Move definition before first use"
	                ))
	                break  # one warning per acronym is enough
	            continue

	        # No formal definition was matched: inspect the first usage.
	        first_line = self._find_line_number(content, positions[0])
	        expansion = full_forms[acronym]

	        # Loose definition check: if the expansion appears (case-insensitively)
	        # on the same line as the first usage, accept it as defined there.
	        first_line_text = self._get_line_content(content, first_line)
	        if expansion.lower() in first_line_text.lower():
	            continue

	        findings.append(self._create_result(
	            passed=False,
	            severity=CheckSeverity.WARNING,
	            message=f"Acronym '{acronym}' used without definition (possible full form: '{expansion}')",
	            line_number=first_line,
	            suggestion=f"Define on first use: '{expansion} ({acronym})'"
	        ))

	    return findings

	def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
	"""Find potential full forms for acronyms by matching capital letters."""
	full_forms = {}

	for acronym in acronyms:
	if acronym in self.COMMON_ACRONYMS:
	continue

	# Build regex pattern to match full form
	# For "ABC", match words starting with A, B, C
	acronym_clean = acronym.rstrip('s') # Remove plural
	if len(acronym_clean) < 3:
	continue

	# Create pattern: match sequence of words where first letters spell the acronym
	# Allow optional words in between (like "of", "the", "and")
	pattern_parts = []
	for i, letter in enumerate(acronym_clean):
	if i == 0:
	# First word must start with the letter
	pattern_parts.append(f'{letter}[a-z]+')
	else:
	# Subsequent words: allow optional filler words
	pattern_parts.append(f'(?:\\s+(?:of\|the\|and\|for\|in\|on\|with\|to)\\s+)?\\s+{letter}[a-z]+')

	full_pattern = r'\b' + ''.join(pattern_parts) + r'\b'

	try:
	matches = re.finditer(full_pattern, content, re.IGNORECASE)
	for match in matches:
	candidate = match.group(0)

	# Skip if candidate contains common non-content words
	# These words indicate the match is part of a sentence, not an acronym full form
	excluded_words = {
	'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
	'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
	'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
	'can', 'could', 'may', 'might', 'must', 'shall',
	'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
	'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
	'more', 'most', 'less', 'few', 'several', 'other', 'another'
	}

	candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
	if any(word in excluded_words for word in candidate_words):
	continue

	# Verify: extract first letters and check if they match acronym
	words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
	# Filter out filler words (allowed in between but not counted)
	filler_words = {'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'}
	meaningful_words = [w for w in words if w.lower() not in filler_words]

	if len(meaningful_words) >= len(acronym_clean):
	first_letters = ''.join(w[0].upper() for w in meaningful_words[:len(acronym_clean)])
	if first_letters == acronym_clean:
	full_forms[acronym] = candidate
	break # Found a match, use the first one
	except re.error:
	# Invalid regex, skip this acronym
	continue

	return full_forms

	def _find_definitions(self, content: str) -> Dict[str, int]:
	"""Find all acronym definitions and their positions."""
	definitions = {}

	for match in self.DEFINITION_PATTERN.finditer(content):
	# Get acronym from either pattern
	acronym = match.group(2) or match.group(3)
	if acronym:
	acronym = acronym.rstrip('s') # Remove plural
	definitions[acronym] = match.start()

	return definitions

	def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
	"""Find all acronym usages, excluding special contexts."""
	usages = defaultdict(list)

	for match in self.ACRONYM_PATTERN.finditer(content):
	acronym = match.group(1).rstrip('s')
	pos = match.start()

	# Skip if in special context
	if self._is_in_special_context(content, pos, acronym):
	continue

	usages[acronym].append(pos)

	return usages

	def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
	"""Check if acronym at position is in a special context that should be ignored."""
	# Get surrounding context
	start = max(0, pos - 50)
	end = min(len(content), pos + len(acronym) + 50)
	before = content[start:pos]
	after = content[pos + len(acronym):end]

	# Skip if inside definition parentheses: (ACRONYM)
	if before.endswith('(') and after.startswith(')'):
	return True

	# Skip if inside LaTeX command: \ACRONYM or \command{ACRONYM}
	if before.rstrip().endswith('\\'):
	return True

	# Skip if inside label: \label{...:ACRONYM...}
	if r'\label{' in before[-20:] and '}' in after[:20]:
	return True

	# Skip if inside ref: \ref{...:ACRONYM...}
	if re.search(r'\\(?:ref\|cite\|autoref\|cref\|eqref)\{[^}]*$', before[-30:]):
	return True

	# Skip if inside URL: \url{...ACRONYM...} or http://...ACRONYM...
	if r'\url{' in before[-20:] or 'http' in before[-20:]:
	return True

	# Skip if inside math mode (simple heuristic)
	# Count $ signs before position
	dollar_count = before.count('$') - before.count(r'\$')
	if dollar_count % 2 == 1: # Odd number means we're inside math mode
	return True

	# Skip if inside \begin{equation} or similar
	if re.search(r'\\begin\{(?:equation\|align\|gather\|math\|displaymath)\*?\}', before[-100:]):
	if not re.search(r'\\end\{(?:equation\|align\|gather\|math\|displaymath)\*?\}', before[-100:]):
	return True

	# Skip if it looks like a LaTeX command argument: \command[ACRONYM]
	if before.endswith('[') and after.startswith(']'):
	return True

	# Skip if part of a file path or extension
	if '.' in before[-5:] or '/' in before[-10:]:
	return True

	return False