thinkwee
Fix: Enhance acronym detection by supporting `~` in definitions and reducing false positives for undefined or pre-defined usages on the same line.
4a7a641
| """ | |
| Acronym and abbreviation checker. | |
| Validates that: | |
| - Acronyms found in text have corresponding full forms defined | |
| - Acronyms are used after their definition | |
| - Only checks acronyms that have matching full forms in the document | |
| """ | |
| import re | |
| from typing import List, Dict, Set, Tuple | |
| from collections import defaultdict | |
| from .base import BaseChecker, CheckResult, CheckSeverity | |
class AcronymChecker(BaseChecker):
    """Check acronym definitions and consistency.

    An acronym is a run of three or more capital letters, optionally followed
    by a plural ``s``. An acronym is only reported when a plausible full form
    (a word sequence whose initials spell it) appears in the document, and it
    is either never defined or used on a line before its definition.
    """

    name = "acronym"
    display_name = "Acronyms"
    description = "Check acronym definitions and consistent usage"

    # Enhanced pattern to find defined acronyms with LaTeX formatting support.
    # Matches: "Full Name (ACRONYM)", "(ACRONYM; Full Name)",
    # "Full Name (\textbf{ACRONYM})", etc.
    # Groups 1/2 belong to the first alternative, groups 3/4 to the second.
    DEFINITION_PATTERN = re.compile(
        r'([A-Z][a-zA-Z\s\-]+)[\s~]*\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?\)|'  # Full Name (ABC) or Full Name (\textbf{ABC}) with optional ~
        r'\((?:\\(?:textbf|emph|textit|texttt)\{)?([A-Z]{3,}s?)(?:\})?;\s*([A-Za-z\s\-]+)\)',  # (ABC; Full Name) or (\textbf{ABC}; Full Name)
        re.MULTILINE
    )

    # Pattern to find standalone acronyms (3+ capital letters, optional plural "s")
    ACRONYM_PATTERN = re.compile(r'\b([A-Z]{3,}s?)\b')

    # Comprehensive list of common acronyms that don't need definition.
    # Each entry is listed once, under the first category it belongs to.
    COMMON_ACRONYMS = {
        # Hardware & Computing
        'GPU', 'CPU', 'TPU', 'RAM', 'ROM', 'SSD', 'HDD', 'USB', 'BIOS', 'OS',
        'API', 'SDK', 'IDE', 'GUI', 'CLI', 'URL', 'URI', 'DNS', 'IP', 'TCP',
        'HTTP', 'HTTPS', 'FTP', 'SSH', 'SSL', 'TLS', 'VPN', 'LAN', 'WAN',
        # File Formats & Standards
        'PDF', 'HTML', 'CSS', 'XML', 'JSON', 'YAML', 'CSV', 'TSV', 'SQL',
        'UTF', 'ASCII', 'JPEG', 'PNG', 'GIF', 'SVG', 'MP3', 'MP4', 'ZIP',
        # AI & Machine Learning (General)
        'AI', 'ML', 'DL', 'NN', 'ANN', 'DNN', 'CNN', 'RNN', 'LSTM', 'GRU',
        'GAN', 'VAE', 'MLP', 'SVM', 'KNN', 'PCA', 'ICA', 'LDA', 'EM',
        'SGD', 'ADAM', 'RMSPROP', 'ADAGRAD', 'LBFGS',
        # NLP & Language Models
        'NLP', 'LLM', 'GPT', 'BERT', 'BART', 'T5', 'ELECTRA', 'ROBERTA',
        'NER', 'POS', 'QA', 'MT', 'ASR', 'TTS', 'NMT', 'SMT',
        'BLEU', 'ROUGE', 'METEOR', 'CIDEr', 'SPICE', 'WER', 'CER',
        # Computer Vision
        'CV', 'OCR', 'YOLO', 'RCNN', 'FCN', 'UNET', 'RESNET', 'VGG',
        'RGB', 'HSV', 'YUV', 'SIFT', 'SURF', 'ORB', 'HOG', 'SSIM', 'PSNR',
        # Reinforcement Learning
        'RL', 'DQN', 'DDPG', 'PPO', 'A3C', 'TRPO', 'SAC', 'TD3', 'MDP',
        'POMDP', 'RLHF', 'RLAIF',
        # Metrics & Evaluation
        'F1', 'AUC', 'ROC', 'PR', 'MAP', 'NDCG', 'MRR', 'MSE', 'MAE', 'RMSE',
        'MAPE', 'R2', 'IoU', 'AP', 'mAP', 'FPS', 'FLOPs', 'FLOPS',
        # Data & Statistics
        'IID', 'OOD', 'KL', 'JS', 'EMD', 'MMD', 'ELBO', 'VI', 'MCMC',
        'MLE', 'GMM', 'HMM', 'CRF', 'MRF',
        # Academic & Organizations
        'IEEE', 'ACM', 'AAAI', 'IJCAI', 'ICML', 'ICLR', 'NEURIPS', 'NIPS',
        'ACL', 'EMNLP', 'NAACL', 'COLING', 'EACL', 'CVPR', 'ICCV', 'ECCV',
        'SIGIR', 'KDD', 'WWW', 'CIKM', 'WSDM', 'ICDE', 'VLDB', 'SIGMOD',
        'AISTATS', 'UAI', 'COLT', 'ALT',
        # Methods & Techniques (Common in ML papers)
        'SOTA', 'E2E', 'RAG', 'CoT', 'ToT', 'GoT', 'ICL', 'FSL', 'ZSL',
        'PEFT', 'LORA', 'QLORA', 'SFT', 'DPO', 'SPIN', 'URPO', 'SPELL',
        'STaR', 'ReST', 'RRHF', 'RAFT', 'LIMA', 'ORPO',
        # Misc
        'USD', 'EUR', 'GBP', 'EU', 'US', 'UK', 'UN', 'NATO', 'NASA',
        'ID', 'UID', 'UUID', 'MD5', 'SHA', 'AES', 'RSA', 'JWT',
        'CRUD', 'REST', 'SOAP', 'RPC', 'AJAX', 'DOM', 'OOP', 'MVC',
        'CI', 'CD', 'DevOps', 'AWS', 'GCP', 'NPU', 'ASIC', 'FPGA',
    }

    # Words that indicate a candidate is ordinary sentence text rather than
    # the expansion of an acronym. Hoisted so it is built once, not per match.
    _EXCLUDED_WORDS = frozenset({
        'that', 'the', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'or', 'not', 'no', 'yes', 'if', 'but', 'as', 'at', 'by', 'from',
        'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
        'can', 'could', 'may', 'might', 'must', 'shall',
        'this', 'these', 'those', 'such', 'which', 'what', 'who', 'when', 'where',
        'how', 'why', 'all', 'each', 'every', 'some', 'any', 'many', 'much',
        'more', 'most', 'less', 'few', 'several', 'other', 'another',
    })

    # Connecting words that may sit between the words of a full form but do
    # not contribute a letter to the acronym.
    _FILLER_WORDS = frozenset({'of', 'and', 'for', 'in', 'on', 'with', 'to', 'a', 'an'})

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Report acronyms that are undefined or used before their definition.

        Args:
            tex_content: Raw LaTeX source; comments are stripped before analysis.
            config: Accepted for interface compatibility; not read by this method.

        Returns:
            One WARNING result per offending acronym (at most one per acronym).
            Acronyms in ``COMMON_ACRONYMS`` and acronyms without a detectable
            full form in the document are never reported.
        """
        results = []

        # Comment stripping and the line/result helpers used below are not
        # defined in this class; they are expected to come from BaseChecker.
        content = self._remove_comments(tex_content)

        defined_acronyms = self._find_definitions(content)
        all_usages = self._find_all_usages(content)
        acronym_full_forms = self._find_potential_full_forms(content, all_usages.keys())

        for acronym, positions in all_usages.items():
            if acronym in self.COMMON_ACRONYMS:
                continue

            # Without a matching full form in the document we cannot suggest a
            # definition, so the acronym is not checked at all.
            if acronym not in acronym_full_forms:
                continue

            if acronym not in defined_acronyms:
                # No explicit definition anywhere: the first usage should define it.
                line_num = self._find_line_number(content, positions[0])
                full_form = acronym_full_forms[acronym]

                # Loose definition check: if the full form sits on the same
                # line as the first usage (in any layout DEFINITION_PATTERN
                # missed), treat the acronym as defined there.
                line_content = self._get_line_content(content, line_num)
                if full_form.lower() in line_content.lower():
                    continue

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Acronym '{acronym}' used without definition (possible full form: '{full_form}')",
                    line_number=line_num,
                    suggestion=f"Define on first use: '{full_form} ({acronym})'"
                ))
            else:
                def_pos = defined_acronyms[acronym]
                def_line_num = self._find_line_number(content, def_pos)

                for pos in positions:
                    if pos >= def_pos:
                        continue
                    line_num = self._find_line_number(content, pos)

                    # A usage on the definition's own line is tolerated: it is
                    # most likely part of the defining sentence itself.
                    if line_num == def_line_num:
                        continue

                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.WARNING,
                        message=f"Acronym '{acronym}' used before definition",
                        line_number=line_num,
                        suggestion="Move definition before first use"
                    ))
                    break  # report each acronym only once

        return results

    def _find_potential_full_forms(self, content: str, acronyms: Set[str]) -> Dict[str, str]:
        """Find potential full forms for acronyms by matching word initials.

        For every acronym (minus a plural ``s``) the document is searched,
        case-insensitively, for a sequence of whitespace-separated words whose
        initials spell the acronym. One connecting word ("of", "and", ...) is
        allowed before each word after the first.

        Args:
            content: Comment-free document text.
            acronyms: Acronyms to look up; common and <3-letter ones are skipped.

        Returns:
            Mapping of acronym to the first accepted candidate full form.
            Acronyms without an accepted candidate are absent.
        """
        full_forms = {}

        for acronym in acronyms:
            if acronym in self.COMMON_ACRONYMS:
                continue

            acronym_clean = acronym.rstrip('s')  # Remove plural
            if len(acronym_clean) < 3:
                continue

            first_letter, *other_letters = acronym_clean
            pattern_parts = [f'{re.escape(first_letter)}[a-z]+']
            # The optional filler must NOT swallow the whitespace that follows
            # it: the mandatory \s+ in front of the next word needs it.
            # Otherwise "Chain of Thought" only matches when "of" is followed
            # by two or more whitespace characters.
            pattern_parts.extend(
                rf'(?:\s+(?:of|the|and|for|in|on|with|to))?\s+{re.escape(letter)}[a-z]+'
                for letter in other_letters
            )
            full_pattern = re.compile(r'\b' + ''.join(pattern_parts) + r'\b', re.IGNORECASE)

            for match in full_pattern.finditer(content):
                candidate = match.group(0)

                # A stop word means the match is ordinary sentence text.
                candidate_words = re.findall(r'\b[A-Za-z]+\b', candidate.lower())
                if not self._EXCLUDED_WORDS.isdisjoint(candidate_words):
                    continue

                # Verify: initials of the non-filler words must spell the acronym.
                words = re.findall(r'\b[A-Z][a-z]+', candidate, re.IGNORECASE)
                meaningful_words = [w for w in words if w.lower() not in self._FILLER_WORDS]
                if len(meaningful_words) < len(acronym_clean):
                    continue

                first_letters = ''.join(
                    w[0].upper() for w in meaningful_words[:len(acronym_clean)]
                )
                if first_letters == acronym_clean:
                    full_forms[acronym] = candidate
                    break  # Found a match, use the first one

        return full_forms

    def _find_definitions(self, content: str) -> Dict[str, int]:
        """Find all acronym definitions and their positions.

        Returns:
            Mapping of singular acronym to the start offset of its FIRST
            definition match. Later re-definitions must not move the position,
            or usages between two definitions would be reported as "used
            before definition".
        """
        definitions = {}

        for match in self.DEFINITION_PATTERN.finditer(content):
            # Group 2 is set for "Full Name (ABC)", group 3 for "(ABC; Full Name)".
            acronym = match.group(2) or match.group(3)
            if acronym:
                # finditer scans left to right, so the first entry stored for
                # an acronym is its earliest definition.
                definitions.setdefault(acronym.rstrip('s'), match.start())

        return definitions

    def _find_all_usages(self, content: str) -> Dict[str, List[int]]:
        """Find all acronym usages, excluding special contexts.

        Returns:
            Mapping of singular acronym to the ascending start offsets of its
            usages. Occurrences rejected by ``_is_in_special_context`` are
            omitted.
        """
        usages = defaultdict(list)

        for match in self.ACRONYM_PATTERN.finditer(content):
            token = match.group(1)
            pos = match.start()

            # Pass the token as written (plural "s" included) so the context
            # check measures the text after the whole token, e.g. "(LLMs)".
            if self._is_in_special_context(content, pos, token):
                continue

            usages[token.rstrip('s')].append(pos)

        return usages

    def _is_in_special_context(self, content: str, pos: int, acronym: str) -> bool:
        """Check if acronym at position is in a special context that should be ignored.

        Args:
            content: Document text.
            pos: Start offset of the acronym token in ``content``.
            acronym: The token exactly as it appears at ``pos`` (including any
                plural ``s``); its length locates the text after the token.

        Returns:
            True when the token looks like part of a definition, LaTeX command,
            label/reference, URL, math, optional argument or file path. All
            checks are heuristics over a window of up to 50 characters on each
            side of the token.
        """
        token_end = pos + len(acronym)
        before = content[max(0, pos - 50):pos]
        after = content[token_end:min(len(content), token_end + 50)]

        # Inside definition parentheses: (ACRONYM)
        if before.endswith('(') and after.startswith(')'):
            return True

        # Name of a LaTeX command: \ACRONYM (whitespace after the backslash tolerated)
        if before.rstrip().endswith('\\'):
            return True

        # Inside label: \label{...:ACRONYM...}
        if r'\label{' in before[-20:] and '}' in after[:20]:
            return True

        # Inside an unclosed reference/citation argument: \ref{...ACRONYM
        if re.search(r'\\(?:ref|cite|autoref|cref|eqref)\{[^}]*$', before[-30:]):
            return True

        # Inside URL: \url{...ACRONYM...} or http://...ACRONYM...
        if r'\url{' in before[-20:] or 'http' in before[-20:]:
            return True

        # Inline math (simple heuristic): an odd number of unescaped "$" in
        # the window means a math span is still open.
        unescaped_dollars = before.count('$') - before.count(r'\$')
        if unescaped_dollars % 2 == 1:
            return True

        # Display math: a \begin{...} in the window that has no \end{...}.
        math_env = r'\{(?:equation|align|gather|math|displaymath)\*?\}'
        if re.search(r'\\begin' + math_env, before) and not re.search(r'\\end' + math_env, before):
            return True

        # Optional LaTeX command argument: \command[ACRONYM]
        if before.endswith('[') and after.startswith(']'):
            return True

        # File path or extension: only the whitespace-free run attached to the
        # token counts ("dir/sub/ABC", "file.PDF"). A period or slash that is
        # merely nearby, such as the end of the previous sentence, must not
        # hide a real usage.
        attached_prefix = re.search(r'\S*$', before).group(0)
        if attached_prefix.endswith('.') or '/' in attached_prefix:
            return True

        return False