|
|
""" |
|
|
Terminology consistency checker. |
|
|
|
|
|
Validates: |
|
|
- Consistent spelling of the same term |
|
|
- Consistent hyphenation |
|
|
- Consistent capitalization of technical terms |
|
|
""" |
|
|
import re |
|
|
from typing import List, Dict, Set |
|
|
from collections import defaultdict |
|
|
|
|
|
from .base import BaseChecker, CheckResult, CheckSeverity |
|
|
|
|
|
|
|
|
class ConsistencyChecker(BaseChecker):
    """Check terminology and spelling consistency in LaTeX source.

    Three independent checks are run over the comment-stripped text:

    1. Known spelling variants (``KNOWN_VARIANTS``): a warning is emitted
       when more than one form of the same term occurs.
    2. Hyphenation: an info result is emitted when a hyphenated compound
       also occurs spaced or joined.
    3. Capitalization (``CAPITALIZATION_TERMS``): an info result is emitted
       when a term occurs with more than one capitalization.

    Results are built with ``self._create_result``, which is not defined in
    this class (presumably inherited from ``BaseChecker``).
    """

    name = "consistency"
    display_name = "Consistency"
    description = "Check for inconsistent terminology and spelling"

    # Maps the preferred (canonical) form to its known alternative forms.
    KNOWN_VARIANTS = {
        # Hyphenated compounds vs. spaced / joined forms
        'self-supervised': ['self supervised', 'selfsupervised'],
        'pre-trained': ['pre trained', 'pretrained'],
        'fine-tuned': ['fine tuned', 'finetuned'],
        'state-of-the-art': ['state of the art', 'stateoftheart'],
        'real-world': ['real world', 'realworld'],
        'end-to-end': ['end to end', 'endtoend', 'e2e'],
        'large-scale': ['large scale', 'largescale'],
        'long-term': ['long term', 'longterm'],
        'short-term': ['short term', 'shortterm'],
        'multi-task': ['multi task', 'multitask'],
        'multi-modal': ['multi modal', 'multimodal'],
        'cross-lingual': ['cross lingual', 'crosslingual'],
        'zero-shot': ['zero shot', 'zeroshot'],
        'few-shot': ['few shot', 'fewshot'],
        'in-context': ['in context', 'incontext'],

        # -or vs. -our
        'color': ['colour'],
        'behavior': ['behaviour'],
        'favor': ['favour'],
        'honor': ['honour'],
        'labor': ['labour'],
        'neighbor': ['neighbour'],
        'rumor': ['rumour'],
        'vapor': ['vapour'],

        # -ize vs. -ise
        'analyze': ['analyse'],
        'characterize': ['characterise'],
        'generalize': ['generalise'],
        'initialize': ['initialise'],
        'maximize': ['maximise'],
        'minimize': ['minimise'],
        'normalize': ['normalise'],
        'optimize': ['optimise'],
        'organize': ['organise'],
        'realize': ['realise'],
        'recognize': ['recognise'],
        'specialize': ['specialise'],
        'standardize': ['standardise'],
        'summarize': ['summarise'],
        'utilize': ['utilise'],
        'visualize': ['visualise'],
        'categorize': ['categorise'],
        'emphasize': ['emphasise'],
        'hypothesize': ['hypothesise'],
        'prioritize': ['prioritise'],
        'synchronize': ['synchronise'],

        # -ization vs. -isation
        'generalization': ['generalisation'],
        'initialization': ['initialisation'],
        'maximization': ['maximisation'],
        'minimization': ['minimisation'],
        'normalization': ['normalisation'],
        'optimization': ['optimisation'],
        'organization': ['organisation'],
        'realization': ['realisation'],
        'regularization': ['regularisation'],
        'specialization': ['specialisation'],
        'standardization': ['standardisation'],
        'summarization': ['summarisation'],
        'utilization': ['utilisation'],
        'visualization': ['visualisation'],
        'categorization': ['categorisation'],
        'characterization': ['characterisation'],
        'parametrization': ['parametrisation'],
        'quantization': ['quantisation'],

        # -er vs. -re
        'center': ['centre'],
        'fiber': ['fibre'],
        'meter': ['metre'],
        'liter': ['litre'],

        # Single vs. doubled 'l'
        'modeling': ['modelling'],
        'labeled': ['labelled'],
        'labeling': ['labelling'],
        'traveled': ['travelled'],
        'traveling': ['travelling'],
        'canceled': ['cancelled'],
        'canceling': ['cancelling'],
        'signaled': ['signalled'],
        'signaling': ['signalling'],

        # -og vs. -ogue
        'analog': ['analogue'],
        'catalog': ['catalogue'],
        'dialog': ['dialogue'],

        # -se vs. -ce
        'defense': ['defence'],
        'license': ['licence'],
        'offense': ['offence'],

        # Miscellaneous spelling pairs
        'gray': ['grey'],
        'artifact': ['artefact'],
        'program': ['programme'],
        'skeptical': ['sceptical'],
        'aluminum': ['aluminium'],

        # -ed vs. -t past forms
        'learned': ['learnt'],
        'burned': ['burnt'],
        'spelled': ['spelt'],

        # Joined compounds vs. spaced / hyphenated forms
        'dataset': ['data set', 'data-set'],
        'benchmark': ['bench mark', 'bench-mark'],
        'baseline': ['base line', 'base-line'],
        'downstream': ['down stream', 'down-stream'],
        'upstream': ['up stream', 'up-stream'],
        'encoder': ['en-coder'],
        'decoder': ['de-coder'],
    }

    # Technical terms whose capitalization is compared across occurrences.
    CAPITALIZATION_TERMS = [
        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
    ]

    # Compounds skipped by the generic hyphenation check (lower-case).
    ALWAYS_HYPHENATED = frozenset({
        'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
        'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
        'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
        'self-supervised', 'self-attention', 'co-training', 'pre-training',
        'post-processing', 'pre-processing', 'well-known', 'well-defined',
        'high-quality', 'low-quality', 'long-term', 'short-term',
    })

    @staticmethod
    def _term_pattern(term: str) -> str:
        """Return a whole-word regex source for *term*.

        Each space in *term* matches any run of whitespace, so a multi-word
        form is still found when the source wraps it across a line break.
        """
        escaped_words = (re.escape(word) for word in term.split())
        return r'\b' + r'\s+'.join(escaped_words) + r'\b'

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all consistency checks on *tex_content*.

        Args:
            tex_content: Raw LaTeX source text.
            config: Not used by this checker; kept in the signature
                (presumably for compatibility with other checkers).

        Returns:
            One result per detected inconsistency; an empty list when none
            are found.
        """
        results = []

        # Drop LaTeX comments: an unescaped '%' through the end of its line.
        content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)

        for canonical, variants in self.KNOWN_VARIANTS.items():
            # Canonical form first, then variants, in declaration order.
            found_forms = [
                form
                for form in (canonical, *variants)
                if re.search(self._term_pattern(form), content, re.IGNORECASE)
            ]

            if len(found_forms) > 1:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
                    suggestion=f"Use '{canonical}' consistently throughout"
                ))

        results.extend(self._check_hyphenation_consistency(content))
        results.extend(self._check_capitalization_consistency(content))

        return results

    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
        """Find words that appear both hyphenated and non-hyphenated.

        Args:
            content: Text to scan (``check`` passes comment-stripped LaTeX).

        Returns:
            At most one info result per distinct hyphenated compound
            (compared case-insensitively), in order of first appearance.
        """
        results = []

        # Key by lower-case so case variants of one compound are reported
        # once; keep the first-seen spelling for the message. A dict (rather
        # than a set) keeps document order, making the output deterministic.
        hyphenated = {}
        for found in re.finditer(
            r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE
        ):
            hyphenated.setdefault(found.group(1).lower(), found.group(1))

        for lowered, hyph_word in hyphenated.items():
            if lowered in self.ALWAYS_HYPHENATED:
                continue

            non_hyph = hyph_word.replace('-', ' ')
            combined = hyph_word.replace('-', '')

            # The spaced form takes precedence; the joined form is only
            # reported when the spaced form is absent.
            if re.search(self._term_pattern(non_hyph), content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
                    suggestion="Choose one form and use it consistently"
                ))
            elif re.search(self._term_pattern(combined), content, re.IGNORECASE):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
                    suggestion="Choose one form and use it consistently"
                ))

        return results

    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
        """Check if technical terms have consistent capitalization.

        Args:
            content: Text to scan (``check`` passes comment-stripped LaTeX).

        Returns:
            One info result per term in ``CAPITALIZATION_TERMS`` that occurs
            with more than one distinct capitalization.
        """
        results = []

        for term in self.CAPITALIZATION_TERMS:
            pattern = re.compile(rf'\b{re.escape(term)}\b', re.IGNORECASE)

            # Order-preserving de-duplication: forms are listed in order of
            # first appearance, so the message is stable between runs.
            unique_forms = list(dict.fromkeys(pattern.findall(content)))

            if len(unique_forms) > 1:
                forms_str = ', '.join(f"'{form}'" for form in unique_forms)
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent capitalization: {forms_str}",
                    suggestion="Use consistent capitalization for technical terms"
                ))

        return results
|
|
|