Spaces:

thinkwee
/

BibGuard

Running

File size: 9,859 Bytes

46df5f0

"""
Terminology consistency checker.

Validates:
- Consistent spelling of the same term
- Consistent hyphenation
- Consistent capitalization of technical terms
"""
import re
from typing import List, Dict, Set
from collections import defaultdict

from .base import BaseChecker, CheckResult, CheckSeverity


class ConsistencyChecker(BaseChecker):
    """Check terminology and spelling consistency in LaTeX source.

    Three independent passes are run over the comment-stripped text:

    1. Known variants: for every entry of ``KNOWN_VARIANTS``, report when
       more than one spelling of the same term occurs (WARNING).
    2. Hyphenation: report hyphenated words that also occur written with
       spaces or as a single word (INFO).
    3. Capitalization: report ``CAPITALIZATION_TERMS`` that occur with
       more than one casing (INFO).

    Results are built with ``self._create_result``, which is expected to be
    provided by ``BaseChecker`` (defined in ``.base``, not shown here).
    """

    name = "consistency"
    display_name = "Consistency"
    description = "Check for inconsistent terminology and spelling"

    # Known variant pairs (canonical -> variants)
    KNOWN_VARIANTS = {
        # Hyphenation variants
        'self-supervised': ['self supervised', 'selfsupervised'],
        'pre-trained': ['pre trained', 'pretrained'],
        'fine-tuned': ['fine tuned', 'finetuned'],
        'state-of-the-art': ['state of the art', 'stateoftheart'],
        'real-world': ['real world', 'realworld'],
        'end-to-end': ['end to end', 'endtoend', 'e2e'],
        'large-scale': ['large scale', 'largescale'],
        'long-term': ['long term', 'longterm'],
        'short-term': ['short term', 'shortterm'],
        'multi-task': ['multi task', 'multitask'],
        'multi-modal': ['multi modal', 'multimodal'],
        'cross-lingual': ['cross lingual', 'crosslingual'],
        'zero-shot': ['zero shot', 'zeroshot'],
        'few-shot': ['few shot', 'fewshot'],
        'in-context': ['in context', 'incontext'],

        # American vs British English (comprehensive list)
        # -or/-our endings
        'color': ['colour'],
        'behavior': ['behaviour'],
        'favor': ['favour'],
        'honor': ['honour'],
        'labor': ['labour'],
        'neighbor': ['neighbour'],
        'rumor': ['rumour'],
        'vapor': ['vapour'],

        # -ize/-ise endings
        'analyze': ['analyse'],
        'characterize': ['characterise'],
        'generalize': ['generalise'],
        'initialize': ['initialise'],
        'maximize': ['maximise'],
        'minimize': ['minimise'],
        'normalize': ['normalise'],
        'optimize': ['optimise'],
        'organize': ['organise'],
        'realize': ['realise'],
        'recognize': ['recognise'],
        'specialize': ['specialise'],
        'standardize': ['standardise'],
        'summarize': ['summarise'],
        'utilize': ['utilise'],
        'visualize': ['visualise'],
        'categorize': ['categorise'],
        'emphasize': ['emphasise'],
        'hypothesize': ['hypothesise'],
        'prioritize': ['prioritise'],
        'synchronize': ['synchronise'],

        # -ization/-isation endings
        'generalization': ['generalisation'],
        'initialization': ['initialisation'],
        'maximization': ['maximisation'],
        'minimization': ['minimisation'],
        'normalization': ['normalisation'],
        'optimization': ['optimisation'],
        'organization': ['organisation'],
        'realization': ['realisation'],
        'regularization': ['regularisation'],
        'specialization': ['specialisation'],
        'standardization': ['standardisation'],
        'summarization': ['summarisation'],
        'utilization': ['utilisation'],
        'visualization': ['visualisation'],
        'categorization': ['categorisation'],
        'characterization': ['characterisation'],
        'parametrization': ['parametrisation'],
        'quantization': ['quantisation'],

        # -er/-re endings
        'center': ['centre'],
        'fiber': ['fibre'],
        'meter': ['metre'],
        'liter': ['litre'],

        # -l-/-ll- (American single, British double)
        'modeling': ['modelling'],
        'labeled': ['labelled'],
        'labeling': ['labelling'],
        'traveled': ['travelled'],
        'traveling': ['travelling'],
        'canceled': ['cancelled'],
        'canceling': ['cancelling'],
        'signaled': ['signalled'],
        'signaling': ['signalling'],

        # -og/-ogue endings
        'analog': ['analogue'],
        'catalog': ['catalogue'],
        'dialog': ['dialogue'],

        # -ense/-ence endings
        'defense': ['defence'],
        'license': ['licence'],
        'offense': ['offence'],

        # Other common differences
        'gray': ['grey'],
        'artifact': ['artefact'],
        'program': ['programme'],  # Note: 'program' is standard in computing
        'skeptical': ['sceptical'],
        'aluminum': ['aluminium'],

        # Verb forms
        'learned': ['learnt'],
        'burned': ['burnt'],
        'spelled': ['spelt'],

        # Common term variants
        'dataset': ['data set', 'data-set'],
        'benchmark': ['bench mark', 'bench-mark'],
        'baseline': ['base line', 'base-line'],
        'downstream': ['down stream', 'down-stream'],
        'upstream': ['up stream', 'up-stream'],
        'encoder': ['en-coder'],
        'decoder': ['de-coder'],
    }

    # Capitalization variants to track
    CAPITALIZATION_TERMS = [
        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
    ]

    # Compounds that are conventionally hyphenated; the generic hyphenation
    # pass never reports them. Built once here instead of on every call.
    _ALWAYS_HYPHENATED = frozenset({
        'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
        'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
        'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
        'self-supervised', 'self-attention', 'co-training', 'pre-training',
        'post-processing', 'pre-processing', 'well-known', 'well-defined',
        'high-quality', 'low-quality', 'long-term', 'short-term',
    })

    # Pattern for hyphenated words such as "data-driven" or "one-to-many".
    _HYPHENATED_WORD = re.compile(
        r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', re.IGNORECASE
    )

    @staticmethod
    def _term_regex(term: str) -> str:
        """Return a regex source string matching *term* as a whole word.

        Compared to a plain ``\\b<term>\\b``:

        * a match directly after a backslash is rejected, so LaTeX control
          sequences such as ``\\color`` or ``\\center`` are not mistaken for
          prose spellings;
        * whitespace inside a multi-word term matches any whitespace run, so
          a term wrapped across a source line break is still found.
        """
        escaped_words = (re.escape(word) for word in term.split())
        return r'(?<![\\\w])' + r'\s+'.join(escaped_words) + r'(?!\w)'

    @classmethod
    def _contains_term(cls, content: str, term: str) -> bool:
        """Return True if *term* occurs in *content*, ignoring case."""
        return re.search(cls._term_regex(term), content, re.IGNORECASE) is not None

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run all consistency checks on a LaTeX document.

        Args:
            tex_content: Raw LaTeX source. Empty or ``None`` yields no results.
            config: Accepted for interface compatibility; not read here.

        Returns:
            One result per detected inconsistency: known-variant findings
            first, then hyphenation findings, then capitalization findings.
        """
        if not tex_content:
            return []

        # Strip LaTeX comments (an unescaped % up to end of line) so that
        # commented-out text does not trigger findings.
        content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)

        results = []

        # Known variants: report when more than one spelling of a term occurs.
        for canonical, variants in self.KNOWN_VARIANTS.items():
            found_forms = [
                form
                for form in (canonical, *variants)
                if self._contains_term(content, form)
            ]
            if len(found_forms) > 1:
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
                    suggestion=f"Use '{canonical}' consistently throughout"
                ))

        results.extend(self._check_hyphenation_consistency(content))
        results.extend(self._check_capitalization_consistency(content))
        return results

    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
        """Find words that appear both hyphenated and non-hyphenated.

        Each hyphenated word is reported at most once regardless of casing,
        in order of first appearance. Words listed in ``_ALWAYS_HYPHENATED``
        are skipped, as are words already covered by ``KNOWN_VARIANTS``
        (those are reported by the known-variant pass in ``check``).
        """
        results = []

        # Every spelling handled by the known-variant pass.
        known_forms = set(self.KNOWN_VARIANTS)
        for variants in self.KNOWN_VARIANTS.values():
            known_forms.update(variants)

        seen = set()
        for match in self._HYPHENATED_WORD.finditer(content):
            hyph_word = match.group(1)
            key = hyph_word.lower()
            if key in seen:
                continue
            seen.add(key)

            if key in self._ALWAYS_HYPHENATED or key in known_forms:
                continue

            spaced = hyph_word.replace('-', ' ')
            combined = hyph_word.replace('-', '')

            # Prefer reporting the spaced form; fall back to the joined form.
            for alternative in (spaced, combined):
                if self._contains_term(content, alternative):
                    results.append(self._create_result(
                        passed=False,
                        severity=CheckSeverity.INFO,
                        message=f"Inconsistent hyphenation: '{hyph_word}' vs '{alternative}'",
                        suggestion="Choose one form and use it consistently"
                    ))
                    break

        return results

    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
        """Check if technical terms have consistent capitalization.

        For each entry of ``CAPITALIZATION_TERMS``, all case-insensitive
        whole-word occurrences are collected; a finding is emitted when more
        than one distinct casing is present. Forms are listed in sorted order
        so the message is stable between runs.

        NOTE(review): a term capitalized only because it starts a sentence is
        still counted as a distinct casing.
        """
        results = []

        for term in self.CAPITALIZATION_TERMS:
            occurrences = re.findall(self._term_regex(term), content, re.IGNORECASE)
            unique_forms = sorted(set(occurrences))
            if len(unique_forms) > 1:
                forms_str = ', '.join(f"'{form}'" for form in unique_forms)
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message=f"Inconsistent capitalization: {forms_str}",
                    suggestion="Use consistent capitalization for technical terms"
                ))

        return results