Spaces:

thinkwee
/

BibGuard

Running

BibGuard / src /checkers /consistency_checker.py

thinkwee

init

46df5f0 17 days ago

9.86 kB

	"""
	Terminology consistency checker.

	Validates:
	- Consistent spelling of the same term
	- Consistent hyphenation
	- Consistent capitalization of technical terms
	"""
	import re
	from typing import List, Dict, Set
	from collections import defaultdict

	from .base import BaseChecker, CheckResult, CheckSeverity


	class ConsistencyChecker(BaseChecker):
	    """Check terminology and spelling consistency.

	    Three independent passes run over the comment-stripped LaTeX source:

	    1. Known variants: report when more than one spelling listed under the
	       same ``KNOWN_VARIANTS`` entry appears in the document.
	    2. Hyphenation: report hyphenated compounds that also appear written
	       with spaces or fused into a single word.
	    3. Capitalization: report terms from ``CAPITALIZATION_TERMS`` that
	       appear with more than one casing.

	    All matching is done with ``\\b`` word boundaries and is
	    case-insensitive. Findings are emitted in a deterministic order so that
	    repeated runs over the same input produce identical reports.
	    """

	    name = "consistency"
	    display_name = "Consistency"
	    description = "Check for inconsistent terminology and spelling"

	    # Known variant pairs (canonical -> variants)
	    KNOWN_VARIANTS = {
	        # Hyphenation variants
	        'self-supervised': ['self supervised', 'selfsupervised'],
	        'pre-trained': ['pre trained', 'pretrained'],
	        'fine-tuned': ['fine tuned', 'finetuned'],
	        'state-of-the-art': ['state of the art', 'stateoftheart'],
	        'real-world': ['real world', 'realworld'],
	        'end-to-end': ['end to end', 'endtoend', 'e2e'],
	        'large-scale': ['large scale', 'largescale'],
	        'long-term': ['long term', 'longterm'],
	        'short-term': ['short term', 'shortterm'],
	        'multi-task': ['multi task', 'multitask'],
	        'multi-modal': ['multi modal', 'multimodal'],
	        'cross-lingual': ['cross lingual', 'crosslingual'],
	        'zero-shot': ['zero shot', 'zeroshot'],
	        'few-shot': ['few shot', 'fewshot'],
	        'in-context': ['in context', 'incontext'],

	        # American vs British English (comprehensive list)
	        # -or/-our endings
	        'color': ['colour'],
	        'behavior': ['behaviour'],
	        'favor': ['favour'],
	        'honor': ['honour'],
	        'labor': ['labour'],
	        'neighbor': ['neighbour'],
	        'rumor': ['rumour'],
	        'vapor': ['vapour'],

	        # -ize/-ise endings
	        'analyze': ['analyse'],
	        'characterize': ['characterise'],
	        'generalize': ['generalise'],
	        'initialize': ['initialise'],
	        'maximize': ['maximise'],
	        'minimize': ['minimise'],
	        'normalize': ['normalise'],
	        'optimize': ['optimise'],
	        'organize': ['organise'],
	        'realize': ['realise'],
	        'recognize': ['recognise'],
	        'specialize': ['specialise'],
	        'standardize': ['standardise'],
	        'summarize': ['summarise'],
	        'utilize': ['utilise'],
	        'visualize': ['visualise'],
	        'categorize': ['categorise'],
	        'emphasize': ['emphasise'],
	        'hypothesize': ['hypothesise'],
	        'prioritize': ['prioritise'],
	        'synchronize': ['synchronise'],

	        # -ization/-isation endings
	        'generalization': ['generalisation'],
	        'initialization': ['initialisation'],
	        'maximization': ['maximisation'],
	        'minimization': ['minimisation'],
	        'normalization': ['normalisation'],
	        'optimization': ['optimisation'],
	        'organization': ['organisation'],
	        'realization': ['realisation'],
	        'regularization': ['regularisation'],
	        'specialization': ['specialisation'],
	        'standardization': ['standardisation'],
	        'summarization': ['summarisation'],
	        'utilization': ['utilisation'],
	        'visualization': ['visualisation'],
	        'categorization': ['categorisation'],
	        'characterization': ['characterisation'],
	        'parametrization': ['parametrisation'],
	        'quantization': ['quantisation'],

	        # -er/-re endings
	        'center': ['centre'],
	        'fiber': ['fibre'],
	        'meter': ['metre'],
	        'liter': ['litre'],

	        # -l-/-ll- (American single, British double)
	        'modeling': ['modelling'],
	        'labeled': ['labelled'],
	        'labeling': ['labelling'],
	        'traveled': ['travelled'],
	        'traveling': ['travelling'],
	        'canceled': ['cancelled'],
	        'canceling': ['cancelling'],
	        'signaled': ['signalled'],
	        'signaling': ['signalling'],

	        # -og/-ogue endings
	        'analog': ['analogue'],
	        'catalog': ['catalogue'],
	        'dialog': ['dialogue'],

	        # -ense/-ence endings
	        'defense': ['defence'],
	        'license': ['licence'],
	        'offense': ['offence'],

	        # Other common differences
	        'gray': ['grey'],
	        'artifact': ['artefact'],
	        'program': ['programme'],  # Note: 'program' is standard in computing
	        'skeptical': ['sceptical'],
	        'aluminum': ['aluminium'],

	        # Verb forms
	        'learned': ['learnt'],
	        'burned': ['burnt'],
	        'spelled': ['spelt'],

	        # Common term variants
	        'dataset': ['data set', 'data-set'],
	        'benchmark': ['bench mark', 'bench-mark'],
	        'baseline': ['base line', 'base-line'],
	        'downstream': ['down stream', 'down-stream'],
	        'upstream': ['up stream', 'up-stream'],
	        'encoder': ['en-coder'],
	        'decoder': ['de-coder'],
	    }

	    # Capitalization variants to track
	    CAPITALIZATION_TERMS = [
	        'transformer', 'attention', 'bert', 'gpt', 'lstm', 'cnn', 'rnn',
	        'encoder', 'decoder', 'embedding', 'softmax', 'sigmoid', 'relu',
	    ]

	    # Compounds that the hyphenation pass never reports, even when a
	    # non-hyphenated spelling is also present. Built once at class creation
	    # instead of on every call.
	    _ALWAYS_HYPHENATED = frozenset({
	        'state-of-the-art', 'end-to-end', 'real-time', 'real-world',
	        'fine-tuning', 'fine-grained', 'large-scale', 'small-scale',
	        'multi-task', 'multi-modal', 'cross-domain', 'cross-lingual',
	        'self-supervised', 'self-attention', 'co-training', 'pre-training',
	        'post-processing', 'pre-processing', 'well-known', 'well-defined',
	        'high-quality', 'low-quality', 'long-term', 'short-term',
	    })

	    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
	        """Run all consistency passes over a LaTeX document.

	        Args:
	            tex_content: Raw LaTeX source text.
	            config: Accepted for interface compatibility; not read by this
	                implementation.

	        Returns:
	            One result per detected inconsistency: known-variant findings
	            first (in ``KNOWN_VARIANTS`` order), then hyphenation findings,
	            then capitalization findings. Empty when nothing is found.
	        """
	        results = []

	        # Strip LaTeX comments: everything from an unescaped '%' to the end
	        # of its line. '\%' is a literal percent sign and is left alone.
	        content = re.sub(r'(?<!\\)%.*$', '', tex_content, flags=re.MULTILINE)

	        # Check for known variant inconsistencies
	        for canonical, variants in self.KNOWN_VARIANTS.items():
	            # Canonical spelling is listed first, followed by whichever
	            # variants occur, in their declared order.
	            found_forms = [
	                form
	                for form in (canonical, *variants)
	                if re.search(rf'\b{re.escape(form)}\b', content, re.IGNORECASE)
	            ]

	            if len(found_forms) > 1:
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.WARNING,
	                    message=f"Inconsistent spelling: {', '.join(found_forms)}",
	                    suggestion=f"Use '{canonical}' consistently throughout"
	                ))

	        # Check hyphenated word consistency
	        results.extend(self._check_hyphenation_consistency(content))

	        # Check capitalization consistency
	        results.extend(self._check_capitalization_consistency(content))

	        return results

	    def _check_hyphenation_consistency(self, content: str) -> List[CheckResult]:
	        """Find words that appear both hyphenated and non-hyphenated.

	        Each hyphenated compound found in ``content`` is compared against
	        its space-separated form and, failing that, its fused form. Compounds
	        listed in ``_ALWAYS_HYPHENATED`` are skipped.

	        Args:
	            content: Comment-stripped document text.

	        Returns:
	            At most one INFO result per distinct compound (compared
	            case-insensitively), in order of first appearance.
	        """
	        results = []

	        # Find all hyphenated words. Keyed by lowercase so that 'Multi-head'
	        # and 'multi-head' yield a single finding rather than two; the first
	        # spelling seen is the one quoted in the message. A dict (rather
	        # than a set) keeps document order, making the report reproducible.
	        hyphenated = {}
	        for word in re.findall(
	            r'\b([a-z]+-[a-z]+(?:-[a-z]+)*)\b', content, re.IGNORECASE
	        ):
	            hyphenated.setdefault(word.lower(), word)

	        for lowered, hyph_word in hyphenated.items():
	            # Skip if it's a known compound that should always be hyphenated
	            if lowered in self._ALWAYS_HYPHENATED:
	                continue

	            # Create non-hyphenated versions
	            non_hyph = hyph_word.replace('-', ' ')
	            combined = hyph_word.replace('-', '')

	            # The spaced form takes precedence; the fused form is only
	            # reported when the spaced form is absent.
	            if re.search(rf'\b{re.escape(non_hyph)}\b', content, re.IGNORECASE):
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.INFO,
	                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{non_hyph}'",
	                    suggestion="Choose one form and use it consistently"
	                ))
	            elif re.search(rf'\b{re.escape(combined)}\b', content, re.IGNORECASE):
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.INFO,
	                    message=f"Inconsistent hyphenation: '{hyph_word}' vs '{combined}'",
	                    suggestion="Choose one form and use it consistently"
	                ))

	        return results

	    def _check_capitalization_consistency(self, content: str) -> List[CheckResult]:
	        """Check if technical terms have consistent capitalization.

	        Args:
	            content: Comment-stripped document text.

	        Returns:
	            One INFO result for each entry of ``CAPITALIZATION_TERMS`` that
	            occurs with two or more distinct casings, in list order.
	        """
	        results = []

	        for term in self.CAPITALIZATION_TERMS:
	            # Collect every distinct casing of the term as a whole word.
	            pattern = re.compile(rf'\b{re.escape(term)}\b', re.IGNORECASE)
	            unique_forms = set(pattern.findall(content))

	            if len(unique_forms) > 1:
	                # Sort so the message text is stable between runs; plain set
	                # iteration order is not.
	                forms_str = ', '.join(f"'{f}'" for f in sorted(unique_forms))
	                results.append(self._create_result(
	                    passed=False,
	                    severity=CheckSeverity.INFO,
	                    message=f"Inconsistent capitalization: {forms_str}",
	                    suggestion="Use consistent capitalization for technical terms"
	                ))

	        return results