BibGuard / src /checkers /formatting_checker.py
thinkwee
init
46df5f0
"""
Formatting checker.
Validates common LaTeX formatting issues:
- Citation formatting consistency
- Non-breaking spaces before citations
- Special character escaping
- Whitespace issues
"""
import re
from typing import List
from .base import BaseChecker, CheckResult, CheckSeverity
class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues.

    The checker reports:

    - citations that are preceded by ordinary whitespace instead of ``~``,
    - a mix of parenthetical / textual / plain citation commands,
    - runs of three or more consecutive blank lines,
    - unescaped ``&`` outside math mode and tabular-like environments.

    Comment handling, result construction and offset-to-line mapping are
    delegated to base-class helpers (``_is_comment_line``,
    ``_remove_line_comment``, ``_create_result``, ``_find_line_number``),
    which are defined outside this file.
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Style bucket for the commands that take part in the consistency check.
    # Commands from CITE_COMMANDS that are absent here are not counted.
    CITE_STYLE_BY_COMMAND = {
        'citep': 'parenthetical',
        'parencite': 'parenthetical',
        'autocite': 'parenthetical',
        'citet': 'textual',
        'textcite': 'textual',
        'cite': 'plain',
    }

    # Pattern for citations without non-breaking space
    # Matches: "word \cite" but not "word~\cite"
    CITE_NO_NBSP_PATTERN = re.compile(r'(\w)\s+(\\cite\w*\{)')

    # Pattern for multiple consecutive spaces: two or more spaces that do not
    # directly follow a backslash. The explicit {2,} quantifier is required;
    # a bare " +" would also match every single space.
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\) {2,}')

    # Pattern for unescaped special characters (outside math mode)
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',  # Unescaped %
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',  # Unescaped & (not HTML entities)
        '#': r'(?<!\\)#',  # Unescaped #
        '_': r'(?<![\\$])_(?![^$]*\$)',  # Unescaped _ outside math
        '^': r'(?<![\\$])\^(?![^$]*\$)',  # Unescaped ^ outside math
    }

    # Compiled once so the per-line scan does not rebuild it.
    UNESCAPED_AMPERSAND_PATTERN = re.compile(SPECIAL_CHARS['&'])

    # Multiple blank lines pattern (3 or more blank lines)
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    # Delimiter-based math. The alternatives are tried left to right at each
    # position, so "$$ ... $$" is consumed as a whole before "$ ... $" gets a
    # chance; otherwise the second "$" of a closing "$$" would open a bogus
    # inline region that runs to the next "$" in the document.
    # The lookbehind on "\[" and "\(" keeps a line break with optional
    # spacing such as "\\[2pt]" from being read as a math opener.
    MATH_DELIMITER_PATTERN = re.compile(
        r'(?<!\\)\$\$.*?(?<!\\)\$\$'          # $$ ... $$
        r'|(?<!\\)\$(?!\$).*?(?<!\\)\$'       # $ ... $
        r'|(?<!\\)\\\[.*?\\\]'                # \[ ... \]
        r'|(?<!\\)\\\(.*?\\\)',               # \( ... \)
        re.DOTALL,
    )

    # Environments whose whole body is math mode.
    MATH_ENVIRONMENTS = ['equation', 'align', 'gather', 'multline', 'displaymath']

    # Environments in which "&" is a legitimate column separator.
    AMPERSAND_ENVIRONMENTS = ['tabular', 'array', 'align', 'matrix']

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run every formatting check on a LaTeX document.

        Args:
            tex_content: Full LaTeX source text.
            config: Accepted for interface compatibility; this checker does
                not read it.

        Returns:
            The failed-check results in this order: per-line citation
            spacing, citation style consistency, multiple blank lines,
            unescaped special characters.
        """
        lines = tex_content.split('\n')
        results = []
        results.extend(self._check_citations(lines))
        results.extend(self._check_blank_lines(tex_content))
        results.extend(self._check_special_chars(tex_content, lines))
        return results

    def _check_citations(self, lines: List[str]) -> List[CheckResult]:
        """Check citation spacing per line and citation style consistency."""
        results = []
        # Insertion order fixes the order of style names in the message.
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue
            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # One result per offending citation on the line.
            for _ in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # A command counts at most once per line; only whether a style
            # was used at all matters for the consistency check below.
            for cmd, style in self.CITE_STYLE_BY_COMMAND.items():
                if re.search(rf'\\{cmd}\b', line_content):
                    cite_styles[style] += 1

        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))
        return results

    def _check_blank_lines(self, tex_content: str) -> List[CheckResult]:
        """Report every run of three or more consecutive blank lines."""
        results = []
        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            start_pos = match.start()
            end_pos = match.end()
            line_num = self._find_line_number(tex_content, start_pos)

            # The match starts at the newline that ends the preceding text
            # line, so it contains one more newline than blank lines.
            blank_count = match.group(0).count('\n') - 1

            # Text line directly before the blank run.
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Text line directly after the blank run.
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Visual representation with one warning marker per blank line.
            blank_lines = '\n'.join(["> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))
        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters.

        Currently only ``&`` is checked. An ``&`` is reported unless it lies
        in a math region or inside one of ``AMPERSAND_ENVIRONMENTS``.

        Args:
            content: Full LaTeX source text.
            lines: ``content`` split on ``'\\n'``; character offsets are
                derived from these line lengths.
        """
        results = []
        # Computed once for the whole document instead of once per match.
        math_regions = self._find_math_regions(content)
        env_regions = self._find_environment_regions(
            content, self.AMPERSAND_ENVIRONMENTS)

        next_line_start = 0
        for line_num, line in enumerate(lines, 1):
            # Running offset of this line in ``content`` (+1 for the newline).
            line_start = next_line_start
            next_line_start += len(line) + 1

            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue
            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            for match in self.UNESCAPED_AMPERSAND_PATTERN.finditer(line_content):
                pos = line_start + match.start()
                if self._in_regions(pos, math_regions):
                    continue
                if self._in_regions(pos, env_regions):
                    continue
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Unescaped & outside tabular/math environment",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use \\& to escape"
                ))
        return results

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Find regions that are inside math mode.

        Returns:
            ``(start, end)`` character offsets with ``end`` exclusive,
            covering ``$...$``, ``$$...$$``, ``\\(...\\)``, ``\\[...\\]`` and
            the environments listed in ``MATH_ENVIRONMENTS``.
        """
        regions = [
            (match.start(), match.end())
            for match in self.MATH_DELIMITER_PATTERN.finditer(content)
        ]
        regions.extend(
            self._find_environment_regions(content, self.MATH_ENVIRONMENTS))
        return regions

    def _find_environment_regions(self, content: str, env_names: List[str]) -> List[tuple]:
        """Return ``(start, end)`` spans of the named environments.

        Each span runs from ``\\begin{env}`` through the first following
        ``\\end{env}``; the starred variant of each name is matched as well.
        ``end`` is exclusive.
        """
        regions = []
        for env in env_names:
            name = re.escape(env)
            pattern = (r'\\begin\{' + name + r'\*?\}(.*?)'
                       r'\\end\{' + name + r'\*?\}')
            for match in re.finditer(pattern, content, re.DOTALL):
                regions.append((match.start(), match.end()))
        return regions

    @staticmethod
    def _in_regions(pos: int, regions: List[tuple]) -> bool:
        """Return True if ``pos`` lies in any half-open ``(start, end)`` span."""
        # ``end`` comes from ``match.end()`` and is exclusive, hence ``<``:
        # the character right after a closing delimiter is outside the region.
        return any(start <= pos < end for start, end in regions)

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if position is inside a math region."""
        return self._in_regions(pos, regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if position is inside any of the given environments."""
        return self._in_regions(
            pos, self._find_environment_regions(content, env_names))