|
|
""" |
|
|
Formatting checker. |
|
|
|
|
|
Validates common LaTeX formatting issues: |
|
|
- Citation formatting consistency |
|
|
- Non-breaking spaces before citations |
|
|
- Special character escaping |
|
|
- Whitespace issues |
|
|
""" |
|
|
import re |
|
|
from typing import List |
|
|
|
|
|
from .base import BaseChecker, CheckResult, CheckSeverity |
|
|
|
|
|
|
|
|
class FormattingChecker(BaseChecker):
    """Check LaTeX source for common formatting issues.

    ``check`` reports, in this order:

    * citations preceded by ordinary whitespace instead of ``~``;
    * a mix of parenthetical, textual and plain citation commands;
    * runs of three or more consecutive blank lines;
    * unescaped ``&`` outside math and table-like environments.

    Comment detection, comment stripping, result construction and line
    lookup are delegated to ``_is_comment_line``, ``_remove_line_comment``,
    ``_create_result`` and ``_find_line_number``, which are not defined in
    this class (presumably inherited from ``BaseChecker``).
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands inspected by the style-consistency check.
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # A word character, whitespace, then any \cite...{ command: the
    # whitespace should have been a non-breaking "~".
    CITE_NO_NBSP_PATTERN = re.compile(r'(\w)\s+(\\cite\w*\{)')

    # NOTE(review): not referenced by any method of this class.
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\) +')

    # Regex source for unescaped special characters, keyed by character.
    # Only the '&' entry is used by this class.
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',
        '#': r'(?<!\\)#',
        '_': r'(?<![\\$])_(?![^$]*\$)',
        '^': r'(?<![\\$])\^(?![^$]*\$)',
    }

    # Four newlines separated only by whitespace, i.e. at least three
    # consecutive blank lines.
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    # Citation style bucket for each command that has one; commands that are
    # absent (e.g. citeauthor) do not count towards style mixing.
    _CITE_STYLE_BY_COMMAND = {
        'citep': 'parenthetical',
        'parencite': 'parenthetical',
        'autocite': 'parenthetical',
        'citet': 'textual',
        'textcite': 'textual',
        'cite': 'plain',
    }

    # Environments in which a bare "&" is a legitimate column separator.
    _TABLE_ENVIRONMENTS = ('tabular', 'array', 'align', 'matrix')

    # Environments whose whole body is math mode.
    _MATH_ENVIRONMENTS = ('equation', 'align', 'gather', 'multline',
                          'displaymath')

    # "$$...$$" is tried before "$...$" at every position so the closing
    # "$$" of a display block can never be mistaken for an inline opener.
    _DOLLAR_MATH_PATTERN = re.compile(
        r'(?<!\\)\$\$.*?(?<!\\)\$\$|(?<!\\)\$.*?(?<!\\)\$',
        re.DOTALL,
    )

    # "\[...\]" and "\(...\)". The lookbehind rejects "\\[2pt]" / "\\(",
    # where the bracket follows a line break rather than opening math.
    _BRACKET_MATH_PATTERN = re.compile(
        r'(?<!\\)\\\[.*?\\\]|(?<!\\)\\\(.*?\\\)',
        re.DOTALL,
    )

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run every formatting check on ``tex_content``.

        Args:
            tex_content: Full LaTeX source text.
            config: Accepted for interface compatibility; not read here.

        Returns:
            Results in this order: per-line citation spacing notices, at most
            one mixed-citation-style notice, multiple-blank-line notices,
            then unescaped ``&`` warnings.
        """
        lines = tex_content.split('\n')

        results = self._check_citations(lines)
        results.extend(self._check_blank_lines(tex_content))
        results.extend(self._check_special_chars(tex_content, lines))
        return results

    def _check_citations(self, lines: List[str]) -> List[CheckResult]:
        """Report missing ``~`` before citations and mixed citation styles."""
        results = []
        # Insertion order fixes the order of styles in the summary message.
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        # Compile one pattern per styled command once, not once per line.
        style_patterns = [
            (self._CITE_STYLE_BY_COMMAND[cmd], re.compile(rf'\\{cmd}\b'))
            for cmd in self.CITE_COMMANDS
            if cmd in self._CITE_STYLE_BY_COMMAND
        ]

        for line_num, line in enumerate(lines, 1):
            if self._is_comment_line(line):
                continue

            line_content = self._remove_line_comment(line)

            # One notice per offending citation on the line.
            for _ in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # Each command counts at most once per line it appears on.
            for style, pattern in style_patterns:
                if pattern.search(line_content):
                    cite_styles[style] += 1

        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))

        return results

    def _check_blank_lines(self, tex_content: str) -> List[CheckResult]:
        """Report every run of three or more consecutive blank lines."""
        results = []

        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            start_pos, end_pos = match.span()
            line_num = self._find_line_number(tex_content, start_pos)

            # The match begins at the newline ending the preceding line, so
            # N newlines enclose N - 1 blank lines.
            blank_count = match.group(0).count('\n') - 1

            # Text of the line just before the blank run.
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Text of the line just after the blank run.
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            blank_lines = '\n'.join(["> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))

        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Report unescaped ``&`` outside math and table-like environments.

        Args:
            content: Full LaTeX source text.
            lines: ``content`` split on ``'\\n'``; offsets into ``content``
                are derived from the lengths of these lines.

        Returns:
            One WARNING result per offending ``&``.
        """
        results = []

        # Scan the document once up front instead of once per "&".
        math_regions = self._find_math_regions(content)
        table_regions = self._find_environment_regions(
            content, self._TABLE_ENVIRONMENTS)
        ampersand = re.compile(self.SPECIAL_CHARS['&'])

        next_line_start = 0
        for line_num, line in enumerate(lines, 1):
            # Running offset of this line within ``content`` (+1 for the
            # newline removed by split); advanced even for skipped lines.
            line_start = next_line_start
            next_line_start += len(line) + 1

            if self._is_comment_line(line):
                continue

            # NOTE(review): match offsets are added to line_start, which
            # assumes _remove_line_comment only trims the tail of the line.
            line_content = self._remove_line_comment(line)

            for match in ampersand.finditer(line_content):
                pos = line_start + match.start()

                if self._in_math_region(pos, math_regions):
                    continue
                if self._in_math_region(pos, table_regions):
                    continue

                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Unescaped & outside tabular/math environment",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use \\& to escape"
                ))

        return results

    @staticmethod
    def _find_environment_regions(content: str, env_names) -> List[tuple]:
        """Return ``(start, end)`` spans of the named environments.

        Each span runs from ``\\begin{env}`` through the first following
        ``\\end{env}``; starred variants are accepted. ``end`` is exclusive.
        Environment names are matched literally.
        """
        regions = []
        for env in env_names:
            env_name = re.escape(env)
            pattern = (r'\\begin\{' + env_name + r'\*?\}.*?'
                       r'\\end\{' + env_name + r'\*?\}')
            regions.extend(
                m.span() for m in re.finditer(pattern, content, re.DOTALL))
        return regions

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Find regions that are inside math mode.

        Recognises ``$...$``, ``$$...$$``, ``\\(...\\)``, ``\\[...\\]`` and
        the equation, align, gather, multline and displaymath environments.

        Returns:
            ``(start, end)`` offsets into ``content`` with ``end`` exclusive.
            Regions may overlap and are not sorted.
        """
        regions = [m.span() for m in self._DOLLAR_MATH_PATTERN.finditer(content)]
        regions.extend(
            m.span() for m in self._BRACKET_MATH_PATTERN.finditer(content))
        regions.extend(
            self._find_environment_regions(content, self._MATH_ENVIRONMENTS))
        return regions

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if position is inside a math region.

        Regions are half-open ``(start, end)`` spans, so ``pos == end`` (the
        character right after the closing delimiter) is outside the region.
        """
        return any(start <= pos < end for start, end in regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if position is inside any of the given environments.

        Args:
            content: Full LaTeX source text.
            pos: Character offset into ``content``.
            env_names: Environment names, matched literally, with or without
                a trailing ``*``.

        Returns:
            True when ``pos`` lies between the start of a ``\\begin{env}``
            and the end of its ``\\end{env}`` (end exclusive).
        """
        regions = self._find_environment_regions(content, env_names)
        return any(start <= pos < end for start, end in regions)
|
|
|