File size: 8,718 Bytes
46df5f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
"""
Formatting checker.
Validates common LaTeX formatting issues:
- Citation formatting consistency
- Non-breaking spaces before citations
- Special character escaping
- Whitespace issues
"""
import re
from typing import List
from .base import BaseChecker, CheckResult, CheckSeverity
class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues.

    The checker reports three kinds of findings:

    * citations preceded by ordinary whitespace instead of ``~``, plus a
      single note when parenthetical, textual and plain citation commands
      are mixed in one document;
    * runs of three or more consecutive blank lines;
    * unescaped ``&`` outside math mode and outside alignment environments.

    Comment handling, result construction and offset-to-line conversion are
    delegated to helpers inherited from ``BaseChecker`` (``_is_comment_line``,
    ``_remove_line_comment``, ``_create_result``, ``_find_line_number``),
    which are defined outside this class.
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Citation commands that take part in the style-consistency check, mapped
    # to the style bucket they count towards. Commands in CITE_COMMANDS that
    # are absent here (citeauthor, citeyear, ...) are style-neutral.
    CITE_STYLE_BY_COMMAND = {
        'citep': 'parenthetical',
        'parencite': 'parenthetical',
        'autocite': 'parenthetical',
        'citet': 'textual',
        'textcite': 'textual',
        'cite': 'plain',
    }

    # One pass over a line finds every style-bearing command. The trailing \b
    # keeps "\cite" from matching the prefix of "\citep" or "\citeauthor".
    CITE_STYLE_PATTERN = re.compile(
        r'\\(' + '|'.join(CITE_STYLE_BY_COMMAND) + r')\b'
    )

    # Pattern for citations without non-breaking space
    # Matches: "word \cite" but not "word~\cite". Starred variants and
    # optional arguments ("\citep*[see][p. 3]{key}") are accepted between the
    # command name and the opening brace.
    CITE_NO_NBSP_PATTERN = re.compile(
        r'(\w)\s+(\\cite\w*\*?(?:\[[^\]]*\])*\{)'
    )

    # Pattern for multiple consecutive spaces (two or more; a single space is
    # normal text and must not match)
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\) {2,}')

    # Pattern for unescaped special characters (outside math mode)
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',  # Unescaped %
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',  # Unescaped & (not HTML entities)
        '#': r'(?<!\\)#',  # Unescaped #
        '_': r'(?<![\\$])_(?![^$]*\$)',  # Unescaped _ outside math
        '^': r'(?<![\\$])\^(?![^$]*\$)',  # Unescaped ^ outside math
    }

    # Compiled form of SPECIAL_CHARS['&'], used by _check_special_chars.
    UNESCAPED_AMP_PATTERN = re.compile(SPECIAL_CHARS['&'])

    # Multiple blank lines pattern (3 or more blank lines)
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    # Delimiter-based math. "$$...$$" is tried before "$...$" inside ONE
    # pattern so that the second "$" of a closing "$$" can never be mistaken
    # for the start of a new inline formula.
    MATH_DELIMITER_PATTERNS = (
        re.compile(r'(?<!\\)(?:\$\$.*?(?<!\\)\$\$|\$.*?(?<!\\)\$)', re.DOTALL),
        # The lookbehind rejects "\\[2pt]" (line break with optional spacing),
        # which contains the characters "\[" but is not display math.
        re.compile(r'(?<!\\)\\\[.*?\\\]', re.DOTALL),
        re.compile(r'(?<!\\)\\\(.*?\\\)', re.DOTALL),
    )

    # Environments whose whole body is math mode.
    MATH_ENVIRONMENTS = ['equation', 'align', 'gather', 'multline',
                         'displaymath', 'eqnarray', 'alignat', 'flalign',
                         'math']

    # Environments in which a bare "&" is a legitimate column separator.
    ALIGNMENT_ENVIRONMENTS = ['tabular', 'tabularx', 'longtable', 'array',
                              'align', 'matrix']

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run every formatting check on a LaTeX document.

        Args:
            tex_content: Full text of the document.
            config: Accepted for interface compatibility; not read here.

        Returns:
            Findings in this order: citation spacing (in line order), the
            optional mixed-citation-style note, blank-line runs, then
            unescaped special characters.
        """
        lines = tex_content.split('\n')

        results = []
        results.extend(self._check_citations(lines))
        results.extend(self._check_blank_lines(tex_content))
        # Check for common issues with special characters
        results.extend(self._check_special_chars(tex_content, lines))
        return results

    def _check_citations(self, lines: List[str]) -> List[CheckResult]:
        """Report citations lacking a non-breaking space and mixed styles."""
        results = []
        style_counts = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method
            line_content = self._remove_line_comment(line)

            # One finding per offending citation on the line.
            for _ in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # Only whether a style occurs at all matters below, so counting
            # per occurrence is sufficient.
            for found in self.CITE_STYLE_PATTERN.finditer(line_content):
                style_counts[self.CITE_STYLE_BY_COMMAND[found.group(1)]] += 1

        styles_used = [style for style, count in style_counts.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))
        return results

    def _check_blank_lines(self, tex_content: str) -> List[CheckResult]:
        """Report every run of three or more consecutive blank lines."""
        results = []
        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            start_pos, end_pos = match.span()
            line_num = self._find_line_number(tex_content, start_pos)

            # The match starts on the newline that ends the preceding text
            # line and stops on the newline that ends the last blank line, so
            # it holds one more newline than there are blank lines.
            blank_count = match.group(0).count('\n') - 1

            # Line before the run: from the previous newline (or start of the
            # document, where rfind gives -1 and +1 yields 0) up to the match.
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Line after the run: up to the next newline or end of document.
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Create visual representation with warning markers
            blank_lines = '\n'.join(["> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))
        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters.

        Currently only ``&`` is examined. An occurrence is reported unless it
        lies in a math region or in one of ``ALIGNMENT_ENVIRONMENTS``.

        Args:
            content: Full document text.
            lines: ``content`` split on ``'\\n'``; offsets into ``content``
                are derived from these lines, so the two must agree.
        """
        results = []

        # Both region lists are computed once for the whole document instead
        # of re-scanning it for every "&".
        math_regions = self._find_math_regions(content)
        alignment_regions = self._find_environment_regions(
            content, self.ALIGNMENT_ENVIRONMENTS
        )

        next_line_start = 0
        for line_num, line in enumerate(lines, 1):
            # Running offset of this line within ``content``; advanced before
            # any ``continue`` so skipped lines still count. +1 is the newline.
            line_start = next_line_start
            next_line_start += len(line) + 1

            # Skip commented lines using base class method
            if self._is_comment_line(line):
                continue

            # Remove inline comments using base class method.
            # NOTE(review): offsets below assume the helper only trims the
            # tail of the line, leaving earlier columns unchanged - confirm.
            line_content = self._remove_line_comment(line)

            for match in self.UNESCAPED_AMP_PATTERN.finditer(line_content):
                pos = line_start + match.start()
                if self._in_math_region(pos, math_regions):
                    continue
                if self._in_regions(pos, alignment_regions):
                    continue
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Unescaped & outside tabular/math environment",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use \\& to escape"
                ))
        return results

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Find regions that are inside math mode.

        Returns:
            ``(start, end)`` offsets into ``content`` with ``end`` exclusive,
            covering ``$...$``, ``$$...$$``, ``\\[...\\]``, ``\\(...\\)`` and
            every environment in ``MATH_ENVIRONMENTS``. Regions may overlap.
        """
        regions = []
        for pattern in self.MATH_DELIMITER_PATTERNS:
            regions.extend(found.span() for found in pattern.finditer(content))
        regions.extend(
            self._find_environment_regions(content, self.MATH_ENVIRONMENTS)
        )
        return regions

    def _find_environment_regions(self, content: str, env_names: List[str]) -> List[tuple]:
        """Return ``(start, end)`` spans of the named environments.

        Each span runs from ``\\begin{env}`` through the nearest following
        ``\\end{env}`` (starred forms accepted), ``end`` exclusive. Nesting of
        the same environment is not tracked.
        """
        regions = []
        for env in env_names:
            # Escape the name so regex metacharacters in it match literally.
            escaped = re.escape(env)
            pattern = re.compile(
                rf'\\begin\{{{escaped}\*?\}}.*?\\end\{{{escaped}\*?\}}',
                re.DOTALL,
            )
            regions.extend(found.span() for found in pattern.finditer(content))
        return regions

    @staticmethod
    def _in_regions(pos: int, regions: List[tuple]) -> bool:
        """Return True if ``pos`` lies in any half-open ``(start, end)`` span."""
        # ``end`` comes from ``Match.end()`` and is exclusive: the character
        # AT ``end`` is the first one after the region.
        return any(start <= pos < end for start, end in regions)

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if position is inside a math region."""
        return self._in_regions(pos, regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if position is inside any of the given environments.

        This scans ``content`` on every call; when testing many positions,
        compute the spans once with ``_find_environment_regions`` instead.
        """
        return self._in_regions(
            pos, self._find_environment_regions(content, env_names)
        )
|