File size: 8,718 Bytes
46df5f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
Formatting checker.

Validates common LaTeX formatting issues:
- Citation formatting consistency
- Non-breaking spaces before citations
- Special character escaping
- Whitespace issues
"""
import re
from typing import List

from .base import BaseChecker, CheckResult, CheckSeverity


class FormattingChecker(BaseChecker):
    """Check for common LaTeX formatting issues.

    The checker reports three kinds of findings:

    * citations that follow a word with ordinary whitespace instead of a
      non-breaking space (``~``), plus a single note when more than one
      citation style (parenthetical / textual / plain) is used;
    * runs of three or more consecutive blank lines;
    * unescaped ``&`` characters outside math mode and outside environments
      in which ``&`` is the column/alignment separator.

    Comment handling, result construction and line lookup are delegated to
    helpers inherited from ``BaseChecker`` (``_is_comment_line``,
    ``_remove_line_comment``, ``_create_result``, ``_find_line_number``).
    """

    name = "formatting"
    display_name = "Formatting"
    description = "Check citation style, spacing, and special characters"

    # Citation commands
    CITE_COMMANDS = ['cite', 'citep', 'citet', 'citealt', 'citealp',
                     'citeauthor', 'citeyear', 'autocite', 'textcite',
                     'parencite', 'footcite']

    # Style bucket of each command that takes part in the consistency check.
    # Commands from CITE_COMMANDS that are absent here are not counted.
    CITE_STYLE_BY_COMMAND = {
        'citep': 'parenthetical',
        'parencite': 'parenthetical',
        'autocite': 'parenthetical',
        'citet': 'textual',
        'textcite': 'textual',
        'cite': 'plain',
    }

    # Pattern for citations without non-breaking space
    # Matches: "word \cite" but not "word~\cite".  A starred command and any
    # number of optional [..] arguments are accepted before the opening brace,
    # so "word \cite[p. 3]{key}" is reported just like "word \cite{key}".
    CITE_NO_NBSP_PATTERN = re.compile(
        r'(\w)\s+(\\cite\w*\*?(?:\[[^\]]*\])*\{)'
    )

    # Pattern for multiple consecutive spaces
    # (not referenced by the methods of this class)
    MULTI_SPACE_PATTERN = re.compile(r'(?<!\\)  +')

    # Pattern for unescaped special characters (outside math mode)
    SPECIAL_CHARS = {
        '%': r'(?<!\\)%',  # Unescaped %
        '&': r'(?<!\\)&(?![a-zA-Z]+;)',  # Unescaped & (not HTML entities)
        '#': r'(?<!\\)#',  # Unescaped #
        '_': r'(?<![\\$])_(?![^$]*\$)',  # Unescaped _ outside math
        '^': r'(?<![\\$])\^(?![^$]*\$)',  # Unescaped ^ outside math
    }

    # Compiled once instead of re-parsing the pattern for every line.
    UNESCAPED_AMPERSAND_PATTERN = re.compile(SPECIAL_CHARS['&'])

    # Multiple blank lines pattern (3 or more blank lines)
    MULTI_BLANK_PATTERN = re.compile(r'\n\s*\n\s*\n\s*\n')

    # Delimiter-based math.  All delimiters are scanned in ONE pass so that
    # "$$" is consumed as display math before "$" is tried; scanning "$" and
    # "$$" separately lets the second "$" of a closing "$$" pair up with the
    # next "$" in the document and mark ordinary text as math.  The
    # look-behind in front of "\[" and "\(" keeps a line break with an
    # optional length such as "\\[2pt]" from being read as a math opener.
    MATH_DELIMITER_PATTERN = re.compile(
        r'(?<!\\)\$\$.*?(?<!\\)\$\$'    # display math $$ ... $$
        r'|(?<!\\)\$.*?(?<!\\)\$'       # inline math $ ... $
        r'|(?<!\\)\\\[.*?\\\]'          # \[ ... \]
        r'|(?<!\\)\\\(.*?\\\)',         # \( ... \)
        re.DOTALL,
    )

    # Environments whose whole body is math mode (starred forms included).
    MATH_ENVIRONMENTS = ('equation', 'align', 'gather', 'multline',
                         'displaymath', 'math', 'eqnarray', 'alignat',
                         'flalign')

    # Environments in which "&" is a legitimate column/alignment separator.
    AMPERSAND_ENVIRONMENTS = ('tabular', 'tabularx', 'tabulary', 'longtable',
                              'array', 'align', 'alignat', 'flalign',
                              'aligned', 'split', 'cases', 'eqnarray',
                              'matrix', 'pmatrix', 'bmatrix', 'Bmatrix',
                              'vmatrix', 'Vmatrix', 'smallmatrix')

    def check(self, tex_content: str, config: dict = None) -> List[CheckResult]:
        """Run every formatting check on *tex_content*.

        Args:
            tex_content: Full LaTeX source as one string.
            config: Accepted for interface compatibility; not read here.

        Returns:
            Findings in this order: per-line citation spacing notes, the
            mixed-citation-style note (if any), multiple-blank-line notes,
            then unescaped ``&`` warnings.
        """
        lines = tex_content.split('\n')

        results = []
        results.extend(self._check_citations(lines))
        results.extend(self._check_blank_lines(tex_content))
        results.extend(self._check_special_chars(tex_content, lines))
        return results

    def _check_citations(self, lines: List[str]) -> List[CheckResult]:
        """Report citations lacking ``~`` and inconsistent citation styles."""
        results = []

        # Number of (line, command) hits per style bucket.
        cite_styles = {'parenthetical': 0, 'textual': 0, 'plain': 0}

        for line_num, line in enumerate(lines, 1):
            # Whole-line comments are ignored entirely.
            if self._is_comment_line(line):
                continue

            # Only the part in front of an inline comment is inspected.
            line_content = self._remove_line_comment(line)

            # One finding per offending citation on the line.
            for _ in self.CITE_NO_NBSP_PATTERN.finditer(line_content):
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.INFO,
                    message="Citation without non-breaking space",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use ~ before \\cite (e.g., 'text~\\cite{key}')"
                ))

            # A command counts at most once per line; \b keeps "\cite" from
            # also matching inside "\citep", "\citet", ...
            for cmd in self.CITE_COMMANDS:
                style = self.CITE_STYLE_BY_COMMAND.get(cmd)
                if style is None:
                    continue
                if re.search(rf'\\{cmd}\b', line_content):
                    cite_styles[style] += 1

        styles_used = [s for s, count in cite_styles.items() if count > 0]
        if len(styles_used) > 1:
            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Mixed citation styles detected: {', '.join(styles_used)}",
                suggestion="Consider using consistent citation style throughout"
            ))

        return results

    def _check_blank_lines(self, tex_content: str) -> List[CheckResult]:
        """Report every run of three or more consecutive blank lines."""
        results = []

        for match in self.MULTI_BLANK_PATTERN.finditer(tex_content):
            start_pos = match.start()
            end_pos = match.end()
            line_num = self._find_line_number(tex_content, start_pos)

            # The match runs from the newline that ends the preceding line to
            # the newline that ends the last blank line, so it contains one
            # more newline than there are blank lines.
            blank_count = match.group(0).count('\n') - 1

            # Text of the line directly before the blank run.
            prev_line_start = tex_content.rfind('\n', 0, start_pos) + 1
            prev_line = tex_content[prev_line_start:start_pos].rstrip()

            # Text of the line directly after the blank run.
            next_line_end = tex_content.find('\n', end_pos)
            if next_line_end == -1:
                next_line_end = len(tex_content)
            next_line = tex_content[end_pos:next_line_end].rstrip()

            # Visual context: previous line, one marker per blank line, next line.
            blank_lines = '\n'.join(["> blank line ⚠️"] * blank_count)
            line_content = f"{prev_line}\n{blank_lines}\n{next_line}"

            results.append(self._create_result(
                passed=False,
                severity=CheckSeverity.INFO,
                message=f"Multiple blank lines ({blank_count} consecutive blank lines)",
                line_number=line_num,
                line_content=line_content,
                suggestion="Reduce to single blank line or use \\vspace"
            ))

        return results

    def _check_special_chars(self, content: str, lines: List[str]) -> List[CheckResult]:
        """Check for unescaped special characters.

        Currently only ``&`` is checked.  An occurrence is reported unless it
        lies in a math region or in an environment that uses ``&`` as a
        separator.

        Args:
            content: Full LaTeX source.
            lines: ``content`` split on ``'\\n'``; line lengths are used to
                map a column in a line back to an offset in ``content``.
        """
        results = []

        # Both region lists are computed once for the whole document rather
        # than once per "&" occurrence.
        skip_regions = self._find_math_regions(content)
        skip_regions.extend(
            self._find_environment_regions(content, self.AMPERSAND_ENVIRONMENTS)
        )

        next_line_start = 0
        for line_num, line in enumerate(lines, 1):
            # Running offset of this line inside ``content`` (+1 for the
            # newline removed by split); advanced before any ``continue``.
            line_start = next_line_start
            next_line_start += len(line) + 1

            if self._is_comment_line(line):
                continue

            # NOTE(review): offsets below assume _remove_line_comment only
            # truncates the line and never shifts the remaining text.
            line_content = self._remove_line_comment(line)

            for match in self.UNESCAPED_AMPERSAND_PATTERN.finditer(line_content):
                pos = line_start + match.start()
                if self._in_math_region(pos, skip_regions):
                    continue
                results.append(self._create_result(
                    passed=False,
                    severity=CheckSeverity.WARNING,
                    message="Unescaped & outside tabular/math environment",
                    line_number=line_num,
                    line_content=line.strip()[:100],
                    suggestion="Use \\& to escape"
                ))

        return results

    def _find_environment_regions(self, content: str, env_names) -> List[tuple]:
        """Return ``(start, end)`` spans of the given environments.

        Each span covers ``\\begin{env}`` through the first following
        ``\\end{env}`` (an optional ``*`` is accepted on either side); ``end``
        is exclusive.  Nested environments of the same name are therefore
        closed by the innermost ``\\end``.
        """
        regions = []
        for env in env_names:
            env_re = re.escape(env)
            pattern = rf'\\begin\{{{env_re}\*?\}}.*?\\end\{{{env_re}\*?\}}'
            regions.extend(
                (match.start(), match.end())
                for match in re.finditer(pattern, content, re.DOTALL)
            )
        return regions

    def _find_math_regions(self, content: str) -> List[tuple]:
        """Find regions that are inside math mode.

        Returns:
            ``(start, end)`` offsets into *content* with ``end`` exclusive,
            covering ``$$...$$``, ``$...$``, ``\\[...\\]``, ``\\(...\\)`` and
            the environments listed in ``MATH_ENVIRONMENTS``.
        """
        regions = [
            (match.start(), match.end())
            for match in self.MATH_DELIMITER_PATTERN.finditer(content)
        ]
        regions.extend(
            self._find_environment_regions(content, self.MATH_ENVIRONMENTS)
        )
        return regions

    def _in_math_region(self, pos: int, regions: List[tuple]) -> bool:
        """Check if position is inside a math region.

        ``end`` comes from ``match.end()`` and is exclusive, so a position
        equal to ``end`` is the first character *after* the region.
        """
        return any(start <= pos < end for start, end in regions)

    def _in_environment(self, content: str, pos: int, env_names: List[str]) -> bool:
        """Check if position is inside any of the given environments."""
        regions = self._find_environment_regions(content, env_names)
        return any(start <= pos < end for start, end in regions)