File size: 13,495 Bytes
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
86f307d
50ff5d1
 
 
 
 
 
 
 
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
 
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
86f307d
50ff5d1
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
 
 
 
 
86f307d
50ff5d1
 
86f307d
50ff5d1
 
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
 
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f307d
50ff5d1
 
 
 
 
 
 
 
86f307d
50ff5d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
"""
optimized_latex_processor.py

Dependencies:
    pip install pylatexenc latex2mathml

Optional (for more advanced features not used here):
    pip install sympy

Functionality:
 - sanitize Gemini output (strip ```latex``` fences safely)
 - detect math heuristically and via parser
 - extract inline/display math nodes using pylatexenc (MathNodes + Environments)
 - validate LaTeX with parser + robust balanced-delimiters checks
 - convert to MathML (latex2mathml)
 - convert to Unicode with superscript/subscript support
"""

import re
from typing import List, Tuple, Dict, Any, Optional

# pylatexenc imports
from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
from latex2mathml.converter import convert as latex2mathml_convert


class OptimizedLaTeXProcessor:
    def __init__(self, enable_mathml: bool = True):
        self.enable_mathml = enable_mathml
        
        # 1. Basic Symbol Map
        self.unicode_map = {
            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
        }

        # 2. Superscript/Subscript Maps
        self.sup_map = str.maketrans("0123456789+-=()abcdefghijklmnopqrstuvwxyz", 
                                     "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ")
        self.sub_map = str.maketrans("0123456789+-=()aehijklmnoprstuvx", 
                                     "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ")

        # 3. Regex patterns
        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
        self._heuristic_math_pat = re.compile(
            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
            re.IGNORECASE
        )
        # Math environments to detect
        self.math_environments = {
            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*', 
            'split', 'multline', 'flalign'
        }

    # ----------------------------
    # Sanitization
    # ----------------------------
    def sanitize_input(self, text: str) -> str:
        """
        Remove surrounding Markdown fences like ```latex``` or ``` that contain LaTeX,
        but preserve the inner LaTeX exactly (do not mangle escaped dollars).
        """
        def _fence_repl(m):
            return m.group(1)

        # Remove code fences with optional language specifier
        text = re.sub(r'```(?:latex)?\n(.*?)```', _fence_repl, text, flags=re.DOTALL | re.IGNORECASE)
        # Handle triple-backtick blocks without newline start
        text = re.sub(r'```(.*?)```', _fence_repl, text, flags=re.DOTALL)
        # Normalize CRLF -> LF
        text = text.replace('\r\n', '\n')
        return text

    # ----------------------------
    # Detection
    # ----------------------------
    def detect_mathematical_content(self, text: str) -> bool:
        """
        Cheap heuristic followed by parser attempt if heuristic triggered.
        """
        if not text or not text.strip():
            return False

        if self._heuristic_math_pat.search(text):
            try:
                walker = LatexWalker(text)
                nodes, _, _ = walker.get_latex_nodes(pos=0)
                # Check for MathNodes or Math Environments
                for n in nodes:
                    if isinstance(n, LatexMathNode):
                        return True
                    if isinstance(n, LatexEnvironmentNode) and n.environmentname in self.math_environments:
                        return True
                return True # Heuristic matched, no nodes found, return True just in case
            except Exception:
                # If parsing fails, heuristic matched, so we assume math is present
                return True

        return False

    # ----------------------------
    # Extraction
    # ----------------------------
    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse content and extract math nodes (inline $...$ and environments).
        """
        sanitized = self.sanitize_input(content)
        equations = []

        try:
            walker = LatexWalker(sanitized)
            nodes, _, _ = walker.get_latex_nodes(pos=0)
        except Exception:
            # If parser fails entirely, fallback to regex for standard dollar delimiters
            # Note: Regex won't reliably catch \begin{equation} blocks
            for m in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', sanitized, flags=re.DOTALL):
                equations.append({'type': 'display', 'latex': m.group(1).strip(), 'start_pos': m.start(), 'end_pos': m.end()})
            for m in re.finditer(r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)', sanitized, flags=re.DOTALL):
                equations.append({'type': 'inline', 'latex': m.group(1).strip(), 'start_pos': m.start(), 'end_pos': m.end()})
            return equations

        def walk_nodes(node_list: List[LatexNode]):
            for node in node_list:
                is_math_node = isinstance(node, LatexMathNode)
                is_math_env = False
                
                # Check for environments like equation, align
                if isinstance(node, LatexEnvironmentNode):
                    if node.environmentname in self.math_environments:
                        is_math_env = True

                if is_math_node or is_math_env:
                    latex_snip = node.latex_verbatim()
                    
                    if is_math_env:
                        typ = 'display'
                        # For environments, we usually keep \begin{}...\end{} 
                        # so converters know how to handle alignment.
                        inner_clean = latex_snip.strip()
                    else:
                        # Logic for standard LatexMathNode ($ or $$)
                        delim = getattr(node, 'delimiters', None)
                        displaytype = getattr(node, 'displaytype', None)
                        typ = 'display' if (delim == '$$' or displaytype == 'display') else 'inline'
                        
                        # Strip outer delimiters for cleaner processing, unless it matches \[ \] pattern
                        # Standardizing on raw content is usually safer for converters
                        if latex_snip.startswith('$$') and latex_snip.endswith('$$'):
                            inner_clean = latex_snip[2:-2].strip()
                        elif latex_snip.startswith('$') and latex_snip.endswith('$'):
                            inner_clean = latex_snip[1:-1].strip()
                        elif latex_snip.startswith(r'\(') and latex_snip.endswith(r'\)'):
                            inner_clean = latex_snip[2:-2].strip()
                        elif latex_snip.startswith(r'\[') and latex_snip.endswith(r'\]'):
                            inner_clean = latex_snip[2:-2].strip()
                            typ = 'display'
                        else:
                            inner_clean = latex_snip

                    equations.append({
                        'type': typ,
                        'latex': inner_clean,
                        'start_pos': node.pos,
                        'end_pos': node.pos + node.len if hasattr(node, 'len') else None
                    })
                else:
                    # Recursive search inside other nodes (e.g. bold text containing math)
                    if hasattr(node, 'nodelist') and node.nodelist:
                        walk_nodes(node.nodelist)

        walk_nodes(nodes)
        return equations

    # ----------------------------
    # Validation
    # ----------------------------
    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
        """
        Validate a single latex snippet.
        Handles escaped braces correctly to avoid false negatives.
        """
        if latex_code is None:
            return False, "Empty LaTeX snippet."
        
        if not latex_code.strip():
            return False, "Empty content."

        # 1. Strip escaped characters (like \{, \}, \$) before counting structural delimiters
        clean_code = re.sub(r'\\.', '', latex_code)

        # 2. Check balanced delimiters on cleaned code
        if clean_code.count('{') != clean_code.count('}'):
            return False, "Unbalanced braces: { }"
        if clean_code.count('[') != clean_code.count(']'):
            return False, "Unbalanced brackets: [ ]"

        # 3. Parser Check (on original code)
        try:
            # We wrap it in strict mode check
            walker = LatexWalker(latex_code)
            walker.get_latex_nodes(pos=0)
        except Exception as e:
            return False, f"Parser error: {str(e)}"

        return True, None

    # ----------------------------
    # Conversions
    # ----------------------------
    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
        if not self.enable_mathml:
            return None
        try:
            return latex2mathml_convert(latex_code)
        except Exception:
            return None

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        """
        Enhanced LaTeX -> Unicode mapping.
        Includes fractions, superscripts, subscripts, and symbols.
        """
        out = latex_code

        # 1. Handle simple \frac{num}{den} -> (num/den)
        def _frac_repl(m):
            return f'({m.group(1).strip()}/{m.group(2).strip()})'
        out = re.sub(r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}', _frac_repl, out)

        # 2. Superscripts (^)
        # Handle ^{...}
        out = re.sub(r'\^\{([a-zA-Z0-9+\-=()]+)\}', lambda m: m.group(1).translate(self.sup_map), out)
        # Handle single char ^x
        out = re.sub(r'\^([a-zA-Z0-9])', lambda m: m.group(1).translate(self.sup_map), out)

        # 3. Subscripts (_)
        # Handle _{...}
        out = re.sub(r'_\{([a-zA-Z0-9+\-=()]+)\}', lambda m: m.group(1).translate(self.sub_map), out)
        # Handle single char _x
        out = re.sub(r'_([a-zA-Z0-9])', lambda m: m.group(1).translate(self.sub_map), out)

        # 4. Symbol mapping
        for k, v in self.unicode_map.items():
            out = out.replace(k, v)

        # 5. Cleanup remaining backslashes (simple commands like \text)
        out = re.sub(r'\\([A-Za-z]+)', r'\1', out)
        out = re.sub(r'\s+', ' ', out).strip()
        
        return out

    # ----------------------------
    # Main Pipeline
    # ----------------------------
    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
        cleaned = self.sanitize_input(content)
        equations = self.extract_latex_equations(cleaned)

        enhanced_equations = []
        for eq in equations:
            latex_snip = eq['latex']
            is_valid, error = self.validate_latex(latex_snip)
            
            mathml = None
            if is_valid and convert_mathml and self.enable_mathml:
                mathml = self.convert_latex_to_mathml(latex_snip)
            
            unicode_repr = self.convert_latex_to_unicode(latex_snip)
            
            enhanced_equations.append({
                'type': eq.get('type', 'inline'),
                'latex': latex_snip,
                'valid': is_valid,
                'error': error,
                'mathml': mathml,
                'unicode': unicode_repr,
                'start_pos': eq.get('start_pos'),
                'end_pos': eq.get('end_pos')
            })

        return {
            'cleaned_content': cleaned,
            'equations': enhanced_equations
        }


# ----------------------------
# Example usage
# ----------------------------
if __name__ == "__main__":
    # Demo: run the whole pipeline on a sample that mixes inline math, an
    # escaped dollar sign, escaped braces, display math, an align
    # environment and one deliberately malformed snippet.
    demo_text = r"""
    Here is some text with inline math $E=mc^2$ and escaped dollar \$100.
    
    A set definition with escaped braces (this caused bugs before):
    $S = \{ x \in \mathbb{R} \mid x > 0 \}$

    A display equation:
    $$
    \int_0^\infty x^2 e^{-x} \,dx = 2!
    $$
    
    An aligned environment:
    \begin{align}
      a &= b + c \\
      d &= e + f
    \end{align}

    And a malformed example: $unbalanced { braces $
    """

    processor = OptimizedLaTeXProcessor(enable_mathml=True)
    outcome = processor.process_latex_content(demo_text)

    # Show only the start of the sanitized text.
    print("--- CLEANED CONTENT (snippet) ---")
    print(outcome['cleaned_content'][:100] + "...")

    print("\n--- EQUATIONS FOUND ---")
    for index, entry in enumerate(outcome['equations'], 1):
        status = entry['error'] or 'OK'
        print(f"\n#{index} Type: {entry['type'].upper()}")
        print(f"   Raw: {entry['latex']}")
        print(f"   Valid: {entry['valid']} ({status})")
        print(f"   Unicode: {entry['unicode']}")
        # MathML is absent for invalid snippets or failed conversions.
        if entry['mathml']:
            print(f"   MathML: {entry['mathml'][:60]}...")