omgy committed on
Commit
50ff5d1
·
verified ·
1 Parent(s): 1b83d98

Update latex_processor.py

Browse files
Files changed (1) hide show
  1. latex_processor.py +310 -248
latex_processor.py CHANGED
@@ -1,268 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
- from typing import List, Tuple
3
 
4
class LaTeXProcessor:
    """Processor for LaTeX content in documents.

    Bundles a regex-based math detector, a prompt builder for Gemini,
    clean-up of LaTeX returned by the model, equation extraction, a basic
    delimiter validator and a small LaTeX-to-Unicode symbol converter.
    """

    # Regex heuristics that indicate mathematical content; each pattern is
    # searched case-insensitively by detect_mathematical_content().
    MATH_INDICATORS = [
        r'\b(equation|formula|theorem|proof|lemma|corollary)\b',
        r'[∫∑∏√∞≤≥≠±×÷∈∉⊂⊃∪∩∀∃∇∂]',
        r'\d+\s*[+\-*/=]\s*\d+',
        r'\b(sin|cos|tan|log|ln|exp|lim|integral|derivative)\b',
        r'[a-z]\s*=\s*[a-z0-9]',
        r'\^|\d+_\d+',
    ]

    def detect_mathematical_content(self, text: str) -> bool:
        """
        Detect if text contains mathematical/scientific content.

        Args:
            text: Text to analyze

        Returns:
            True if any pattern in MATH_INDICATORS matches, False otherwise
            (including for empty or None input).
        """
        if not text:
            return False
        # re.IGNORECASE already makes the match case-insensitive, so the
        # text does not need to be lower-cased first.
        return any(
            re.search(pattern, text, re.IGNORECASE)
            for pattern in self.MATH_INDICATORS
        )

    def build_enhancement_prompt(
        self,
        content: str,
        user_instructions: str = "",
        doc_type: str = "auto",
        include_latex: bool = False
    ) -> str:
        """
        Build comprehensive enhancement prompt for Gemini.

        This method is kept for backward compatibility but now creates
        a LaTeX-focused prompt.

        Args:
            content: Original document content
            user_instructions: User's specific instructions
            doc_type: Type of document; "academic", "technical" and
                "business" add a dedicated section, any other value adds none
            include_latex: Whether to include LaTeX formatting instructions

        Returns:
            Complete prompt for Gemini, sections joined by newlines
        """
        prompt_parts = [
            "You are an expert document editor specializing in professional and academic writing.",
            "Enhance this document with proper formatting and LaTeX notation where needed.",
            ""
        ]

        # Add LaTeX instructions if needed
        if include_latex:
            prompt_parts.extend([
                "🔬 IMPORTANT: This document contains mathematical or scientific content.",
                "- Format ALL equations using proper LaTeX notation",
                "- Use $...$ for inline equations (e.g., $E = mc^2$)",
                "- Use $$...$$ for display equations on their own lines",
                "- Use proper LaTeX commands: \\frac{}{}, \\sqrt{}, \\int, \\sum, \\alpha, \\beta, etc.",
                "- Convert all mathematical expressions to clean, compilable LaTeX code",
                "- Number important equations as needed",
                "- Ensure all mathematical notation is professional and consistent",
                ""
            ])

        # Add document type specific instructions
        if doc_type == "academic":
            prompt_parts.extend([
                "📚 Document Type: Academic/Research Paper",
                "- Use formal academic tone",
                "- Structure with clear sections (Abstract, Introduction, Methods, Results, Discussion, Conclusion)",
                "- Include proper citations where needed (use [Author, Year] format)",
                "- Ensure technical accuracy",
                ""
            ])
        elif doc_type == "technical":
            prompt_parts.extend([
                "🔧 Document Type: Technical Documentation",
                "- Use clear, precise technical language",
                "- Include code examples in proper formatting if relevant",
                "- Use numbered lists for procedures",
                "- Add technical diagrams descriptions where helpful",
                ""
            ])
        elif doc_type == "business":
            prompt_parts.extend([
                "💼 Document Type: Business Document",
                "- Use professional business tone",
                "- Focus on clarity and conciseness",
                "- Highlight key points and actionable items",
                "- Use bullet points for readability",
                ""
            ])

        # Add user instructions
        if user_instructions:
            prompt_parts.extend([
                "👤 User's Specific Instructions:",
                f"{user_instructions}",
                ""
            ])

        # Add the content
        prompt_parts.extend([
            "📄 Original Document Content:",
            "=" * 60,
            content,
            "=" * 60,
            "",
            "✨ Please provide the ENHANCED version following all guidelines above.",
            "Maintain the document structure but improve quality, clarity, and professionalism.",
            "Convert all math to proper LaTeX notation if applicable.",
            "Return ONLY the enhanced content, no explanations or meta-commentary.",
        ])

        return "\n".join(prompt_parts)

    def process_latex_content(self, content: str) -> str:
        """
        Process and clean LaTeX content from Gemini output.

        Strips Markdown code fences, separates inline ``$...$`` equations
        from adjacent letters/digits with a space, drops blank inline
        equations, puts ``$$`` display delimiters on their own lines and
        collapses runs of three or more newlines.

        Args:
            content: Content potentially containing LaTeX

        Returns:
            Processed content, stripped of leading/trailing whitespace
        """
        # Remove markdown code blocks if Gemini wrapped the output
        content = re.sub(r'```latex\n', '', content)
        content = re.sub(r'```\n?', '', content)

        def _pad_inline(match):
            # A span holding only whitespace is an empty equation: drop it.
            if not match.group(1).strip():
                return ''
            source = match.string
            before = source[match.start() - 1] if match.start() > 0 else ''
            after = source[match.end()] if match.end() < len(source) else ''
            # Pad only on the OUTSIDE of the delimiters, and only against
            # letters/digits, so punctuation such as "$x$." stays attached.
            return (
                (' ' if before.isalnum() else '')
                + match.group(0)
                + (' ' if after.isalnum() else '')
            )

        # Work on whole $...$ spans. Handling each '$' on its own would
        # split '$$' delimiters and glue neighbouring equations together.
        content = re.sub(
            r'(?<![\\$])\$(?!\$)((?:\\.|[^$\\\n])+?)\$(?!\$)',
            _pad_inline,
            content,
        )

        # Ensure display equations are on their own lines
        content = re.sub(r'(\S)\$\$', r'\1\n$$', content)
        content = re.sub(r'\$\$(\S)', r'$$\n\1', content)

        # Clean up excessive newlines
        content = re.sub(r'\n{3,}', '\n\n', content)

        return content.strip()

    def extract_latex_equations(self, content: str) -> List[Tuple[str, str]]:
        """
        Extract LaTeX equations from content.

        Escaped dollars (``\\$``) are not treated as delimiters.

        Args:
            content: Content containing LaTeX

        Returns:
            List of tuples (equation_type, equation_content);
            equation_type is either 'inline' or 'display'. All display
            equations are listed first, followed by the inline ones.
        """
        equations = []

        # Extract display equations ($$...$$); may span several lines
        display_pattern = r'(?<!\\)\$\$(.*?)(?<!\\)\$\$'
        for match in re.finditer(display_pattern, content, re.DOTALL):
            equations.append(('display', match.group(1).strip()))

        # Extract inline equations ($...$); single-line only
        inline_pattern = r'(?<![\\$])\$(?!\$)(.*?)(?<![\\$])\$(?!\$)'
        for match in re.finditer(inline_pattern, content):
            equations.append(('inline', match.group(1).strip()))

        return equations

    def validate_latex(self, latex_code: str) -> Tuple[bool, str]:
        """
        Basic validation of LaTeX code.

        Checks brace nesting, bracket counts and that ``$`` / ``$$``
        markers come in pairs. Backslash-escaped characters such as
        ``\\{`` or ``\\$`` are ignored.

        Args:
            latex_code: LaTeX code to validate

        Returns:
            Tuple of (is_valid, error_message); error_message is "" when
            the code is valid.
        """
        # Escaped characters are literal text, not structural delimiters.
        structural = re.sub(r'\\.', '', latex_code)

        # Check for balanced braces, including order ('}{' is not balanced)
        depth = 0
        for char in structural:
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth < 0:
                    break
        if depth != 0:
            return False, "Unbalanced braces in LaTeX code"

        # Check for balanced brackets
        if structural.count('[') != structural.count(']'):
            return False, "Unbalanced brackets in LaTeX code"

        # Check for balanced dollar signs
        single_dollars = len(re.findall(r'(?<!\$)\$(?!\$)', structural))
        if single_dollars % 2 != 0:
            return False, "Unbalanced inline equation markers ($)"

        double_dollars = len(re.findall(r'\$\$', structural))
        if double_dollars % 2 != 0:
            return False, "Unbalanced display equation markers ($$)"

        # Basic validation passed
        return True, ""

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        """
        Convert simple LaTeX to Unicode for display in DOCX
        (for equations that can be represented in Unicode).

        Only whole command names are replaced, so ``\\pi`` becomes the
        Greek letter but ``\\pmod`` is left untouched.

        Args:
            latex_code: LaTeX code

        Returns:
            Unicode representation where possible; unknown commands are
            returned unchanged.
        """
        # Command name (without the backslash) -> Unicode symbol
        conversions = {
            'alpha': 'α',
            'beta': 'β',
            'gamma': 'γ',
            'delta': 'δ',
            'epsilon': 'ε',
            'theta': 'θ',
            'lambda': 'λ',
            'mu': 'μ',
            'pi': 'π',
            'sigma': 'σ',
            'phi': 'φ',
            'omega': 'ω',
            'infty': '∞',
            'leq': '≤',
            'geq': '≥',
            'neq': '≠',
            'approx': '≈',
            'sum': '∑',
            'prod': '∏',
            'int': '∫',
            'sqrt': '√',
            'pm': '±',
            'times': '×',
            'div': '÷',
        }

        # Match the full command name so a known command is never replaced
        # as the prefix of a longer, unrelated one.
        return re.sub(
            r'\\([A-Za-z]+)',
            lambda m: conversions.get(m.group(1), m.group(0)),
            latex_code,
        )

    def enhance_equations(self, content: str) -> str:
        """
        Enhance mathematical equations in content.

        Thin wrapper that delegates to process_latex_content().

        Args:
            content: Content with equations

        Returns:
            Content with enhanced equations
        """
        return self.process_latex_content(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ optimized_latex_processor.py
3
+
4
+ Dependencies:
5
+ pip install pylatexenc latex2mathml
6
+
7
+ Optional (for more advanced features not used here):
8
+ pip install sympy
9
+
10
+ Functionality:
11
+ - sanitize Gemini output (strip ```latex``` fences safely)
12
+ - detect math heuristically and via parser
13
+ - extract inline/display math nodes using pylatexenc (MathNodes + Environments)
14
+ - validate LaTeX with parser + robust balanced-delimiters checks
15
+ - convert to MathML (latex2mathml)
16
+ - convert to Unicode with superscript/subscript support
17
+ """
18
+
19
  import re
20
+ from typing import List, Tuple, Dict, Any, Optional
21
 
22
+ # pylatexenc imports
23
+ from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
24
+ from latex2mathml.converter import convert as latex2mathml_convert
25
+
26
+
27
class OptimizedLaTeXProcessor:
    """Sanitise, detect, extract, validate and convert LaTeX math.

    Extraction and validation use pylatexenc's ``LatexWalker``; MathML
    output uses ``latex2mathml``; the Unicode rendering is produced with
    regular expressions and translation tables built in ``__init__``.
    """

    def __init__(self, enable_mathml: bool = True):
        """
        Args:
            enable_mathml: When False, convert_latex_to_mathml() always
                returns None and the pipeline skips MathML conversion.
        """
        self.enable_mathml = enable_mathml

        # 1. Basic Symbol Map (full command, backslash included -> Unicode)
        self.unicode_map = {
            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
        }

        # 2. Superscript/Subscript Maps
        # str.maketrans needs both strings to have the same length:
        # 41 characters for superscripts ('q' maps to itself), 32 for
        # subscripts.
        self.sup_map = str.maketrans(
            "0123456789+-=()abcdefghijklmnopqrstuvwxyz",
            "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ",
        )
        self.sub_map = str.maketrans(
            "0123456789+-=()aehijklmnoprstuvx",
            "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ",
        )

        # 3. Regex patterns
        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
        self._heuristic_math_pat = re.compile(
            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
            re.IGNORECASE
        )
        # Math environments to detect
        self.math_environments = {
            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*',
            'split', 'multline', 'flalign'
        }

    # ----------------------------
    # Sanitization
    # ----------------------------
    def sanitize_input(self, text: str) -> str:
        """
        Remove surrounding Markdown fences like ```latex ... ``` or
        ``` ... ``` but preserve the inner LaTeX exactly (escaped dollars
        are not touched).

        Line endings are normalised to LF before the fences are matched so
        that a fence header ending in CRLF is still recognised.

        Args:
            text: Raw model output.

        Returns:
            The text without code fences; "" for empty or None input.
        """
        if not text:
            return ""

        # Normalize CRLF -> LF first; the fence pattern expects '\n'.
        text = text.replace('\r\n', '\n')

        # One pattern covers fenced blocks with or without a latex/tex tag
        # and with or without a newline after the opening fence.
        return re.sub(
            r'```[ \t]*(?:(?:latex|tex)\b)?[ \t]*\n?(.*?)```',
            lambda m: m.group(1),
            text,
            flags=re.DOTALL | re.IGNORECASE,
        )

    # ----------------------------
    # Detection
    # ----------------------------
    def detect_mathematical_content(self, text: str) -> bool:
        """
        Cheap heuristic check for mathematical content.

        Returns True when the heuristic pattern (LaTeX commands, ``$``,
        ``^``, ``_``, common function names, math symbols) matches.
        No parser pass is made: the result depends only on the heuristic.

        Args:
            text: Text to analyse.

        Returns:
            True if the heuristic matches; False for empty/blank text or
            when nothing matches.
        """
        if not text or not text.strip():
            return False
        return self._heuristic_math_pat.search(text) is not None

    # ----------------------------
    # Extraction
    # ----------------------------
    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse content and extract math nodes (inline/display math and the
        environments listed in ``self.math_environments``).

        If the parser raises, a regex fallback extracts ``$$...$$`` and
        ``$...$`` spans only (``\\begin{...}`` blocks are not recognised
        by the fallback).

        Args:
            content: Text that may contain LaTeX; it is sanitised first.

        Returns:
            A list of dicts with keys 'type' ('inline' or 'display'),
            'latex', 'start_pos' and 'end_pos' (offsets into the sanitised
            text), in document order.
        """
        sanitized = self.sanitize_input(content)
        equations: List[Dict[str, Any]] = []

        try:
            walker = LatexWalker(sanitized)
            nodes, _, _ = walker.get_latex_nodes(pos=0)
        except Exception:
            # If parser fails entirely, fallback to regex for standard dollar delimiters
            # Note: Regex won't reliably catch \begin{equation} blocks
            for m in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', sanitized, flags=re.DOTALL):
                equations.append({'type': 'display', 'latex': m.group(1).strip(),
                                  'start_pos': m.start(), 'end_pos': m.end()})
            for m in re.finditer(r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)', sanitized, flags=re.DOTALL):
                equations.append({'type': 'inline', 'latex': m.group(1).strip(),
                                  'start_pos': m.start(), 'end_pos': m.end()})
            # Keep document order, matching the parser path.
            equations.sort(key=lambda eq: eq['start_pos'])
            return equations

        # (opening, closing) delimiter pairs, longest '$' form first.
        delimiter_pairs = (('$$', '$$'), ('$', '$'), (r'\(', r'\)'), (r'\[', r'\]'))

        def walk_nodes(node_list):
            for node in node_list:
                if node is None:
                    # Absent optional macro arguments appear as None.
                    continue

                is_math_env = (
                    isinstance(node, LatexEnvironmentNode)
                    and node.environmentname in self.math_environments
                )

                if isinstance(node, LatexMathNode) or is_math_env:
                    latex_snip = node.latex_verbatim()

                    if is_math_env:
                        typ = 'display'
                        # For environments, keep \begin{}...\end{} so
                        # converters know how to handle alignment.
                        inner_clean = latex_snip.strip()
                    else:
                        delims = getattr(node, 'delimiters', None)
                        # 'delimiters' may be an (open, close) pair or a
                        # plain string; compare the opening delimiter.
                        opening = delims[0] if isinstance(delims, (tuple, list)) and delims else delims
                        displaytype = getattr(node, 'displaytype', None)
                        typ = 'display' if (opening in ('$$', r'\[') or displaytype == 'display') else 'inline'

                        # Strip outer delimiters; raw content is safer
                        # for the converters.
                        inner_clean = latex_snip
                        for open_d, close_d in delimiter_pairs:
                            if (len(latex_snip) >= len(open_d) + len(close_d)
                                    and latex_snip.startswith(open_d)
                                    and latex_snip.endswith(close_d)):
                                inner_clean = latex_snip[len(open_d):-len(close_d)].strip()
                                if open_d in ('$$', r'\['):
                                    typ = 'display'
                                break

                    node_len = getattr(node, 'len', None)
                    equations.append({
                        'type': typ,
                        'latex': inner_clean,
                        'start_pos': node.pos,
                        'end_pos': node.pos + node_len if node_len is not None else None
                    })
                    continue

                # Recurse into macro/environment arguments (e.g. the
                # braces of \textbf{...}), then into child node lists
                # (groups, non-math environments).
                arg_nodes = getattr(getattr(node, 'nodeargd', None), 'argnlist', None)
                if arg_nodes:
                    walk_nodes(arg_nodes)
                children = getattr(node, 'nodelist', None)
                if children:
                    walk_nodes(children)

        walk_nodes(nodes)
        return equations

    # ----------------------------
    # Validation
    # ----------------------------
    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
        """
        Validate a single latex snippet.

        Escaped characters (``\\{``, ``\\}``, ``\\$`` ...) are ignored when
        checking delimiters. Braces must be correctly nested, bracket
        counts must match, and the snippet must pass a non-tolerant
        ``LatexWalker`` parse.

        Args:
            latex_code: The snippet to validate.

        Returns:
            (True, None) when valid, otherwise (False, reason).
        """
        if latex_code is None:
            return False, "Empty LaTeX snippet."

        if not latex_code.strip():
            return False, "Empty content."

        # 1. Strip escaped characters (like \{, \}, \$) before counting structural delimiters
        clean_code = re.sub(r'\\.', '', latex_code)

        # 2. Check balanced delimiters on cleaned code.
        # Braces are checked in order so that '}{' is rejected.
        depth = 0
        for ch in clean_code:
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth < 0:
                    break
        if depth != 0:
            return False, "Unbalanced braces: { }"
        if clean_code.count('[') != clean_code.count(']'):
            return False, "Unbalanced brackets: [ ]"

        # 3. Parser Check (on original code)
        try:
            # tolerant_parsing defaults to True, which hides syntax errors;
            # turn it off so the parser actually reports them.
            walker = LatexWalker(latex_code, tolerant_parsing=False)
            walker.get_latex_nodes(pos=0)
        except Exception as e:
            return False, f"Parser error: {str(e)}"

        return True, None

    # ----------------------------
    # Conversions
    # ----------------------------
    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
        """
        Convert a LaTeX snippet to MathML with latex2mathml.

        Returns:
            The MathML string, or None when MathML is disabled or the
            converter raises.
        """
        if not self.enable_mathml:
            return None
        try:
            return latex2mathml_convert(latex_code)
        except Exception:
            return None

    @staticmethod
    def _translate_script(body: str, table: Dict[int, int], marker: str) -> str:
        """Render a super/subscript body in Unicode, or keep it marked
        with ``marker`` when any character has no Unicode script form."""
        if all(ord(ch) in table for ch in body):
            return body.translate(table)
        return f'{marker}{body}' if len(body) == 1 else f'{marker}({body})'

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        """
        Enhanced LaTeX -> Unicode mapping.

        Handles simple fractions, superscripts, subscripts and the symbols
        in ``self.unicode_map``. Scripts containing a character without a
        Unicode form keep their ``^``/``_`` marker instead of being merged
        into the base text. Commands are matched by full name, so
        ``\\pmod`` is not mistaken for ``\\pm``; unknown commands lose only
        their backslash.

        Args:
            latex_code: LaTeX snippet without math delimiters.

        Returns:
            A whitespace-normalised approximate Unicode rendering;
            "" for empty input.
        """
        if not latex_code:
            return ""
        out = latex_code

        # 1. Handle simple \frac{num}{den} -> (num/den)
        def _frac_repl(m):
            return f'({m.group(1).strip()}/{m.group(2).strip()})'
        out = re.sub(r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}', _frac_repl, out)

        # 2. Superscripts (^): braced form first, then single character
        out = re.sub(r'\^\{([a-zA-Z0-9+\-=()]+)\}',
                     lambda m: self._translate_script(m.group(1), self.sup_map, '^'), out)
        out = re.sub(r'\^([a-zA-Z0-9])',
                     lambda m: self._translate_script(m.group(1), self.sup_map, '^'), out)

        # 3. Subscripts (_): braced form first, then single character
        out = re.sub(r'_\{([a-zA-Z0-9+\-=()]+)\}',
                     lambda m: self._translate_script(m.group(1), self.sub_map, '_'), out)
        out = re.sub(r'_([a-zA-Z0-9])',
                     lambda m: self._translate_script(m.group(1), self.sub_map, '_'), out)

        # 4. Symbol mapping and cleanup in one pass over whole command
        # names: known commands become their symbol, unknown ones (such
        # as \text) lose only the backslash.
        out = re.sub(r'\\([A-Za-z]+)',
                     lambda m: self.unicode_map.get(m.group(0), m.group(1)), out)

        # 5. Normalise whitespace
        out = re.sub(r'\s+', ' ', out).strip()

        return out

    # ----------------------------
    # Main Pipeline
    # ----------------------------
    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
        """
        Run the full pipeline: sanitise, extract, validate and convert.

        Args:
            content: Raw text that may contain LaTeX.
            convert_mathml: Set to False to skip MathML conversion for
                this call.

        Returns:
            A dict with 'cleaned_content' (the sanitised text) and
            'equations', a list of dicts with keys 'type', 'latex',
            'valid', 'error', 'mathml', 'unicode', 'start_pos', 'end_pos'.
            'mathml' is None for invalid equations or when conversion is
            disabled or fails.
        """
        cleaned = self.sanitize_input(content)
        equations = self.extract_latex_equations(cleaned)

        enhanced_equations = []
        for eq in equations:
            latex_snip = eq['latex']
            is_valid, error = self.validate_latex(latex_snip)

            mathml = None
            if is_valid and convert_mathml and self.enable_mathml:
                mathml = self.convert_latex_to_mathml(latex_snip)

            unicode_repr = self.convert_latex_to_unicode(latex_snip)

            enhanced_equations.append({
                'type': eq.get('type', 'inline'),
                'latex': latex_snip,
                'valid': is_valid,
                'error': error,
                'mathml': mathml,
                'unicode': unicode_repr,
                'start_pos': eq.get('start_pos'),
                'end_pos': eq.get('end_pos')
            })

        return {
            'cleaned_content': cleaned,
            'equations': enhanced_equations
        }
291
+
292
+
293
+ # ----------------------------
294
+ # Example usage
295
+ # ----------------------------
296
if __name__ == "__main__":
    # Demo input: inline math, an escaped dollar, escaped braces, a display
    # equation, an align environment and a deliberately malformed snippet.
    demo_text = r"""
Here is some text with inline math $E=mc^2$ and escaped dollar \$100.

A set definition with escaped braces (this caused bugs before):
$S = \{ x \in \mathbb{R} \mid x > 0 \}$

A display equation:
$$
\int_0^\infty x^2 e^{-x} \,dx = 2!
$$

An aligned environment:
\begin{align}
a &= b + c \\
d &= e + f
\end{align}

And a malformed example: $unbalanced { braces $
"""

    processor = OptimizedLaTeXProcessor(enable_mathml=True)
    outcome = processor.process_latex_content(demo_text)

    # Show only the first 100 characters of the sanitised text.
    preview = outcome['cleaned_content'][:100]
    print("--- CLEANED CONTENT (snippet) ---")
    print(preview + "...")

    print("\n--- EQUATIONS FOUND ---")
    for index, equation in enumerate(outcome['equations'], 1):
        status = equation['error'] if equation['error'] else 'OK'
        print(f"\n#{index} Type: {equation['type'].upper()}")
        print(f" Raw: {equation['latex']}")
        print(f" Valid: {equation['valid']} ({status})")
        print(f" Unicode: {equation['unicode']}")
        # MathML is None when conversion was skipped or failed.
        if equation['mathml']:
            print(f" MathML: {equation['mathml'][:60]}...")