"""LaTeX post-processing helpers for LLM (e.g. Gemini) output.

optimized_latex_processor.py

Dependencies:
    pip install pylatexenc latex2mathml
Optional (for more advanced features not used here):
    pip install sympy

Both third-party packages are imported defensively: when one is missing the
module still imports, extraction falls back to a regular expression,
validation skips the parser step and MathML conversion returns ``None``.

Functionality:
    - sanitize Gemini output (strip ```latex``` fences safely)
    - detect math heuristically
    - extract inline/display math nodes using pylatexenc
      (MathNodes + Environments), with a regex fallback
    - validate LaTeX with parser + ordered balanced-delimiter checks
    - convert to MathML (latex2mathml)
    - convert to Unicode with superscript/subscript support
"""

import re
from typing import Any, Dict, List, Optional, Tuple

# pylatexenc imports (optional at import time, see module docstring)
try:
    from pylatexenc.latexwalker import (
        LatexEnvironmentNode,
        LatexMathNode,
        LatexNode,
        LatexWalker,
        LatexWalkerParseError,
    )
except ImportError:
    LatexEnvironmentNode = LatexMathNode = LatexNode = None
    LatexWalker = LatexWalkerParseError = None

try:
    from latex2mathml.converter import convert as latex2mathml_convert
except ImportError:
    latex2mathml_convert = None


# A fenced block, with an optional LaTeX-ish language tag on the opening line.
# Handled in ONE left-to-right pass so that a closing fence can never be
# paired with the opening fence of the next block.
_FENCE_RE = re.compile(
    r'```(?:[ \t]*(?:latex|tex|math|katex)?[ \t]*\n)?(.*?)```',
    re.DOTALL | re.IGNORECASE,
)

# Regex fallback for extraction: $$...$$ (group 1) or $...$ (group 2),
# ignoring escaped dollars such as \$100.
_DOLLAR_MATH_RE = re.compile(
    r'(?<!\\)\$\$(.+?)(?<!\\)\$\$|(?<!\\)\$(?!\$)(.+?)(?<!\\)\$',
    re.DOTALL,
)

# Delimiter pairs tried (longest first) when stripping a math node's wrapper.
_MATH_DELIMITERS = (('$$', '$$'), ('\\[', '\\]'), ('\\(', '\\)'), ('$', '$'))

# A backslash followed by any character (escaped delimiter, \\, \, ...).
_ESCAPED_CHAR_RE = re.compile(r'\\.', re.DOTALL)

_FRAC_RE = re.compile(r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}')
_SUP_GROUP_RE = re.compile(r'(?<!\\)\^\{([a-zA-Z0-9+\-=()]+)\}')
_SUP_CHAR_RE = re.compile(r'(?<!\\)\^([a-zA-Z0-9])')
_SUB_GROUP_RE = re.compile(r'(?<!\\)_\{([a-zA-Z0-9+\-=()]+)\}')
_SUB_CHAR_RE = re.compile(r'(?<!\\)_([a-zA-Z0-9])')

# One control sequence: either a named command (group 1) or a single-character
# control symbol (group 2).  Matching whole tokens is what keeps "\pm" from
# being replaced inside "\pmod".
_CONTROL_SEQ_RE = re.compile(r'\\(?:([A-Za-z]+)|(.))', re.DOTALL)
_WHITESPACE_RE = re.compile(r'\s+')

# Plain-text rendering of control symbols; anything not listed is left as-is.
_CONTROL_SYMBOLS = {
    ',': ' ', ';': ' ', ':': ' ', ' ': ' ',  # explicit spacing
    '!': '',                                  # negative thin space
    '\\': ' ',                                # line break inside environments
    '{': '{', '}': '}', '$': '$', '%': '%', '&': '&', '#': '#', '_': '_',
}


class OptimizedLaTeXProcessor:
    """Sanitize, extract, validate and convert LaTeX math found in text.

    Args:
        enable_mathml: When False, MathML conversion is disabled globally and
            ``convert_latex_to_mathml`` always returns ``None``.
    """

    def __init__(self, enable_mathml: bool = True):
        self.enable_mathml = enable_mathml

        # 1. Basic Symbol Map (keys include the leading backslash)
        self.unicode_map = {
            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
        }

        # 2. Superscript/Subscript Maps
        # 'q' has no widely supported superscript form and maps to itself;
        # _render_script treats such identity mappings as "not convertible".
        self.sup_map = str.maketrans(
            "0123456789+-=()abcdefghijklmnopqrstuvwxyz",
            "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ",
        )
        self.sub_map = str.maketrans(
            "0123456789+-=()aehijklmnoprstuvx",
            "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ",
        )

        # 3. Regex patterns
        # NOTE(review): the text of the patterns and of the environment set
        # below could not be recovered from the damaged source file; these
        # are conservative reconstructions -- confirm against the intended
        # behaviour.
        # A lone unescaped "$" (not part of "$$").  Not used internally.
        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
        # Cheap "does this look like math?" test used by
        # detect_mathematical_content.
        self._heuristic_math_pat = re.compile(
            r'(?<!\\)\$'            # unescaped dollar delimiter
            r'|\\[(\[]'             # \( or \[
            r'|\\begin\s*\{'        # any environment
            r'|\\(?:frac|sqrt|sum|prod|int|lim|infty|alpha|beta|gamma|delta'
            r'|theta|lambda|pi|sigma|omega)(?![A-Za-z])'
        )
        # Environments whose whole body is treated as one display equation.
        self.math_environments = {
            'equation', 'equation*', 'align', 'align*', 'alignat', 'alignat*',
            'gather', 'gather*', 'multline', 'multline*', 'flalign',
            'flalign*', 'eqnarray', 'eqnarray*', 'displaymath', 'math',
        }

    # ----------------------------
    # Sanitization
    # ----------------------------
    def sanitize_input(self, text: str) -> str:
        """Strip Markdown code fences while keeping the LaTeX inside intact.

        Line endings are normalized to LF *before* fences are matched, so a
        CRLF after the opening fence no longer leaves the language tag in the
        output.  Replacement goes through a function, so backslashes and
        escaped dollars in the content are never interpreted.

        Args:
            text: Raw model output; ``None`` or empty yields ``""``.

        Returns:
            The text with fences removed and CRLF converted to LF.
        """
        if not text:
            return ''
        normalized = text.replace('\r\n', '\n')
        return _FENCE_RE.sub(lambda m: m.group(1), normalized)

    # ----------------------------
    # Detection
    # ----------------------------
    def detect_mathematical_content(self, text: str) -> bool:
        """Return True when *text* looks like it contains math.

        Only the heuristic pattern decides.  A parse attempt after a
        heuristic hit could never change the answer (a hit was reported as
        math whether or not parsing found math nodes), so none is made.
        """
        if not text or not text.strip():
            return False
        return self._heuristic_math_pat.search(text) is not None

    # ----------------------------
    # Extraction
    # ----------------------------
    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
        """Sanitize *content* and extract its math snippets.

        Returns:
            A list of dicts in document order with the keys ``type``
            (``'inline'`` or ``'display'``), ``latex`` (the snippet),
            ``start_pos`` and ``end_pos`` (offsets into the sanitized text,
            delimiters included).
        """
        return self._extract_from_sanitized(self.sanitize_input(content))

    def _extract_from_sanitized(self, sanitized: str) -> List[Dict[str, Any]]:
        """Extract math from already-sanitized text (parser first, then regex)."""
        nodes = None
        if LatexWalker is not None:
            try:
                nodes, _, _ = LatexWalker(sanitized).get_latex_nodes(pos=0)
            except Exception:
                # Parser failed entirely: use the regex fallback below.
                nodes = None
        if nodes is None:
            return self._extract_with_regex(sanitized)

        found: List[Dict[str, Any]] = []
        self._collect_math(nodes, sanitized, found)
        return found

    @staticmethod
    def _extract_with_regex(sanitized: str) -> List[Dict[str, Any]]:
        """Fallback extraction of $...$ / $$...$$ spans.

        Note: this will not catch \\begin{equation}-style blocks.
        """
        return [
            {
                'type': 'display' if m.group(1) is not None else 'inline',
                'latex': (m.group(1) if m.group(1) is not None
                          else m.group(2)).strip(),
                'start_pos': m.start(),
                'end_pos': m.end(),
            }
            for m in _DOLLAR_MATH_RE.finditer(sanitized)
        ]

    def _collect_math(self, nodes, source: str,
                      found: List[Dict[str, Any]]) -> None:
        """Append a record to *found* for every math node under *nodes*."""
        for node in nodes or ():
            if node is None:
                continue
            if isinstance(node, LatexMathNode):
                found.append(self._math_node_record(node, source))
            elif (isinstance(node, LatexEnvironmentNode)
                  and node.environmentname in self.math_environments):
                start, end = node.pos, node.pos + node.len
                # NOTE(review): environments are reported as 'display' and
                # keep their \begin/\end wrapper -- confirm with consumers.
                found.append({
                    'type': 'display',
                    'latex': source[start:end].strip(),
                    'start_pos': start,
                    'end_pos': end,
                })
            else:
                # Groups, non-math environments and macro arguments may
                # contain math of their own.
                self._collect_math(self._child_nodes(node), source, found)

    @staticmethod
    def _child_nodes(node) -> list:
        """Return the argument nodes and body nodes of *node*, if any."""
        argd = getattr(node, 'nodeargd', None)
        children = [n for n in (getattr(argd, 'argnlist', None) or [])
                    if n is not None]
        children.extend(getattr(node, 'nodelist', None) or [])
        return children

    @staticmethod
    def _math_node_record(node, source: str) -> Dict[str, Any]:
        """Build the result dict for one pylatexenc math node."""
        start, end = node.pos, node.pos + node.len
        verbatim = source[start:end]
        inner = verbatim
        for opener, closer in _MATH_DELIMITERS:
            if (len(verbatim) >= len(opener) + len(closer)
                    and verbatim.startswith(opener)
                    and verbatim.endswith(closer)):
                inner = verbatim[len(opener):len(verbatim) - len(closer)]
                break
        return {
            'type': getattr(node, 'displaytype', 'inline'),
            'latex': inner.strip(),
            'start_pos': start,
            'end_pos': end,
        }

    # ----------------------------
    # Validation
    # ----------------------------
    def validate_latex(self, latex_code: Optional[str]) -> Tuple[bool, Optional[str]]:
        r"""Validate a single LaTeX snippet.

        Escaped characters (``\{``, ``\}``, ``\$`` ...) are removed before the
        structural checks so they cannot cause false negatives.  Braces must
        be balanced *in order* (``} x {`` is rejected); square brackets only
        need matching counts, so a half-open interval such as ``[0, 1)`` is
        reported as unbalanced.

        Returns:
            ``(True, None)`` when valid, otherwise ``(False, message)``.
        """
        if latex_code is None:
            return False, "Empty LaTeX snippet."
        if not latex_code.strip():
            return False, "Empty content."

        # 1. Strip escaped characters before looking at structural delimiters
        clean_code = _ESCAPED_CHAR_RE.sub('', latex_code)

        # 2. Braces: track nesting depth so a closer before its opener fails
        depth = 0
        for ch in clean_code:
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth < 0:
                    break
        if depth != 0:
            return False, "Unbalanced braces: { }"
        if clean_code.count('[') != clean_code.count(']'):
            return False, "Unbalanced brackets: [ ]"

        # 3. Parser check (on original code), using the walker's default
        #    settings; skipped when pylatexenc is not installed.
        if LatexWalker is not None:
            try:
                LatexWalker(latex_code).get_latex_nodes(pos=0)
            except Exception as e:
                return False, f"Parser error: {e}"
        return True, None

    # ----------------------------
    # Conversions
    # ----------------------------
    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
        """Convert *latex_code* to MathML.

        Returns ``None`` when MathML is disabled, latex2mathml is not
        installed, or the conversion raises.
        """
        if not self.enable_mathml or latex2mathml_convert is None:
            return None
        try:
            return latex2mathml_convert(latex_code)
        except Exception:
            # Best effort: an unconvertible snippet is reported as "no MathML".
            return None

    @staticmethod
    def _render_script(match, table) -> str:
        """Translate a script body, or keep the LaTeX if any char has no form."""
        body = match.group(1)
        rendered = body.translate(table)
        if any(src == dst for src, dst in zip(body, rendered)):
            # Dropping the ^ / _ here would turn x^N into the different
            # expression xN, so leave the original notation untouched.
            return match.group(0)
        return rendered

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        r"""Render a LaTeX snippet as best-effort plain Unicode text.

        Steps: ``\frac{a}{b}`` -> ``(a/b)`` (nested fractions included),
        superscripts/subscripts -> Unicode script characters when every
        character has one, known commands -> symbols, unknown commands ->
        their bare name, spacing/escape control symbols -> plain characters,
        then whitespace is collapsed.
        """
        out = latex_code

        # 1. Fractions, innermost first; repeat until none is left so that
        #    nested fractions are fully reduced.
        while True:
            out, replaced = _FRAC_RE.subn(
                lambda m: f'({m.group(1).strip()}/{m.group(2).strip()})', out)
            if not replaced:
                break

        # 2. Superscripts: ^{...} first, then single-character ^x
        out = _SUP_GROUP_RE.sub(
            lambda m: self._render_script(m, self.sup_map), out)
        out = _SUP_CHAR_RE.sub(
            lambda m: self._render_script(m, self.sup_map), out)

        # 3. Subscripts: _{...} first, then single-character _x
        out = _SUB_GROUP_RE.sub(
            lambda m: self._render_script(m, self.sub_map), out)
        out = _SUB_CHAR_RE.sub(
            lambda m: self._render_script(m, self.sub_map), out)

        # 4. Commands and control symbols, one whole token at a time
        def _control_seq_repl(m):
            name, symbol = m.group(1), m.group(2)
            if name is not None:
                return self.unicode_map.get('\\' + name, name)
            return _CONTROL_SYMBOLS.get(symbol, m.group(0))

        out = _CONTROL_SEQ_RE.sub(_control_seq_repl, out)

        # 5. Collapse whitespace
        return _WHITESPACE_RE.sub(' ', out).strip()

    # ----------------------------
    # Main Pipeline
    # ----------------------------
    def process_latex_content(self, content: str,
                              convert_mathml: bool = True) -> Dict[str, Any]:
        """Run sanitize -> extract -> validate -> convert on *content*.

        Args:
            content: Raw text that may contain LaTeX.
            convert_mathml: Set False to skip MathML for this call.

        Returns:
            ``{'cleaned_content': str, 'equations': [...]}`` where each
            equation has the keys ``type``, ``latex``, ``valid``, ``error``,
            ``mathml``, ``unicode``, ``start_pos`` and ``end_pos``.  Positions
            index into ``cleaned_content``.
        """
        cleaned = self.sanitize_input(content)
        # Extract from the text sanitized above instead of sanitizing again.
        equations = self._extract_from_sanitized(cleaned)
        want_mathml = convert_mathml and self.enable_mathml

        enhanced_equations = []
        for eq in equations:
            latex_snip = eq['latex']
            is_valid, error = self.validate_latex(latex_snip)
            mathml = (self.convert_latex_to_mathml(latex_snip)
                      if is_valid and want_mathml else None)
            enhanced_equations.append({
                'type': eq.get('type', 'inline'),
                'latex': latex_snip,
                'valid': is_valid,
                'error': error,
                'mathml': mathml,
                'unicode': self.convert_latex_to_unicode(latex_snip),
                'start_pos': eq.get('start_pos'),
                'end_pos': eq.get('end_pos'),
            })

        return {
            'cleaned_content': cleaned,
            'equations': enhanced_equations,
        }


# ----------------------------
# Example usage
# ----------------------------
def _demo() -> None:
    """Run the processor on a small sample and print what it found."""
    sample = r"""
Here is some text with inline math $E=mc^2$ and escaped dollar \$100.

A set definition with escaped braces (this caused bugs before):
$S = \{ x \in \mathbb{R} \mid x > 0 \}$

A display equation:
$$ \int_0^\infty x^2 e^{-x} \,dx = 2! $$

An aligned environment:
\begin{align}
a &= b + c \\
d &= e + f
\end{align}

And a malformed example:
$unbalanced { braces $
"""
    proc = OptimizedLaTeXProcessor(enable_mathml=True)
    result = proc.process_latex_content(sample)

    print("--- CLEANED CONTENT (snippet) ---")
    print(result['cleaned_content'][:100] + "...")
    print("\n--- EQUATIONS FOUND ---")
    for i, e in enumerate(result['equations'], 1):
        print(f"\n#{i} Type: {e['type'].upper()}")
        print(f" Raw: {e['latex']}")
        print(f" Valid: {e['valid']} ({e['error'] if e['error'] else 'OK'})")
        print(f" Unicode: {e['unicode']}")
        if e['mathml']:
            print(f" MathML: {e['mathml'][:60]}...")


if __name__ == "__main__":
    _demo()