# Spaces: omgy / vero_ps (Sleeping)
#
# File size: 13,495 Bytes

"""
optimized_latex_processor.py

Dependencies:
    pip install pylatexenc latex2mathml

Optional (for more advanced features not used here):
    pip install sympy

Functionality:
 - sanitize Gemini output (strip ```latex``` fences safely)
 - detect math heuristically and via parser
 - extract inline/display math nodes using pylatexenc (MathNodes + Environments)
 - validate LaTeX with parser + robust balanced-delimiters checks
 - convert to MathML (latex2mathml)
 - convert to Unicode with superscript/subscript support
"""

import re
from typing import List, Tuple, Dict, Any, Optional

# pylatexenc imports
from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
from latex2mathml.converter import convert as latex2mathml_convert


class OptimizedLaTeXProcessor:
    def __init__(self, enable_mathml: bool = True):
        self.enable_mathml = enable_mathml
        
        # 1. Basic Symbol Map
        self.unicode_map = {
            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
        }

        # 2. Superscript/Subscript Maps
        self.sup_map = str.maketrans("0123456789+-=()abcdefghijklmnopqrstuvwxyz", 
                                     "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ")
        self.sub_map = str.maketrans("0123456789+-=()aehijklmnoprstuvx", 
                                     "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ")

        # 3. Regex patterns
        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
        self._heuristic_math_pat = re.compile(
            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
            re.IGNORECASE
        )
        # Math environments to detect
        self.math_environments = {
            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*', 
            'split', 'multline', 'flalign'
        }

    # ----------------------------
    # Sanitization
    # ----------------------------
    def sanitize_input(self, text: str) -> str:
        """
        Remove surrounding Markdown fences like ```latex``` or ``` that contain LaTeX,
        but preserve the inner LaTeX exactly (do not mangle escaped dollars).
        """
        def _fence_repl(m):
            return m.group(1)

        # Remove code fences with optional language specifier
        text = re.sub(r'```(?:latex)?\n(.*?)```', _fence_repl, text, flags=re.DOTALL | re.IGNORECASE)
        # Handle triple-backtick blocks without newline start
        text = re.sub(r'```(.*?)```', _fence_repl, text, flags=re.DOTALL)
        # Normalize CRLF -> LF
        text = text.replace('\r\n', '\n')
        return text

    # ----------------------------
    # Detection
    # ----------------------------
    def detect_mathematical_content(self, text: str) -> bool:
        """
        Cheap heuristic followed by parser attempt if heuristic triggered.
        """
        if not text or not text.strip():
            return False

        if self._heuristic_math_pat.search(text):
            try:
                walker = LatexWalker(text)
                nodes, _, _ = walker.get_latex_nodes(pos=0)
                # Check for MathNodes or Math Environments
                for n in nodes:
                    if isinstance(n, LatexMathNode):
                        return True
                    if isinstance(n, LatexEnvironmentNode) and n.environmentname in self.math_environments:
                        return True
                return True # Heuristic matched, no nodes found, return True just in case
            except Exception:
                # If parsing fails, heuristic matched, so we assume math is present
                return True

        return False

    # ----------------------------
    # Extraction
    # ----------------------------
    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse content and extract math nodes (inline $...$ and environments).
        """
        sanitized = self.sanitize_input(content)
        equations = []

        try:
            walker = LatexWalker(sanitized)
            nodes, _, _ = walker.get_latex_nodes(pos=0)
        except Exception:
            # If parser fails entirely, fallback to regex for standard dollar delimiters
            # Note: Regex won't reliably catch \begin{equation} blocks
            for m in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', sanitized, flags=re.DOTALL):
                equations.append({'type': 'display', 'latex': m.group(1).strip(), 'start_pos': m.start(), 'end_pos': m.end()})
            for m in re.finditer(r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)', sanitized, flags=re.DOTALL):
                equations.append({'type': 'inline', 'latex': m.group(1).strip(), 'start_pos': m.start(), 'end_pos': m.end()})
            return equations

        def walk_nodes(node_list: List[LatexNode]):
            for node in node_list:
                is_math_node = isinstance(node, LatexMathNode)
                is_math_env = False
                
                # Check for environments like equation, align
                if isinstance(node, LatexEnvironmentNode):
                    if node.environmentname in self.math_environments:
                        is_math_env = True

                if is_math_node or is_math_env:
                    latex_snip = node.latex_verbatim()
                    
                    if is_math_env:
                        typ = 'display'
                        # For environments, we usually keep \begin{}...\end{} 
                        # so converters know how to handle alignment.
                        inner_clean = latex_snip.strip()
                    else:
                        # Logic for standard LatexMathNode ($ or $$)
                        delim = getattr(node, 'delimiters', None)
                        displaytype = getattr(node, 'displaytype', None)
                        typ = 'display' if (delim == '$$' or displaytype == 'display') else 'inline'
                        
                        # Strip outer delimiters for cleaner processing, unless it matches \[ \] pattern
                        # Standardizing on raw content is usually safer for converters
                        if latex_snip.startswith('$$') and latex_snip.endswith('$$'):
                            inner_clean = latex_snip[2:-2].strip()
                        elif latex_snip.startswith('$') and latex_snip.endswith('$'):
                            inner_clean = latex_snip[1:-1].strip()
                        elif latex_snip.startswith(r'\(') and latex_snip.endswith(r'\)'):
                            inner_clean = latex_snip[2:-2].strip()
                        elif latex_snip.startswith(r'\[') and latex_snip.endswith(r'\]'):
                            inner_clean = latex_snip[2:-2].strip()
                            typ = 'display'
                        else:
                            inner_clean = latex_snip

                    equations.append({
                        'type': typ,
                        'latex': inner_clean,
                        'start_pos': node.pos,
                        'end_pos': node.pos + node.len if hasattr(node, 'len') else None
                    })
                else:
                    # Recursive search inside other nodes (e.g. bold text containing math)
                    if hasattr(node, 'nodelist') and node.nodelist:
                        walk_nodes(node.nodelist)

        walk_nodes(nodes)
        return equations

    # ----------------------------
    # Validation
    # ----------------------------
    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
        """
        Validate a single latex snippet.
        Handles escaped braces correctly to avoid false negatives.
        """
        if latex_code is None:
            return False, "Empty LaTeX snippet."
        
        if not latex_code.strip():
            return False, "Empty content."

        # 1. Strip escaped characters (like \{, \}, \$) before counting structural delimiters
        clean_code = re.sub(r'\\.', '', latex_code)

        # 2. Check balanced delimiters on cleaned code
        if clean_code.count('{') != clean_code.count('}'):
            return False, "Unbalanced braces: { }"
        if clean_code.count('[') != clean_code.count(']'):
            return False, "Unbalanced brackets: [ ]"

        # 3. Parser Check (on original code)
        try:
            # We wrap it in strict mode check
            walker = LatexWalker(latex_code)
            walker.get_latex_nodes(pos=0)
        except Exception as e:
            return False, f"Parser error: {str(e)}"

        return True, None

    # ----------------------------
    # Conversions
    # ----------------------------
    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
        if not self.enable_mathml:
            return None
        try:
            return latex2mathml_convert(latex_code)
        except Exception:
            return None

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        """
        Enhanced LaTeX -> Unicode mapping.
        Includes fractions, superscripts, subscripts, and symbols.
        """
        out = latex_code

        # 1. Handle simple \frac{num}{den} -> (num/den)
        def _frac_repl(m):
            return f'({m.group(1).strip()}/{m.group(2).strip()})'
        out = re.sub(r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}', _frac_repl, out)

        # 2. Superscripts (^)
        # Handle ^{...}
        out = re.sub(r'\^\{([a-zA-Z0-9+\-=()]+)\}', lambda m: m.group(1).translate(self.sup_map), out)
        # Handle single char ^x
        out = re.sub(r'\^([a-zA-Z0-9])', lambda m: m.group(1).translate(self.sup_map), out)

        # 3. Subscripts (_)
        # Handle _{...}
        out = re.sub(r'_\{([a-zA-Z0-9+\-=()]+)\}', lambda m: m.group(1).translate(self.sub_map), out)
        # Handle single char _x
        out = re.sub(r'_([a-zA-Z0-9])', lambda m: m.group(1).translate(self.sub_map), out)

        # 4. Symbol mapping
        for k, v in self.unicode_map.items():
            out = out.replace(k, v)

        # 5. Cleanup remaining backslashes (simple commands like \text)
        out = re.sub(r'\\([A-Za-z]+)', r'\1', out)
        out = re.sub(r'\s+', ' ', out).strip()
        
        return out

    # ----------------------------
    # Main Pipeline
    # ----------------------------
    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
        cleaned = self.sanitize_input(content)
        equations = self.extract_latex_equations(cleaned)

        enhanced_equations = []
        for eq in equations:
            latex_snip = eq['latex']
            is_valid, error = self.validate_latex(latex_snip)
            
            mathml = None
            if is_valid and convert_mathml and self.enable_mathml:
                mathml = self.convert_latex_to_mathml(latex_snip)
            
            unicode_repr = self.convert_latex_to_unicode(latex_snip)
            
            enhanced_equations.append({
                'type': eq.get('type', 'inline'),
                'latex': latex_snip,
                'valid': is_valid,
                'error': error,
                'mathml': mathml,
                'unicode': unicode_repr,
                'start_pos': eq.get('start_pos'),
                'end_pos': eq.get('end_pos')
            })

        return {
            'cleaned_content': cleaned,
            'equations': enhanced_equations
        }


# ----------------------------
# Example usage
# ----------------------------
if __name__ == "__main__":
    # Demo input: inline math, an escaped dollar, escaped braces, a display
    # block, an align environment and a deliberately malformed snippet.
    sample = r"""
    Here is some text with inline math $E=mc^2$ and escaped dollar \$100.
    
    A set definition with escaped braces (this caused bugs before):
    $S = \{ x \in \mathbb{R} \mid x > 0 \}$

    A display equation:
    $$
    \int_0^\infty x^2 e^{-x} \,dx = 2!
    $$
    
    An aligned environment:
    \begin{align}
      a &= b + c \\
      d &= e + f
    \end{align}

    And a malformed example: $unbalanced { braces $
    """

    processor = OptimizedLaTeXProcessor(enable_mathml=True)
    report = processor.process_latex_content(sample)

    # Show only the first 100 characters of the sanitized text.
    print("--- CLEANED CONTENT (snippet) ---")
    preview = report['cleaned_content'][:100]
    print(preview + "...")

    # One summary per extracted equation, numbered from 1.
    print("\n--- EQUATIONS FOUND ---")
    found = report['equations']
    for i, e in enumerate(found, start=1):
        print(f"\n#{i} Type: {e['type'].upper()}")
        print(f"   Raw: {e['latex']}")
        print(f"   Valid: {e['valid']} ({e['error'] if e['error'] else 'OK'})")
        print(f"   Unicode: {e['unicode']}")
        # MathML is absent for invalid snippets or when conversion failed.
        if e['mathml']:
            print(f"   MathML: {e['mathml'][:60]}...")