Spaces:

omgy
/

vero_ps

Sleeping

App Files Files Community

omgy committed on Dec 3, 2025

Commit

50ff5d1

verified ·

1 Parent(s): 1b83d98

Update latex_processor.py

Browse files

Files changed (1) hide show

latex_processor.py +310 -248

latex_processor.py CHANGED Viewed

@@ -1,268 +1,330 @@
 import re
-from typing import List, Tuple
-class LaTeXProcessor:
-    """Processor for LaTeX content in documents"""
-    # Common mathematical terms and symbols that indicate math content
-    MATH_INDICATORS = [
-        r'\b(equation|formula|theorem|proof|lemma|corollary)\b',
-        r'[∫∑∏√∞≤≥≠±×÷∈∉⊂⊃∪∩∀∃∇∂]',
-        r'\d+\s*[+\-*/=]\s*\d+',
-        r'\b(sin|cos|tan|log|ln|exp|lim|integral|derivative)\b',
-        r'[a-z]\s*=\s*[a-z0-9]',
-        r'\^|\d+_\d+',
-    ]
-    def detect_mathematical_content(self, text: str) -> bool:
-        """
-        Detect if text contains mathematical/scientific content
-        Args:
-            text: Text to analyze
-        Returns:
-            True if mathematical content is detected
-        """
-        text_lower = text.lower()
-        for pattern in self.MATH_INDICATORS:
-            if re.search(pattern, text_lower, re.IGNORECASE):
-                return True
-        return False
-    def build_enhancement_prompt(
-        self,
-        content: str,
-        user_instructions: str = "",
-        doc_type: str = "auto",
-        include_latex: bool = False
-    ) -> str:
         """
-        Build comprehensive enhancement prompt for Gemini
-        This method is kept for backward compatibility but now creates
-        a LaTeX-focused prompt
-        Args:
-            content: Original document content
-            user_instructions: User's specific instructions
-            doc_type: Type of document (auto, academic, technical, business, etc.)
-            include_latex: Whether to include LaTeX formatting
-        Returns:
-            Complete prompt for Gemini
         """
-        prompt_parts = [
-            "You are an expert document editor specializing in professional and academic writing.",
-            "Enhance this document with proper formatting and LaTeX notation where needed.",
-            ""
-        ]
-        # Add LaTeX instructions if needed
-        if include_latex:
-            prompt_parts.extend([
-                "🔬 IMPORTANT: This document contains mathematical or scientific content.",
-                "- Format ALL equations using proper LaTeX notation",
-                "- Use $...$ for inline equations (e.g., $E = mc^2$)",
-                "- Use $$...$$ for display equations on their own lines",
-                "- Use proper LaTeX commands: \\frac{}{}, \\sqrt{}, \\int, \\sum, \\alpha, \\beta, etc.",
-                "- Convert all mathematical expressions to clean, compilable LaTeX code",
-                "- Number important equations as needed",
-                "- Ensure all mathematical notation is professional and consistent",
-                ""
-            ])
-        # Add document type specific instructions
-        if doc_type == "academic":
-            prompt_parts.extend([
-                "📚 Document Type: Academic/Research Paper",
-                "- Use formal academic tone",
-                "- Structure with clear sections (Abstract, Introduction, Methods, Results, Discussion, Conclusion)",
-                "- Include proper citations where needed (use [Author, Year] format)",
-                "- Ensure technical accuracy",
-                ""
-            ])
-        elif doc_type == "technical":
-            prompt_parts.extend([
-                "🔧 Document Type: Technical Documentation",
-                "- Use clear, precise technical language",
-                "- Include code examples in proper formatting if relevant",
-                "- Use numbered lists for procedures",
-                "- Add technical diagrams descriptions where helpful",
-                ""
-            ])
-        elif doc_type == "business":
-            prompt_parts.extend([
-                "💼 Document Type: Business Document",
-                "- Use professional business tone",
-                "- Focus on clarity and conciseness",
-                "- Highlight key points and actionable items",
-                "- Use bullet points for readability",
-                ""
-            ])
-        # Add user instructions
-        if user_instructions:
-            prompt_parts.extend([
-                f"👤 User's Specific Instructions:",
-                f"{user_instructions}",
-                ""
-            ])
-        # Add the content
-        prompt_parts.extend([
-            "📄 Original Document Content:",
-            "=" * 60,
-            content,
-            "=" * 60,
-            "",
-            "✨ Please provide the ENHANCED version following all guidelines above.",
-            "Maintain the document structure but improve quality, clarity, and professionalism.",
-            "Convert all math to proper LaTeX notation if applicable.",
-            "Return ONLY the enhanced content, no explanations or meta-commentary.",
-        ])
-        return "\n".join(prompt_parts)
-    def process_latex_content(self, content: str) -> str:
         """
-        Process and clean LaTeX content from Gemini output
-        Args:
-            content: Content potentially containing LaTeX
-        Returns:
-            Processed content with valid LaTeX
         """
-        # Remove markdown code blocks if Gemini wrapped the output
-        content = re.sub(r'```latex\n', '', content)
-        content = re.sub(r'```\n?', '', content)
-        # Ensure proper spacing around inline equations
-        content = re.sub(r'(\S)\$', r'\1 $', content)
-        content = re.sub(r'\$(\S)', r'$ \1', content)
-        # Ensure display equations are on their own lines
-        content = re.sub(r'(\S)\$\$', r'\1\n$$', content)
-        content = re.sub(r'\$\$(\S)', r'$$\n\1', content)
-        # Clean up excessive newlines
-        content = re.sub(r'\n{3,}', '\n\n', content)
-        # Fix common LaTeX spacing issues
-        content = re.sub(r'\$\s+\$', '$$', content)  # Remove empty equations
-        return content.strip()
-    def extract_latex_equations(self, content: str) -> List[Tuple[str, str]]:
         """
-        Extract LaTeX equations from content
-        Args:
-            content: Content containing LaTeX
-        Returns:
-            List of tuples (equation_type, equation_content)
-            equation_type is either 'inline' or 'display'
         """
         equations = []
-        # Extract display equations ($$...$$)
-        display_pattern = r'\$\$(.*?)\$\$'
-        for match in re.finditer(display_pattern, content, re.DOTALL):
-            equations.append(('display', match.group(1).strip()))
-        # Extract inline equations ($...$)
-        inline_pattern = r'(?<!\$)\$(?!\$)(.*?)(?<!\$)\$(?!\$)'
-        for match in re.finditer(inline_pattern, content):
-            equations.append(('inline', match.group(1).strip()))
         return equations
-    def validate_latex(self, latex_code: str) -> Tuple[bool, str]:
         """
-        Basic validation of LaTeX code
-        Args:
-            latex_code: LaTeX code to validate
-        Returns:
-            Tuple of (is_valid, error_message)
         """
-        # Check for balanced braces
-        if latex_code.count('{') != latex_code.count('}'):
-            return False, "Unbalanced braces in LaTeX code"
-        # Check for balanced brackets
-        if latex_code.count('[') != latex_code.count(']'):
-            return False, "Unbalanced brackets in LaTeX code"
-        # Check for balanced dollar signs
-        single_dollars = len(re.findall(r'(?<!\$)\$(?!\$)', latex_code))
-        if single_dollars % 2 != 0:
-            return False, "Unbalanced inline equation markers ($)"
-        double_dollars = len(re.findall(r'\$\$', latex_code))
-        if double_dollars % 2 != 0:
-            return False, "Unbalanced display equation markers ($$)"
-        # Basic validation passed
-        return True, ""
     def convert_latex_to_unicode(self, latex_code: str) -> str:
         """
-        Convert simple LaTeX to Unicode for display in DOCX
-        (For equations that can be represented in Unicode)
-        Args:
-            latex_code: LaTeX code
-        Returns:
-            Unicode representation where possible
-        """
-        # Simple conversions for common symbols
-        conversions = {
-            r'\\alpha': 'α',
-            r'\\beta': 'β',
-            r'\\gamma': 'γ',
-            r'\\delta': 'δ',
-            r'\\epsilon': 'ε',
-            r'\\theta': 'θ',
-            r'\\lambda': 'λ',
-            r'\\mu': 'μ',
-            r'\\pi': 'π',
-            r'\\sigma': 'σ',
-            r'\\phi': 'φ',
-            r'\\omega': 'ω',
-            r'\\infty': '∞',
-            r'\\leq': '≤',
-            r'\\geq': '≥',
-            r'\\neq': '≠',
-            r'\\approx': '≈',
-            r'\\sum': '∑',
-            r'\\prod': '∏',
-            r'\\int': '∫',
-            r'\\sqrt': '√',
-            r'\\pm': '±',
-            r'\\times': '×',
-            r'\\div': '÷',
         }
-        result = latex_code
-        for latex, unicode_char in conversions.items():
-            result = result.replace(latex, unicode_char)
-        return result
-    def enhance_equations(self, content: str) -> str:
-        """
-        Enhance mathematical equations in content
-        Args:
-            content: Content with equations
-        Returns:
-            Content with enhanced equations
-        """
-        return self.process_latex_content(content)

+"""
+latex_processor.py
+Dependencies:
+    pip install pylatexenc latex2mathml
+Optional (for more advanced features not used here):
+    pip install sympy
+Functionality:
+ - sanitize Gemini output (strip ```latex``` fences safely)
+ - detect math heuristically and via parser
+ - extract inline/display math nodes using pylatexenc (MathNodes + Environments)
+ - validate LaTeX with parser + robust balanced-delimiters checks
+ - convert to MathML (latex2mathml)
+ - convert to Unicode with superscript/subscript support
+"""
 import re
+from typing import List, Tuple, Dict, Any, Optional
+# pylatexenc imports
+from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
+from latex2mathml.converter import convert as latex2mathml_convert
+class OptimizedLaTeXProcessor:
+    def __init__(self, enable_mathml: bool = True) -> None:
+        """
+        Build the lookup tables and compiled patterns used by the processor.
+
+        Args:
+            enable_mathml: When False, convert_latex_to_mathml returns None
+                without attempting a conversion.
+        """
+        self.enable_mathml = enable_mathml
+        # 1. Basic Symbol Map
+        # LaTeX command -> single Unicode character, consumed by
+        # convert_latex_to_unicode. Keys are raw strings with ONE backslash.
+        self.unicode_map = {
+            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
+            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
+            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
+            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
+            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
+            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
+            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
+        }
+        # 2. Superscript/Subscript Maps
+        # str.translate tables. Only lowercase letters and digits are covered,
+        # and 'q' maps to itself in the superscript table (no glyph is supplied).
+        self.sup_map = str.maketrans("0123456789+-=()abcdefghijklmnopqrstuvwxyz",
+                                     "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ")
+        # The subscript table covers only the letters listed in its first argument.
+        self.sub_map = str.maketrans("0123456789+-=()aehijklmnoprstuvx",
+                                     "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ")
+        # 3. Regex patterns
+        # Matches a lone '$' that is neither backslash-escaped nor part of '$$'.
+        # NOTE(review): not referenced by any method visible in this class - confirm
+        # whether it is still needed.
+        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
+        # Cheap pre-filter for detect_mathematical_content. Any '$', '^' or '_'
+        # matches, so plain text such as a price or a snake_case name also triggers it.
+        self._heuristic_math_pat = re.compile(
+            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
+            re.IGNORECASE
+        )
+        # Math environments to detect
+        # Environment names treated as display math during detection/extraction.
+        self.math_environments = {
+            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*',
+            'split', 'multline', 'flalign'
+        }
+    # ----------------------------
+    # Sanitization
+    # ----------------------------
+    def sanitize_input(self, text: str) -> str:
         """
+        Remove surrounding Markdown fences like ```latex``` or ``` that contain LaTeX,
+        but preserve the inner LaTeX exactly (do not mangle escaped dollars).
+
+        Line endings are normalized to LF before the fences are stripped, so an
+        opening fence terminated by CRLF is still recognized and its language
+        tag does not leak into the result.
+
+        Args:
+            text: Raw text, possibly wrapped in Markdown code fences.
+
+        Returns:
+            The text with fence markers removed and CRLF converted to LF.
         """
+        def _fence_repl(m):
+            # Keep only what was between the fence markers.
+            return m.group(1)
+        # Normalize CRLF -> LF first: the fence pattern below anchors on "\n"
+        # right after the optional language tag.
+        text = text.replace('\r\n', '\n')
+        # Remove code fences with optional language specifier
+        text = re.sub(r'```(?:latex)?\n(.*?)```', _fence_repl, text, flags=re.DOTALL | re.IGNORECASE)
+        # Handle triple-backtick blocks without newline start
+        text = re.sub(r'```(.*?)```', _fence_repl, text, flags=re.DOTALL)
+        return text
+    # ----------------------------
+    # Detection
+    # ----------------------------
+    def detect_mathematical_content(self, text: str) -> bool:
         """
+        Report whether ``text`` looks like it contains mathematics.
+
+        Blank input, or input the heuristic pattern does not match, yields
+        False. Once the heuristic matches, the text is also parsed with
+        LatexWalker, but every outcome of that parse (a math node found, none
+        found, or the parser raising) yields True.
         """
+        if not text or not text.strip():
+            return False
+        if self._heuristic_math_pat.search(text) is None:
+            return False
+        try:
+            parsed_nodes, _, _ = LatexWalker(text).get_latex_nodes(pos=0)
+            # Look for math nodes or known math environments at the top level.
+            for candidate in parsed_nodes:
+                if isinstance(candidate, LatexMathNode):
+                    return True
+                in_math_env = (
+                    isinstance(candidate, LatexEnvironmentNode)
+                    and candidate.environmentname in self.math_environments
+                )
+                if in_math_env:
+                    return True
+        except Exception:
+            # Parsing failed, but the heuristic matched: assume math is present.
+            return True
+        # Heuristic matched although the parser found no math node.
+        return True
+    # ----------------------------
+    # Extraction
+    # ----------------------------
+    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
         """
+        Parse content and extract math nodes (inline $...$ and environments).
+
+        The content is first passed through sanitize_input, so reported
+        positions refer to the sanitized text, not to the raw argument.
+
+        Args:
+            content: Text that may contain LaTeX math.
+
+        Returns:
+            A list of dicts with the keys 'type' ('inline' or 'display'),
+            'latex', 'start_pos' and 'end_pos'. If the parser raises, a regex
+            fallback is used instead; it lists all display matches before all
+            inline matches.
         """
+        sanitized = self.sanitize_input(content)
         equations = []
+        try:
+            walker = LatexWalker(sanitized)
+            nodes, _, _ = walker.get_latex_nodes(pos=0)
+        except Exception:
+            # If parser fails entirely, fallback to regex for standard dollar delimiters
+            # Note: Regex won't reliably catch \begin{equation} blocks
+            for m in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', sanitized, flags=re.DOTALL):
+                equations.append({'type': 'display', 'latex': m.group(1).strip(), 'start_pos': m.start(), 'end_pos': m.end()})
+            for m in re.finditer(r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)', sanitized, flags=re.DOTALL):
+                equations.append({'type': 'inline', 'latex': m.group(1).strip(), 'start_pos': m.start(), 'end_pos': m.end()})
+            return equations
+        # Depth-first walk that appends every math node / math environment to
+        # the enclosing `equations` list; it does not descend into a math node.
+        def walk_nodes(node_list: List[LatexNode]) -> None:
+            for node in node_list:
+                is_math_node = isinstance(node, LatexMathNode)
+                is_math_env = False
+                # Check for environments like equation, align
+                if isinstance(node, LatexEnvironmentNode):
+                    if node.environmentname in self.math_environments:
+                        is_math_env = True
+                if is_math_node or is_math_env:
+                    latex_snip = node.latex_verbatim()
+                    if is_math_env:
+                        typ = 'display'
+                        # For environments, we usually keep \begin{}...\end{}
+                        # so converters know how to handle alignment.
+                        inner_clean = latex_snip.strip()
+                    else:
+                        # Logic for standard LatexMathNode ($ or $$)
+                        # NOTE(review): in pylatexenc 2.x `delimiters` appears to be an
+                        # (open, close) tuple, so `delim == '$$'` may never be true and
+                        # `displaytype` would carry the decision alone - confirm.
+                        delim = getattr(node, 'delimiters', None)
+                        displaytype = getattr(node, 'displaytype', None)
+                        typ = 'display' if (delim == '$$' or displaytype == 'display') else 'inline'
+                        # Strip outer delimiters for cleaner processing, unless it matches \[ \] pattern
+                        # Standardizing on raw content is usually safer for converters
+                        if latex_snip.startswith('$$') and latex_snip.endswith('$$'):
+                            inner_clean = latex_snip[2:-2].strip()
+                        elif latex_snip.startswith('$') and latex_snip.endswith('$'):
+                            inner_clean = latex_snip[1:-1].strip()
+                        elif latex_snip.startswith(r'\(') and latex_snip.endswith(r'\)'):
+                            inner_clean = latex_snip[2:-2].strip()
+                        elif latex_snip.startswith(r'\[') and latex_snip.endswith(r'\]'):
+                            inner_clean = latex_snip[2:-2].strip()
+                            typ = 'display'
+                        else:
+                            inner_clean = latex_snip
+                    equations.append({
+                        'type': typ,
+                        'latex': inner_clean,
+                        'start_pos': node.pos,
+                        # end_pos is None when the node exposes no `len` attribute.
+                        'end_pos': node.pos + node.len if hasattr(node, 'len') else None
+                    })
+                else:
+                    # Recursive search inside other nodes (e.g. bold text containing math)
+                    # NOTE(review): only nodes exposing `nodelist` are descended into;
+                    # macro arguments (e.g. \textbf{...}) may be stored elsewhere by
+                    # pylatexenc, in which case math inside them is skipped - confirm.
+                    if hasattr(node, 'nodelist') and node.nodelist:
+                        walk_nodes(node.nodelist)
+        walk_nodes(nodes)
         return equations
+    # ----------------------------
+    # Validation
+    # ----------------------------
+    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
         """
+        Validate a single latex snippet.
+        Handles escaped braces correctly to avoid false negatives.
+
+        Checks run in this order: non-empty input, brace nesting (a closing
+        brace may never appear before its opener), square-bracket counts, and
+        finally a non-tolerant pylatexenc parse.
+
+        Args:
+            latex_code: The LaTeX snippet to check; None is reported as invalid.
+
+        Returns:
+            (True, None) when every check passes, otherwise (False, message).
         """
+        if latex_code is None:
+            return False, "Empty LaTeX snippet."
+        if not latex_code.strip():
+            return False, "Empty content."
+        # 1. Strip escaped characters (like \{, \}, \$) before counting structural delimiters
+        clean_code = re.sub(r'\\.', '', latex_code)
+        # 2. Check brace nesting on cleaned code. Tracking depth (rather than
+        #    comparing counts) also rejects wrongly ordered input such as "}x{".
+        depth = 0
+        for ch in clean_code:
+            if ch == '{':
+                depth += 1
+            elif ch == '}':
+                depth -= 1
+                if depth < 0:
+                    return False, "Unbalanced braces: { }"
+        if depth != 0:
+            return False, "Unbalanced braces: { }"
+        if clean_code.count('[') != clean_code.count(']'):
+            return False, "Unbalanced brackets: [ ]"
+        # 3. Parser Check (on original code)
+        try:
+            # LatexWalker recovers from syntax errors by default; turn tolerant
+            # parsing off so that malformed input actually raises here.
+            walker = LatexWalker(latex_code, tolerant_parsing=False)
+            walker.get_latex_nodes(pos=0)
+        except Exception as e:
+            return False, f"Parser error: {e}"
+        return True, None
+    # ----------------------------
+    # Conversions
+    # ----------------------------
+    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
+        # Returns the MathML produced by latex2mathml for ``latex_code``, or
+        # None when MathML output is disabled or the converter raises.
+        mathml: Optional[str] = None
+        if self.enable_mathml:
+            try:
+                mathml = latex2mathml_convert(latex_code)
+            except Exception:
+                mathml = None
+        return mathml
     def convert_latex_to_unicode(self, latex_code: str) -> str:
         """
+        Enhanced LaTeX -> Unicode mapping.
+        Includes fractions, superscripts, subscripts, and symbols.
+
+        A superscript or subscript is converted only when every character in
+        it has a dedicated glyph in the translation table; otherwise the
+        original caret/underscore notation is kept, so no exponent or index is
+        silently merged into the base. Symbol commands are replaced only as
+        whole commands, never as a prefix of a longer command name.
+
+        Args:
+            latex_code: LaTeX math source without its outer delimiters.
+
+        Returns:
+            A best-effort plain-text rendering with collapsed whitespace.
+        """
+        def _script_repl(table):
+            # Build a re.sub callback for one translation table.
+            def _repl(m):
+                body = m.group(1)
+                # A character counts as mappable only if the table sends it to a
+                # different code point (e.g. 'q' maps to itself in sup_map).
+                if all(table.get(ord(ch), ord(ch)) != ord(ch) for ch in body):
+                    return body.translate(table)
+                return m.group(0)
+            return _repl
+        out = latex_code
+        # 1. Handle simple \frac{num}{den} -> (num/den)
+        def _frac_repl(m):
+            return f'({m.group(1).strip()}/{m.group(2).strip()})'
+        out = re.sub(r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}', _frac_repl, out)
+        # 2. Superscripts (^)
+        to_sup = _script_repl(self.sup_map)
+        # Handle ^{...}
+        out = re.sub(r'\^\{([a-zA-Z0-9+\-=()]+)\}', to_sup, out)
+        # Handle single char ^x
+        out = re.sub(r'\^([a-zA-Z0-9])', to_sup, out)
+        # 3. Subscripts (_)
+        to_sub = _script_repl(self.sub_map)
+        # Handle _{...}
+        out = re.sub(r'_\{([a-zA-Z0-9+\-=()]+)\}', to_sub, out)
+        # Handle single char _x
+        out = re.sub(r'_([a-zA-Z0-9])', to_sub, out)
+        # 4. Symbol mapping
+        # The lookahead stops \pi from matching inside a longer command such as
+        # \pitchfork; the callable keeps the glyph from being read as a template.
+        for command, glyph in self.unicode_map.items():
+            out = re.sub(re.escape(command) + r'(?![A-Za-z])', lambda _m, g=glyph: g, out)
+        # 5. Cleanup remaining backslashes (simple commands like \text)
+        out = re.sub(r'\\([A-Za-z]+)', r'\1', out)
+        out = re.sub(r'\s+', ' ', out).strip()
+        return out
+    # ----------------------------
+    # Main Pipeline
+    # ----------------------------
+    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
+        # Full pipeline: sanitize the text, extract every equation, then
+        # validate each one and attach its MathML and Unicode renderings.
+        # MathML is attempted only for valid equations, and only when both
+        # ``convert_mathml`` and ``self.enable_mathml`` are true.
+        cleaned = self.sanitize_input(content)
+        def _enrich(found: Dict[str, Any]) -> Dict[str, Any]:
+            snippet = found['latex']
+            ok, problem = self.validate_latex(snippet)
+            rendered = None
+            if ok and convert_mathml and self.enable_mathml:
+                rendered = self.convert_latex_to_mathml(snippet)
+            plain = self.convert_latex_to_unicode(snippet)
+            return {
+                'type': found.get('type', 'inline'),
+                'latex': snippet,
+                'valid': ok,
+                'error': problem,
+                'mathml': rendered,
+                'unicode': plain,
+                'start_pos': found.get('start_pos'),
+                'end_pos': found.get('end_pos')
+            }
+        enhanced_equations = [_enrich(item) for item in self.extract_latex_equations(cleaned)]
+        return {
+            'cleaned_content': cleaned,
+            'equations': enhanced_equations
         }
+# ----------------------------
+# Example usage
+# ----------------------------
+if __name__ == "__main__":
+    # Demo input: inline math, an escaped dollar, escaped braces, a display
+    # equation, an align environment and one deliberately malformed snippet.
+    demo_text = r"""
+    Here is some text with inline math $E=mc^2$ and escaped dollar \$100.
+    A set definition with escaped braces (this caused bugs before):
+    $S = \{ x \in \mathbb{R} \mid x > 0 \}$
+    A display equation:
+    $$
+    \int_0^\infty x^2 e^{-x} \,dx = 2!
+    $$
+    An aligned environment:
+    \begin{align}
+      a &= b + c \\
+      d &= e + f
+    \end{align}
+    And a malformed example: $unbalanced { braces $
+    """
+    processor = OptimizedLaTeXProcessor(enable_mathml=True)
+    outcome = processor.process_latex_content(demo_text)
+    print("--- CLEANED CONTENT (snippet) ---")
+    print(outcome['cleaned_content'][:100] + "...")
+    print("\n--- EQUATIONS FOUND ---")
+    for index, entry in enumerate(outcome['equations'], 1):
+        # Show the error message when there is one, otherwise "OK".
+        status = entry['error'] or 'OK'
+        print(f"\n#{index} Type: {entry['type'].upper()}")
+        print(f"   Raw: {entry['latex']}")
+        print(f"   Valid: {entry['valid']} ({status})")
+        print(f"   Unicode: {entry['unicode']}")
+        if entry['mathml']:
+            print(f"   MathML: {entry['mathml'][:60]}...")