# vero_ps / latex_processor.py
# omgy's picture
# Update latex_processor.py
# 50ff5d1 verified
"""
optimized_latex_processor.py
Dependencies:
pip install pylatexenc latex2mathml
Optional (for more advanced features not used here):
pip install sympy
Functionality:
- sanitize Gemini output (strip ```latex``` fences safely)
- detect math heuristically and via parser
- extract inline/display math nodes using pylatexenc (MathNodes + Environments)
- validate LaTeX with parser + robust balanced-delimiters checks
- convert to MathML (latex2mathml)
- convert to Unicode with superscript/subscript support
"""
import re
from typing import List, Tuple, Dict, Any, Optional
# pylatexenc imports
from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
from latex2mathml.converter import convert as latex2mathml_convert
class OptimizedLaTeXProcessor:
    """Find, validate and convert LaTeX math embedded in free text.

    The usual entry point is :meth:`process_latex_content`, which sanitizes
    the input, extracts every math snippet, validates each one and attaches
    a MathML and a plain-Unicode rendering to it. The individual steps are
    also available as separate public methods.
    """

    def __init__(self, enable_mathml: bool = True):
        """Build the lookup tables and regexes used by the other methods.

        Args:
            enable_mathml: When False, :meth:`convert_latex_to_mathml`
                always returns None.
        """
        self.enable_mathml = enable_mathml

        # 1. Basic symbol map (LaTeX command -> Unicode character)
        self.unicode_map = {
            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
        }

        # 2. Superscript/subscript translation tables.
        # Note that 'q' maps to itself in the superscript table; the
        # converter treats such identity entries as "no glyph available".
        self.sup_map = str.maketrans("0123456789+-=()abcdefghijklmnopqrstuvwxyz",
                                     "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ")
        self.sub_map = str.maketrans("0123456789+-=()aehijklmnoprstuvx",
                                     "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ")

        # 3. Regex patterns
        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
        self._heuristic_math_pat = re.compile(
            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
            re.IGNORECASE
        )

        # Environments that are reported as display math by the extractor
        self.math_environments = {
            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*',
            'split', 'multline', 'flalign'
        }

    # ----------------------------
    # Sanitization
    # ----------------------------
    def sanitize_input(self, text: str) -> str:
        """Strip Markdown code fences and normalize line endings.

        Fences of the form ```latex ... ``` (tag optional, case-insensitive)
        are removed while the text between them is kept unchanged.

        Args:
            text: Raw text, possibly wrapped in triple-backtick fences.

        Returns:
            The text without fences and with CRLF converted to LF.
        """
        # Normalize CRLF first: the fence pattern below expects the language
        # tag to be followed directly by "\n", so "```latex\r\n" would
        # otherwise fall through to the generic pattern and leave the word
        # "latex" in the output.
        text = text.replace('\r\n', '\n')

        # Fences whose opening line is ``` or ```latex
        text = re.sub(r'```(?:latex)?\n(.*?)```', lambda m: m.group(1), text,
                      flags=re.DOTALL | re.IGNORECASE)
        # Remaining fences (no newline after the opener). A "latex" tag
        # separated from the content by spaces/tabs is dropped as well.
        text = re.sub(r'```(?:latex[ \t]+)?(.*?)```', lambda m: m.group(1), text,
                      flags=re.DOTALL | re.IGNORECASE)
        return text

    # ----------------------------
    # Detection
    # ----------------------------
    def detect_mathematical_content(self, text: str) -> bool:
        """Return True when ``text`` matches the cheap math heuristic.

        The heuristic looks for a handful of LaTeX commands, ``$``, ``^``,
        ``_``, common function names (sin, cos, ...) and math symbols.
        Empty or whitespace-only input yields False.
        """
        if not text or not text.strip():
            return False
        # An earlier revision also ran the LaTeX parser here, but every
        # outcome of that parse (math found, nothing found, parse failure)
        # returned True, so it could never change the answer. The heuristic
        # result alone is returned instead.
        return self._heuristic_math_pat.search(text) is not None

    # ----------------------------
    # Extraction
    # ----------------------------
    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
        """Sanitize ``content`` and extract its math snippets.

        Args:
            content: Text that may contain ``$...$``, ``$$...$$``,
                ``\\(...\\)``, ``\\[...\\]`` or math environments.

        Returns:
            A list of dicts with keys ``type`` ('inline' or 'display'),
            ``latex``, ``start_pos`` and ``end_pos``. Positions are offsets
            into the sanitized text.
        """
        sanitized = self.sanitize_input(content)
        try:
            walker = LatexWalker(sanitized)
            nodes, _, _ = walker.get_latex_nodes(pos=0)
        except Exception:
            # Parser failed entirely: fall back to dollar-delimiter regexes.
            return self._extract_with_regex(sanitized)

        equations: List[Dict[str, Any]] = []
        self._collect_math_nodes(nodes, equations)
        return equations

    def _extract_with_regex(self, sanitized: str) -> List[Dict[str, Any]]:
        """Regex fallback that finds ``$$...$$`` and ``$...$`` spans only."""
        # Note: these patterns do not catch \begin{equation} style blocks.
        found = [
            {'type': 'display', 'latex': m.group(1).strip(),
             'start_pos': m.start(), 'end_pos': m.end()}
            for m in re.finditer(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', sanitized, flags=re.DOTALL)
        ]
        found.extend(
            {'type': 'inline', 'latex': m.group(1).strip(),
             'start_pos': m.start(), 'end_pos': m.end()}
            for m in re.finditer(r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)',
                                 sanitized, flags=re.DOTALL)
        )
        # Report matches in document order rather than "all display first".
        found.sort(key=lambda eq: eq['start_pos'])
        return found

    def _collect_math_nodes(self, node_list: "List[LatexNode]",
                            equations: List[Dict[str, Any]]) -> None:
        """Append a record for every math node reachable from ``node_list``."""
        for node in node_list:
            is_math_env = (isinstance(node, LatexEnvironmentNode)
                           and node.environmentname in self.math_environments)
            if isinstance(node, LatexMathNode) or is_math_env:
                equations.append(self._describe_math_node(node, is_math_env))
                continue

            # Not math itself: look inside. Macro arguments (for example the
            # braces of \textbf{...}) are stored on nodeargd.argnlist rather
            # than nodelist, so both places are searched.
            children = []
            arg_nodes = getattr(getattr(node, 'nodeargd', None), 'argnlist', None)
            if arg_nodes:
                children.extend(arg for arg in arg_nodes if arg is not None)
            body_nodes = getattr(node, 'nodelist', None)
            if body_nodes:
                children.extend(body_nodes)
            if children:
                self._collect_math_nodes(children, equations)

    def _describe_math_node(self, node: "LatexNode", is_math_env: bool) -> Dict[str, Any]:
        """Build the result dict for one math node or math environment."""
        snippet = node.latex_verbatim()
        if is_math_env:
            # Keep \begin{...}...\end{...} so converters can see the
            # environment (alignment etc.).
            typ = 'display'
            inner = snippet.strip()
        else:
            delims = getattr(node, 'delimiters', None)
            # pylatexenc documents `delimiters` as a (begin, end) pair; a
            # plain string is still accepted for safety.
            opening = delims[0] if isinstance(delims, (tuple, list)) and delims else delims
            displaytype = getattr(node, 'displaytype', None)
            typ = 'display' if (opening in ('$$', r'\[') or displaytype == 'display') else 'inline'

            # Strip the outer delimiters; '$$' must be tried before '$'.
            for begin, end in (('$$', '$$'), ('$', '$'), (r'\(', r'\)'), (r'\[', r'\]')):
                if snippet.startswith(begin) and snippet.endswith(end):
                    inner = snippet[len(begin):-len(end)].strip()
                    if begin == r'\[':
                        typ = 'display'
                    break
            else:
                inner = snippet

        return {
            'type': typ,
            'latex': inner,
            'start_pos': node.pos,
            'end_pos': node.pos + node.len if hasattr(node, 'len') else None
        }

    # ----------------------------
    # Validation
    # ----------------------------
    @staticmethod
    def _braces_nest(text: str) -> bool:
        """Return True when every '}' closes an earlier '{' and none stay open."""
        depth = 0
        for ch in text:
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth < 0:
                    return False
        return depth == 0

    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
        """Validate a single LaTeX snippet.

        Args:
            latex_code: The snippet, without surrounding ``$`` delimiters.

        Returns:
            ``(True, None)`` when the snippet passes all checks, otherwise
            ``(False, message)`` describing the first failed check.
        """
        if latex_code is None:
            return False, "Empty LaTeX snippet."
        if not latex_code.strip():
            return False, "Empty content."

        # 1. Drop backslash-escaped characters (\{, \}, \$, \[ ...) so only
        #    structural delimiters are examined.
        structural = re.sub(r'\\.', '', latex_code)

        # 2. Braces must nest properly; equal counts alone would accept "}{".
        if not self._braces_nest(structural):
            return False, "Unbalanced braces: { }"
        # Brackets are only compared by count.
        # NOTE(review): this rejects half-open intervals such as "[0, 1)";
        # confirm whether that is intended.
        if structural.count('[') != structural.count(']'):
            return False, "Unbalanced brackets: [ ]"

        # 3. Parser check on the original code.
        # NOTE(review): the walker is created with its default options here;
        # no strict/non-tolerant flag is passed -- confirm whether strict
        # parsing was intended.
        try:
            walker = LatexWalker(latex_code)
            walker.get_latex_nodes(pos=0)
        except Exception as e:
            return False, f"Parser error: {str(e)}"
        return True, None

    # ----------------------------
    # Conversions
    # ----------------------------
    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
        """Convert a snippet to MathML.

        Returns:
            The MathML string, or None when MathML is disabled on this
            instance or the converter raises.
        """
        if not self.enable_mathml:
            return None
        try:
            return latex2mathml_convert(latex_code)
        except Exception:
            return None

    @staticmethod
    def _to_script(chars: str, table: Dict[int, Any]) -> Optional[str]:
        """Translate ``chars`` via ``table``; None if any char has no distinct glyph."""
        for ch in chars:
            code = ord(ch)
            if table.get(code, code) == code:
                return None
        return chars.translate(table)

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        """Render a LaTeX snippet as approximate plain Unicode text.

        Handles ``\\frac{a}{b}`` (including nested fractions), superscripts,
        subscripts and the commands listed in ``self.unicode_map``. Remaining
        alphabetic commands lose their backslash and whitespace is collapsed.

        Args:
            latex_code: The LaTeX snippet.

        Returns:
            The converted, whitespace-normalized string.
        """
        out = latex_code

        # 1. \frac{num}{den} -> (num/den). The pattern only matches
        #    brace-free arguments, so repeat until nothing changes to
        #    resolve nested fractions from the inside out.
        def _frac_repl(m):
            return f'({m.group(1).strip()}/{m.group(2).strip()})'

        replaced = 1
        while replaced:
            out, replaced = re.subn(
                r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}', _frac_repl, out)

        # 2./3. Superscripts and subscripts. A script is converted only when
        #    every character has a dedicated glyph; otherwise the original
        #    "^..." / "_..." text is kept so that e.g. x^N does not turn
        #    into the misleading "xN".
        def _script_repl(table):
            def _repl(m):
                converted = self._to_script(m.group(1), table)
                return m.group(0) if converted is None else converted
            return _repl

        to_sup = _script_repl(self.sup_map)
        to_sub = _script_repl(self.sub_map)
        out = re.sub(r'\^\{([a-zA-Z0-9+\-=()]+)\}', to_sup, out)  # ^{...}
        out = re.sub(r'\^([a-zA-Z0-9])', to_sup, out)             # ^x
        out = re.sub(r'_\{([a-zA-Z0-9+\-=()]+)\}', to_sub, out)   # _{...}
        out = re.sub(r'_([a-zA-Z0-9])', to_sub, out)              # _x

        # 4. Symbol mapping. Match whole commands only: the lookahead stops
        #    \pm from firing inside \pmod, \leq inside \leqslant, etc.
        #    The pattern is built from the current map so entries added at
        #    runtime are honoured.
        if self.unicode_map:
            alternation = '|'.join(
                re.escape(cmd) for cmd in sorted(self.unicode_map, key=len, reverse=True))
            out = re.sub(rf'(?:{alternation})(?![A-Za-z])',
                         lambda m: self.unicode_map[m.group(0)], out)

        # 5. Cleanup: drop the backslash of remaining alphabetic commands
        #    (e.g. \text -> text) and collapse whitespace.
        out = re.sub(r'\\([A-Za-z]+)', r'\1', out)
        out = re.sub(r'\s+', ' ', out).strip()
        return out

    # ----------------------------
    # Main Pipeline
    # ----------------------------
    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
        """Run the full pipeline on ``content``.

        Args:
            content: Raw text possibly containing LaTeX math.
            convert_mathml: When False, MathML conversion is skipped for
                this call even if enabled on the instance.

        Returns:
            A dict with ``cleaned_content`` (the sanitized text) and
            ``equations``, a list of dicts with keys ``type``, ``latex``,
            ``valid``, ``error``, ``mathml``, ``unicode``, ``start_pos`` and
            ``end_pos``. ``mathml`` is only attempted for valid snippets.
        """
        cleaned = self.sanitize_input(content)
        enhanced_equations = []
        for eq in self.extract_latex_equations(cleaned):
            latex_snip = eq['latex']
            is_valid, error = self.validate_latex(latex_snip)

            mathml = None
            if is_valid and convert_mathml and self.enable_mathml:
                mathml = self.convert_latex_to_mathml(latex_snip)

            enhanced_equations.append({
                'type': eq.get('type', 'inline'),
                'latex': latex_snip,
                'valid': is_valid,
                'error': error,
                'mathml': mathml,
                'unicode': self.convert_latex_to_unicode(latex_snip),
                'start_pos': eq.get('start_pos'),
                'end_pos': eq.get('end_pos')
            })
        return {
            'cleaned_content': cleaned,
            'equations': enhanced_equations
        }
# ----------------------------
# Example usage
# ----------------------------
if __name__ == "__main__":
    # Demo input covering inline math, an escaped dollar, escaped braces,
    # a display block, an align environment and a deliberately broken snippet.
    sample_text = r"""
Here is some text with inline math $E=mc^2$ and escaped dollar \$100.
A set definition with escaped braces (this caused bugs before):
$S = \{ x \in \mathbb{R} \mid x > 0 \}$
A display equation:
$$
\int_0^\infty x^2 e^{-x} \,dx = 2!
$$
An aligned environment:
\begin{align}
a &= b + c \\
d &= e + f
\end{align}
And a malformed example: $unbalanced { braces $
"""
    processor = OptimizedLaTeXProcessor(enable_mathml=True)
    outcome = processor.process_latex_content(sample_text)

    # Show the start of the sanitized text.
    print("--- CLEANED CONTENT (snippet) ---")
    print(outcome['cleaned_content'][:100] + "...")

    # Show one summary per extracted equation, numbered from 1.
    print("\n--- EQUATIONS FOUND ---")
    index = 0
    for entry in outcome['equations']:
        index += 1
        status = entry['error'] or 'OK'
        print(f"\n#{index} Type: {entry['type'].upper()}")
        print(f" Raw: {entry['latex']}")
        print(f" Valid: {entry['valid']} ({status})")
        print(f" Unicode: {entry['unicode']}")
        mathml_text = entry['mathml']
        if mathml_text:
            # MathML is long; print only its first 60 characters.
            print(f" MathML: {mathml_text[:60]}...")