"""
optimized_latex_processor.py

Dependencies:
    pip install pylatexenc latex2mathml

Optional (for more advanced features not used here):
    pip install sympy

Functionality:
- sanitize Gemini output (strip ```latex``` fences safely)
- detect math heuristically and via parser
- extract inline/display math nodes using pylatexenc (MathNodes + Environments)
- validate LaTeX with parser + robust balanced-delimiters checks
- convert to MathML (latex2mathml)
- convert to Unicode with superscript/subscript support
"""

import logging
import re
from typing import List, Tuple, Dict, Any, Optional

from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
from latex2mathml.converter import convert as latex2mathml_convert


class OptimizedLaTeXProcessor:
    """Sanitize, extract, validate and convert LaTeX math embedded in text.

    The usual entry point is :meth:`process_latex_content`, which strips
    Markdown code fences, locates every math snippet, validates it and renders
    it both as MathML and as a plain-Unicode approximation.

    Attributes set by ``__init__``:
        enable_mathml: Master switch for MathML conversion.
        unicode_map: LaTeX command (including the backslash) -> Unicode symbol.
        sup_map: ``str.translate`` table for superscript characters.
        sub_map: ``str.translate`` table for subscript characters.
        math_environments: Environment names extracted as display math.
    """

    _log = logging.getLogger(__name__)

    # One left-to-right pass over all fences. The optional group swallows an
    # info string ("latex"/"tex") plus the newline that ends the opening line.
    _FENCE_RE = re.compile(
        r'```(?:[ \t]*(?:latex|tex)?[ \t]*\n)?(.*?)```',
        re.DOTALL | re.IGNORECASE,
    )
    _FRAC_RE = re.compile(r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}')
    # The (?<!\\) guards keep "\^" (accent macro) and "\_" (escaped underscore)
    # from being mistaken for script markers.
    _SUP_BRACED_RE = re.compile(r'(?<!\\)\^\{([a-zA-Z0-9+\-=()]+)\}')
    _SUP_SINGLE_RE = re.compile(r'(?<!\\)\^([a-zA-Z0-9])')
    _SUB_BRACED_RE = re.compile(r'(?<!\\)_\{([a-zA-Z0-9+\-=()]+)\}')
    _SUB_SINGLE_RE = re.compile(r'(?<!\\)_([a-zA-Z0-9])')
    # A backslash token: a line break ("\\"), an escaped special character,
    # or a whole command name. Matching the WHOLE name is what prevents
    # "\cdots" from being rewritten through the "\cdot" table entry.
    _ESCAPE_RE = re.compile(r'\\(\\|[{}$%&#_]|[A-Za-z]+)')
    _WHITESPACE_RE = re.compile(r'\s+')
    _ESCAPED_CHAR_RE = re.compile(r'\\.')
    # Delimiter pairs tried in order when unwrapping a math snippet; "$$" must
    # come before "$".
    _MATH_DELIMITERS = (('$$', '$$'), ('$', '$'), (r'\(', r'\)'), (r'\[', r'\]'))
    # Regex fallback used only when the pylatexenc parse raises.
    _FALLBACK_PATTERNS = (
        ('display', re.compile(r'(?<!\\)\$\$(.*?)(?<!\\)\$\$', re.DOTALL)),
        ('display', re.compile(r'(?<!\\)\\\[(.*?)(?<!\\)\\\]', re.DOTALL)),
        ('inline', re.compile(r'(?<!\\)\\\((.*?)(?<!\\)\\\)', re.DOTALL)),
        ('inline', re.compile(
            r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)', re.DOTALL)),
    )

    def __init__(self, enable_mathml: bool = True) -> None:
        """Create a processor.

        Args:
            enable_mathml: When False, no MathML is ever produced, regardless
                of the per-call ``convert_mathml`` flag.
        """
        self.enable_mathml = enable_mathml

        self.unicode_map = {
            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
        }

        # "q" maps to itself in the superscript table; characters that map to
        # themselves are treated as "no glyph available" when rendering.
        self.sup_map = str.maketrans("0123456789+-=()abcdefghijklmnopqrstuvwxyz",
                                     "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ")
        self.sub_map = str.maketrans("0123456789+-=()aehijklmnoprstuvx",
                                     "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ")

        # Not used inside this class; kept because the attribute already existed.
        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
        self._heuristic_math_pat = re.compile(
            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
            re.IGNORECASE
        )

        # Starred variants of multline/flalign added so every listed
        # environment is recognised in both forms.
        self.math_environments = {
            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*',
            'split', 'multline', 'multline*', 'flalign', 'flalign*',
        }

    # ------------------------------------------------------------------
    # Sanitizing and detection
    # ------------------------------------------------------------------

    def sanitize_input(self, text: str) -> str:
        """Strip Markdown code fences while keeping their content verbatim.

        Line endings are normalised to ``\\n`` *before* fences are matched, so
        a CRLF-terminated "```latex" line is recognised and its info string is
        removed together with the fence.

        Args:
            text: Raw model output. ``None`` or ``""`` yields ``""``.

        Returns:
            The text with every ```...``` fence replaced by its inner content.
        """
        if not text:
            return ''
        normalised = text.replace('\r\n', '\n')
        # A callable replacement is used so backslashes in the LaTeX are never
        # interpreted as regex replacement escapes.
        return self._FENCE_RE.sub(lambda match: match.group(1), normalised)

    def detect_mathematical_content(self, text: str) -> bool:
        """Return True when ``text`` shows any heuristic sign of math.

        The heuristic is deliberately permissive (a lone ``$``, ``_`` or ``^``
        is enough). A parser pass used to follow the heuristic, but every one
        of its outcomes returned True, so it has been dropped as pure overhead.

        Args:
            text: Text to inspect; ``None``/blank input returns False.
        """
        if not text or not text.strip():
            return False
        return self._heuristic_math_pat.search(text) is not None

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------

    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
        """Extract every math snippet from ``content``.

        The content is sanitized first; ``start_pos``/``end_pos`` therefore
        index into the sanitized text, not into ``content``.

        Args:
            content: Raw text, possibly wrapped in Markdown fences.

        Returns:
            A list of dicts with keys ``type`` ('inline' or 'display'),
            ``latex`` (delimiters removed, environments kept whole),
            ``start_pos`` and ``end_pos``.
        """
        return self._extract_from_sanitized(self.sanitize_input(content))

    def _extract_from_sanitized(self, sanitized: str) -> List[Dict[str, Any]]:
        """Extract math from already-sanitized text (parser first, regex fallback)."""
        if not sanitized.strip():
            return []
        try:
            nodes, _, _ = LatexWalker(sanitized).get_latex_nodes(pos=0)
        except Exception:
            # Best-effort boundary: any parser failure falls back to regexes.
            self._log.debug("LaTeX parse failed; using regex extraction", exc_info=True)
            return self._extract_with_regex(sanitized)

        found: List[Dict[str, Any]] = []
        self._collect_math_nodes(nodes, found)
        return found

    def _extract_with_regex(self, sanitized: str) -> List[Dict[str, Any]]:
        """Regex-only extraction, returned in document order without overlaps."""
        candidates = [
            {
                'type': kind,
                'latex': match.group(1).strip(),
                'start_pos': match.start(),
                'end_pos': match.end(),
            }
            for kind, pattern in self._FALLBACK_PATTERNS
            for match in pattern.finditer(sanitized)
        ]
        # Earliest first; on a tie the longer (outer) match wins.
        candidates.sort(key=lambda eq: (eq['start_pos'], -eq['end_pos']))

        ordered: List[Dict[str, Any]] = []
        covered_until = -1
        for equation in candidates:
            if equation['start_pos'] >= covered_until:
                ordered.append(equation)
                covered_until = equation['end_pos']
        return ordered

    def _collect_math_nodes(self, node_list: "List[LatexNode]",
                            found: List[Dict[str, Any]]) -> None:
        """Depth-first walk appending one entry per math node / math environment."""
        for node in node_list or ():
            if node is None:
                continue

            is_math_env = (
                isinstance(node, LatexEnvironmentNode)
                and node.environmentname in self.math_environments
            )
            if isinstance(node, LatexMathNode) or is_math_env:
                found.append(self._math_node_entry(node, is_math_env))
                continue

            # Macro/environment arguments precede the body in the source, so
            # they are walked first to keep results in document order.
            parsed_args = getattr(node, 'nodeargd', None)
            arg_nodes = [
                arg for arg in (getattr(parsed_args, 'argnlist', None) or ())
                if arg is not None
            ]
            if arg_nodes:
                self._collect_math_nodes(arg_nodes, found)

            children = getattr(node, 'nodelist', None)
            if children:
                self._collect_math_nodes(children, found)

    def _math_node_entry(self, node: Any, is_math_env: bool) -> Dict[str, Any]:
        """Build the result dict for one parsed math node or math environment."""
        snippet = node.latex_verbatim()

        if is_math_env:
            # Environments keep their \begin/\end wrapper.
            kind = 'display'
            latex = snippet.strip()
        else:
            latex, opener = self._strip_math_delimiters(snippet)
            is_display = (
                getattr(node, 'displaytype', None) == 'display'
                or opener in ('$$', r'\[')
            )
            kind = 'display' if is_display else 'inline'

        return {
            'type': kind,
            'latex': latex,
            'start_pos': node.pos,
            'end_pos': node.pos + node.len if hasattr(node, 'len') else None,
        }

    @classmethod
    def _strip_math_delimiters(cls, snippet: str) -> Tuple[str, Optional[str]]:
        """Return ``(inner_latex, opening_delimiter)``; opener is None if unwrapped."""
        for opener, closer in cls._MATH_DELIMITERS:
            long_enough = len(snippet) >= len(opener) + len(closer)
            if long_enough and snippet.startswith(opener) and snippet.endswith(closer):
                return snippet[len(opener):-len(closer)].strip(), opener
        return snippet, None

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
        """Validate a single LaTeX snippet.

        Checks, in order: non-empty input, properly nested ``{ }`` (escaped
        braces such as ``\\{`` are ignored), equal ``[``/``]`` counts, and a
        strict pylatexenc parse.

        Args:
            latex_code: The snippet without math delimiters.

        Returns:
            ``(True, None)`` when valid, otherwise ``(False, message)``.
        """
        if latex_code is None:
            return False, "Empty LaTeX snippet."
        if not latex_code.strip():
            return False, "Empty content."

        # Drop backslash-escaped characters so "\{" and "\}" are not counted.
        clean_code = self._ESCAPED_CHAR_RE.sub('', latex_code)

        # Depth tracking (not a plain count) so that "}{" is rejected.
        depth = 0
        for char in clean_code:
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth < 0:
                    return False, "Unbalanced braces: { }"
        if depth:
            return False, "Unbalanced braces: { }"

        # NOTE(review): this also rejects half-open intervals like "[0, 1)",
        # which are valid math. Kept as-is to preserve the existing contract.
        if clean_code.count('[') != clean_code.count(']'):
            return False, "Unbalanced brackets: [ ]"

        try:
            # The walker's default tolerant mode recovers from syntax errors
            # instead of raising, which would make this check a no-op.
            LatexWalker(latex_code, tolerant_parsing=False).get_latex_nodes(pos=0)
        except Exception as exc:
            return False, f"Parser error: {exc}"

        return True, None

    # ------------------------------------------------------------------
    # Conversion
    # ------------------------------------------------------------------

    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
        """Convert a snippet to MathML.

        Returns:
            The MathML string, or None when MathML is disabled or the
            conversion fails (the failure is logged at debug level).
        """
        if not self.enable_mathml:
            return None
        try:
            return latex2mathml_convert(latex_code)
        except Exception:
            self._log.debug("MathML conversion failed for %r", latex_code, exc_info=True)
            return None

    @staticmethod
    def _render_script(body: str, table: Dict[int, Any], marker: str) -> str:
        """Translate ``body`` with ``table``, or keep ``marker`` if any char lacks a glyph."""
        has_glyphs = all(table.get(ord(ch), ord(ch)) != ord(ch) for ch in body)
        if has_glyphs:
            return body.translate(table)
        # Dropping the marker would turn x^{N} into "xN"; keep it readable.
        return f'{marker}{body}' if len(body) == 1 else f'{marker}({body})'

    def convert_latex_to_unicode(self, latex_code: str) -> str:
        """Render a snippet as an approximate plain-Unicode string.

        Steps: super/subscripts, fractions (resolved inside-out so nested
        ``\\frac`` works), then backslash tokens. Known commands become their
        symbol, unknown commands keep their bare name, escaped specials such
        as ``\\{`` lose the backslash, and ``\\\\`` is left untouched.

        Args:
            latex_code: The snippet; ``None``/``""`` yields ``""``.
        """
        if not latex_code:
            return ''
        out = latex_code

        # Scripts first, so "\frac{x^{2}}{y}" has brace-free arguments by the
        # time the fraction pattern runs.
        out = self._SUP_BRACED_RE.sub(
            lambda m: self._render_script(m.group(1), self.sup_map, '^'), out)
        out = self._SUP_SINGLE_RE.sub(
            lambda m: self._render_script(m.group(1), self.sup_map, '^'), out)
        out = self._SUB_BRACED_RE.sub(
            lambda m: self._render_script(m.group(1), self.sub_map, '_'), out)
        out = self._SUB_SINGLE_RE.sub(
            lambda m: self._render_script(m.group(1), self.sub_map, '_'), out)

        # Each pass resolves the innermost fractions; every substitution
        # consumes braces, so the loop terminates.
        while True:
            out, replaced = self._FRAC_RE.subn(
                lambda m: f'({m.group(1).strip()}/{m.group(2).strip()})', out)
            if not replaced:
                break

        def _render_escape(match):
            token = match.group(1)
            if token == '\\':
                return match.group(0)
            if token.isalpha():
                return self.unicode_map.get(match.group(0), token)
            return token

        out = self._ESCAPE_RE.sub(_render_escape, out)
        return self._WHITESPACE_RE.sub(' ', out).strip()

    # ------------------------------------------------------------------
    # Pipeline
    # ------------------------------------------------------------------

    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
        """Sanitize ``content`` and fully process every equation in it.

        Args:
            content: Raw text, possibly wrapped in Markdown fences.
            convert_mathml: Per-call MathML switch; also requires
                ``self.enable_mathml``.

        Returns:
            ``{'cleaned_content': str, 'equations': [...]}`` where each
            equation has ``type``, ``latex``, ``valid``, ``error``, ``mathml``,
            ``unicode``, ``start_pos`` and ``end_pos``. Positions index into
            ``cleaned_content``.
        """
        cleaned = self.sanitize_input(content)
        want_mathml = convert_mathml and self.enable_mathml

        enhanced_equations = []
        for eq in self._extract_from_sanitized(cleaned):
            latex_snip = eq['latex']
            is_valid, error = self.validate_latex(latex_snip)

            # MathML is only attempted for snippets that passed validation.
            mathml = None
            if is_valid and want_mathml:
                mathml = self.convert_latex_to_mathml(latex_snip)

            enhanced_equations.append({
                'type': eq.get('type', 'inline'),
                'latex': latex_snip,
                'valid': is_valid,
                'error': error,
                'mathml': mathml,
                'unicode': self.convert_latex_to_unicode(latex_snip),
                'start_pos': eq.get('start_pos'),
                'end_pos': eq.get('end_pos'),
            })

        return {
            'cleaned_content': cleaned,
            'equations': enhanced_equations,
        }


if __name__ == "__main__":
    # Demo input: inline math, an escaped dollar, escaped braces, a display
    # block, an align environment and one deliberately malformed snippet.
    sample = r"""
Here is some text with inline math $E=mc^2$ and escaped dollar \$100.

A set definition with escaped braces (this caused bugs before):
$S = \{ x \in \mathbb{R} \mid x > 0 \}$

A display equation:
$$
\int_0^\infty x^2 e^{-x} \,dx = 2!
$$

An aligned environment:
\begin{align}
a &= b + c \\
d &= e + f
\end{align}

And a malformed example: $unbalanced { braces $
"""

    proc = OptimizedLaTeXProcessor(enable_mathml=True)
    result = proc.process_latex_content(sample)

    # Only the first 100 characters of the cleaned text are shown.
    preview = result['cleaned_content'][:100]
    print("--- CLEANED CONTENT (snippet) ---")
    print(preview + "...")

    print("\n--- EQUATIONS FOUND ---")
    number = 0
    for equation in result['equations']:
        number += 1
        status = equation['error'] or 'OK'
        print(f"\n#{number} Type: {equation['type'].upper()}")
        print(f" Raw: {equation['latex']}")
        print(f" Valid: {equation['valid']} ({status})")
        print(f" Unicode: {equation['unicode']}")

        # MathML is None for invalid snippets or failed conversions.
        mathml = equation['mathml']
        if mathml:
            print(f" MathML: {mathml[:60]}...")