Spaces:

omgy
/

vero_ps

Sleeping

App Files Files Community

vero_ps / latex_processor.py

omgy

Update latex_processor.py

50ff5d1 verified 2 months ago

raw

history blame contribute delete

13.5 kB

	"""
	latex_processor.py

	Dependencies:
	pip install pylatexenc latex2mathml

	Optional (for more advanced features not used here):
	pip install sympy

	Functionality:
	- sanitize Gemini output (strip ```latex``` fences safely)
	- detect math heuristically and via parser
	- extract inline/display math nodes using pylatexenc (MathNodes + Environments)
	- validate LaTeX with parser + robust balanced-delimiters checks
	- convert to MathML (latex2mathml)
	- convert to Unicode with superscript/subscript support
	"""

	import re
	from typing import List, Tuple, Dict, Any, Optional

	# pylatexenc imports
	from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexEnvironmentNode, LatexNode, LatexWalkerParseError
	from latex2mathml.converter import convert as latex2mathml_convert


	class OptimizedLaTeXProcessor:
	    """Extract, validate and convert LaTeX math embedded in free-form text.

	    Typical use is :meth:`process_latex_content`, which sanitizes the input
	    (Markdown fences, CRLF), extracts every math snippet, validates it and
	    produces MathML and plain-Unicode renderings.

	    Parsing relies on ``pylatexenc``; MathML conversion on ``latex2mathml``.
	    """

	    # (opening delimiter, closing delimiter, kind) -- ``$$`` must be tested
	    # before ``$`` because every ``$$...$$`` snippet also starts with ``$``.
	    _MATH_DELIMITERS = (
	        ('$$', '$$', 'display'),
	        (r'\[', r'\]', 'display'),
	        (r'\(', r'\)', 'inline'),
	        ('$', '$', 'inline'),
	    )

	    def __init__(self, enable_mathml: bool = True):
	        """Build the lookup tables and compiled patterns.

	        Args:
	            enable_mathml: When False, MathML conversion is skipped everywhere
	                and ``convert_latex_to_mathml`` always returns ``None``.
	        """
	        self.enable_mathml = enable_mathml

	        # 1. Basic symbol map (LaTeX control word -> Unicode character).
	        self.unicode_map = {
	            r'\alpha': 'α', r'\beta': 'β', r'\gamma': 'γ', r'\delta': 'δ',
	            r'\epsilon': 'ε', r'\theta': 'θ', r'\lambda': 'λ', r'\mu': 'μ',
	            r'\pi': 'π', r'\sigma': 'σ', r'\phi': 'φ', r'\omega': 'ω',
	            r'\infty': '∞', r'\leq': '≤', r'\geq': '≥', r'\neq': '≠',
	            r'\approx': '≈', r'\sum': '∑', r'\prod': '∏', r'\int': '∫',
	            r'\sqrt': '√', r'\pm': '±', r'\times': '×', r'\div': '÷',
	            r'\cdot': '·', r'\rightarrow': '→', r'\leftarrow': '←',
	        }

	        # 2. Superscript/subscript translation tables. Characters that have
	        #    no Unicode counterpart are either absent or map to themselves
	        #    ('q' in the superscript table).
	        self.sup_map = str.maketrans(
	            "0123456789+-=()abcdefghijklmnopqrstuvwxyz",
	            "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖqʳˢᵗᵘᵛʷˣʸᶻ",
	        )
	        self.sub_map = str.maketrans(
	            "0123456789+-=()aehijklmnoprstuvx",
	            "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ",
	        )

	        # 3. Regex patterns
	        self._re_unescaped_single_dollar = re.compile(r'(?<!\\)(?<!\$)\$(?!\$)')
	        # Alternation of common math commands, delimiters, function names and
	        # Unicode math symbols. The '|' separators must be unescaped, otherwise
	        # the whole pattern degenerates into one literal string.
	        self._heuristic_math_pat = re.compile(
	            r'(\\frac|\\sum|\\int|\\sqrt|\\alpha|\\beta|\\pi|\\infty|\$|\\\[|\\\]|\^|_|\b(sin|cos|tan|log|ln|lim)\b|[∫∑√∞≤≥≠±×÷])',
	            re.IGNORECASE,
	        )
	        # Environments treated as display math (numbered and starred forms).
	        self.math_environments = {
	            'equation', 'equation*', 'align', 'align*', 'gather', 'gather*',
	            'split', 'multline', 'multline*', 'flalign', 'flalign*',
	        }

	    # ----------------------------
	    # Sanitization
	    # ----------------------------
	    def sanitize_input(self, text: str) -> str:
	        """Strip Markdown code fences while keeping the fenced LaTeX intact.

	        Args:
	            text: Raw text, possibly wrapped in ```latex ... ``` fences.

	        Returns:
	            The text with fences removed and CRLF line endings converted to
	            LF. ``None`` or an empty string yields ``''``.
	        """
	        if not text:
	            return ''

	        # Normalize CRLF -> LF first; otherwise a "```latex\r\n" fence is not
	        # recognized below and the language tag leaks into the output.
	        text = text.replace('\r\n', '\n')
	        # Fences with an optional "latex" language tag on the opening line.
	        text = re.sub(r'```(?:latex)?[ \t]*\n(.*?)```', r'\1', text,
	                      flags=re.DOTALL | re.IGNORECASE)
	        # Remaining triple-backtick blocks that do not start with a newline.
	        text = re.sub(r'```(.*?)```', r'\1', text, flags=re.DOTALL)
	        return text

	    # ----------------------------
	    # Detection
	    # ----------------------------
	    def detect_mathematical_content(self, text: str) -> bool:
	        """Return True when *text* looks like it contains mathematics.

	        The decision is made by the heuristic pattern alone. An earlier
	        version also parsed the text, but every outcome of that parse
	        (math found, nothing found, parse failure) returned True, so the
	        parse only cost time and has been removed.
	        """
	        if not text or not text.strip():
	            return False
	        return self._heuristic_math_pat.search(text) is not None

	    # ----------------------------
	    # Extraction
	    # ----------------------------
	    def extract_latex_equations(self, content: str) -> List[Dict[str, Any]]:
	        """Extract math snippets from *content* in document order.

	        Args:
	            content: Text containing ``$...$``, ``$$...$$``, ``\\(...\\)``,
	                ``\\[...\\]`` or math environments.

	        Returns:
	            A list of dicts with keys ``type`` ('inline' or 'display'),
	            ``latex`` (delimiters stripped, environments kept whole),
	            ``start_pos`` and ``end_pos`` (offsets into the sanitized text).
	        """
	        sanitized = self.sanitize_input(content)
	        equations: List[Dict[str, Any]] = []
	        if not sanitized.strip():
	            return equations

	        try:
	            walker = LatexWalker(sanitized)
	            nodes, _, _ = walker.get_latex_nodes(pos=0)
	        except Exception:
	            # Parser unavailable or failed entirely: fall back to regexes for
	            # dollar delimiters. This cannot find \begin{equation} blocks.
	            return self._extract_with_regex(sanitized)

	        self._collect_math_nodes(nodes, equations)
	        return equations

	    @staticmethod
	    def _extract_with_regex(text: str) -> List[Dict[str, Any]]:
	        """Regex fallback: find ``$$...$$`` and ``$...$`` spans in *text*."""
	        patterns = (
	            ('display', r'(?<!\\)\$\$(.*?)(?<!\\)\$\$'),
	            ('inline', r'(?<!\\)(?<!\$)\$(?!\$)(.*?)(?<!\\)(?<!\$)\$(?!\$)'),
	        )
	        found = [
	            {
	                'type': kind,
	                'latex': m.group(1).strip(),
	                'start_pos': m.start(),
	                'end_pos': m.end(),
	            }
	            for kind, pattern in patterns
	            for m in re.finditer(pattern, text, flags=re.DOTALL)
	        ]
	        # Report in document order, matching what the parser path produces.
	        found.sort(key=lambda eq: eq['start_pos'])
	        return found

	    def _collect_math_nodes(self, node_list: List[Any],
	                            equations: List[Dict[str, Any]]) -> None:
	        """Walk parser nodes depth-first, appending math snippets to *equations*."""
	        for node in node_list:
	            if node is None:
	                continue

	            is_math_env = (
	                isinstance(node, LatexEnvironmentNode)
	                and node.environmentname in self.math_environments
	            )

	            if isinstance(node, LatexMathNode):
	                typ, latex = self._split_math_delimiters(
	                    node.latex_verbatim(), getattr(node, 'displaytype', None)
	                )
	            elif is_math_env:
	                # Keep \begin{}...\end{} so converters can handle alignment.
	                typ, latex = 'display', node.latex_verbatim().strip()
	            else:
	                # Not math itself: look inside groups/environments ...
	                children = getattr(node, 'nodelist', None)
	                if children:
	                    self._collect_math_nodes(children, equations)
	                # ... and inside macro arguments (e.g. \textbf{... $x$ ...}),
	                # which pylatexenc stores on nodeargd.argnlist, not nodelist.
	                nodeargd = getattr(node, 'nodeargd', None)
	                arguments = getattr(nodeargd, 'argnlist', None)
	                if arguments:
	                    self._collect_math_nodes(list(arguments), equations)
	                continue

	            length = getattr(node, 'len', None)
	            equations.append({
	                'type': typ,
	                'latex': latex,
	                'start_pos': node.pos,
	                'end_pos': node.pos + length if length is not None else None,
	            })

	    @classmethod
	    def _split_math_delimiters(cls, snippet: str,
	                               displaytype: Optional[str]) -> Tuple[str, str]:
	        """Return ``(type, inner_latex)`` for a verbatim math snippet."""
	        for opener, closer, kind in cls._MATH_DELIMITERS:
	            if (len(snippet) >= len(opener) + len(closer)
	                    and snippet.startswith(opener)
	                    and snippet.endswith(closer)):
	                inner = snippet[len(opener):len(snippet) - len(closer)].strip()
	                is_display = displaytype == 'display' or kind == 'display'
	                return ('display' if is_display else 'inline'), inner
	        # Unknown delimiters: keep the snippet untouched.
	        return ('display' if displaytype == 'display' else 'inline'), snippet

	    # ----------------------------
	    # Validation
	    # ----------------------------
	    def validate_latex(self, latex_code: str) -> Tuple[bool, Optional[str]]:
	        """Validate a single LaTeX snippet.

	        Escaped characters (``\\{``, ``\\}``, ``\\$`` ...) are ignored when
	        checking structural delimiters.

	        Returns:
	            ``(True, None)`` when valid, otherwise ``(False, reason)``.
	        """
	        if latex_code is None:
	            return False, "Empty LaTeX snippet."

	        if not latex_code.strip():
	            return False, "Empty content."

	        # 1. Drop escaped characters before looking at structural delimiters.
	        clean_code = re.sub(r'\\.', '', latex_code)

	        # 2. Braces must be balanced *and* correctly ordered: a plain count
	        #    comparison would accept "} {".
	        depth = 0
	        for ch in clean_code:
	            if ch == '{':
	                depth += 1
	            elif ch == '}':
	                depth -= 1
	                if depth < 0:
	                    break
	        if depth != 0:
	            return False, "Unbalanced braces: { }"
	        # NOTE: this count check also rejects half-open intervals such as
	        # "[0, 1)"; kept as-is so existing callers see the same verdicts.
	        if clean_code.count('[') != clean_code.count(']'):
	            return False, "Unbalanced brackets: [ ]"

	        # 3. Parser check on the original code. tolerant_parsing defaults to
	        #    True, in which case the walker recovers from errors instead of
	        #    raising; disable it so this really is a strict check.
	        try:
	            walker = LatexWalker(latex_code, tolerant_parsing=False)
	            walker.get_latex_nodes(pos=0)
	        except Exception as e:
	            return False, f"Parser error: {e}"

	        return True, None

	    # ----------------------------
	    # Conversions
	    # ----------------------------
	    def convert_latex_to_mathml(self, latex_code: str) -> Optional[str]:
	        """Convert *latex_code* to MathML, or return None.

	        None is returned when MathML is disabled, the input is empty, or the
	        converter raises (conversion is best-effort).
	        """
	        if not self.enable_mathml or not latex_code:
	            return None
	        try:
	            return latex2mathml_convert(latex_code)
	        except Exception:
	            return None

	    @staticmethod
	    def _to_script(content: str, table: Dict[int, Any], marker: str) -> str:
	        """Render *content* as super/subscript, keeping *marker* if impossible.

	        When any character has no distinct Unicode form the original notation
	        is preserved (``^N`` / ``_(ab)``) instead of silently dropping the
	        marker, which would turn ``x_b`` into the unrelated ``xb``.
	        """
	        mapped = content.translate(table)
	        if all(src != dst for src, dst in zip(content, mapped)):
	            return mapped
	        return f'{marker}{content}' if len(content) == 1 else f'{marker}({content})'

	    def convert_latex_to_unicode(self, latex_code: str) -> str:
	        """Best-effort LaTeX -> plain Unicode rendering.

	        Handles super/subscripts, ``\\frac{a}{b}`` (including nested ones),
	        and the commands in ``self.unicode_map``; any other control word just
	        loses its backslash. Empty or ``None`` input yields ``''``.
	        """
	        if not latex_code:
	            return ''
	        out = latex_code

	        # 1. Superscripts and subscripts: braced group or a single character.
	        #    Done first so that \frac{x^{2}}{y} has brace-free arguments below.
	        script_arg = r'(?:\{([a-zA-Z0-9+\-=()]+)\}|([a-zA-Z0-9]))'
	        for marker, table in (('^', self.sup_map), ('_', self.sub_map)):
	            out = re.sub(
	                re.escape(marker) + script_arg,
	                lambda m, _t=table, _k=marker: self._to_script(
	                    m.group(1) or m.group(2), _t, _k),
	                out,
	            )

	        # 2. \frac{num}{den} -> (num/den). Repeat until stable so nested
	        #    fractions are resolved inside-out; each pass removes at least
	        #    one \frac, so the loop terminates.
	        frac_pat = re.compile(
	            r'\\frac\s*\{\s*([^{}]+?)\s*\}\s*\{\s*([^{}]+?)\s*\}')

	        def _frac_repl(m):
	            return f'({m.group(1).strip()}/{m.group(2).strip()})'

	        while True:
	            out, replaced = frac_pat.subn(_frac_repl, out)
	            if not replaced:
	                break

	        # 3. Symbol mapping. Control words must not be followed by another
	        #    letter, otherwise \pm would corrupt \pmod and \leq \leqslant.
	        for command, symbol in self.unicode_map.items():
	            if not command:
	                continue
	            if re.fullmatch(r'\\[A-Za-z]+', command):
	                out = re.sub(re.escape(command) + r'(?![A-Za-z])',
	                             lambda _m, _s=symbol: _s, out)
	            else:
	                out = out.replace(command, symbol)

	        # 4. Cleanup: strip the backslash of remaining commands (e.g. \text)
	        #    and collapse whitespace.
	        out = re.sub(r'\\([A-Za-z]+)', r'\1', out)
	        out = re.sub(r'\s+', ' ', out).strip()

	        return out

	    # ----------------------------
	    # Main Pipeline
	    # ----------------------------
	    def process_latex_content(self, content: str, convert_mathml: bool = True) -> Dict[str, Any]:
	        """Run the full pipeline on *content*.

	        Args:
	            content: Raw text (may contain Markdown fences).
	            convert_mathml: Set False to skip MathML for this call only.

	        Returns:
	            ``{'cleaned_content': str, 'equations': [...]}`` where each
	            equation dict has ``type``, ``latex``, ``valid``, ``error``,
	            ``mathml``, ``unicode``, ``start_pos`` and ``end_pos``.
	        """
	        cleaned = self.sanitize_input(content)
	        want_mathml = convert_mathml and self.enable_mathml

	        enhanced_equations = []
	        for eq in self.extract_latex_equations(cleaned):
	            latex_snip = eq['latex']
	            is_valid, error = self.validate_latex(latex_snip)

	            # Only valid snippets are worth handing to the MathML converter.
	            mathml = None
	            if is_valid and want_mathml:
	                mathml = self.convert_latex_to_mathml(latex_snip)

	            enhanced_equations.append({
	                'type': eq.get('type', 'inline'),
	                'latex': latex_snip,
	                'valid': is_valid,
	                'error': error,
	                'mathml': mathml,
	                'unicode': self.convert_latex_to_unicode(latex_snip),
	                'start_pos': eq.get('start_pos'),
	                'end_pos': eq.get('end_pos'),
	            })

	        return {
	            'cleaned_content': cleaned,
	            'equations': enhanced_equations,
	        }


	# ----------------------------
	# Example usage
	# ----------------------------
	if __name__ == "__main__":
	    # Demo input covering inline math, escaped dollars/braces, a display
	    # equation, an align environment and a deliberately malformed snippet.
	    sample = r"""
	Here is some text with inline math $E=mc^2$ and escaped dollar \$100.

	A set definition with escaped braces (this caused bugs before):
	$S = \{ x \in \mathbb{R} \mid x > 0 \}$

	A display equation:
	$$
	\int_0^\infty x^2 e^{-x} \,dx = 2!
	$$

	An aligned environment:
	\begin{align}
	a &= b + c \\
	d &= e + f
	\end{align}

	And a malformed example: $unbalanced { braces $
	"""

	    proc = OptimizedLaTeXProcessor(enable_mathml=True)
	    result = proc.process_latex_content(sample)

	    print("--- CLEANED CONTENT (snippet) ---")
	    print(result['cleaned_content'][:100] + "...")

	    print("\n--- EQUATIONS FOUND ---")
	    # One report entry per extracted equation, numbered from 1.
	    for i, e in enumerate(result['equations'], 1):
	        print(f"\n#{i} Type: {e['type'].upper()}")
	        print(f" Raw: {e['latex']}")
	        print(f" Valid: {e['valid']} ({e['error'] if e['error'] else 'OK'})")
	        print(f" Unicode: {e['unicode']}")
	        # MathML is None when conversion was skipped or failed.
	        if e['mathml']:
	            print(f" MathML: {e['mathml'][:60]}...")