Spaces:
Running on Zero
Running on Zero
Fix MathML rendering: placeholder approach + HTML escaping + \frac OCR fix
Browse files
Three bugs fixed:
- Python-Markdown stopped capturing multi-line <div> blocks at blank lines.
Switch to placeholder approach: extract math → replace with ZZKEY tokens
→ run markdown → swap keys back for MathML/HTML.
- Unescaped '&' in LaTeX fallback blocks broke HTML (e.g. \begin{aligned}&...).
Use html.escape() in the <code> fallback in _to_mathml.
- OCR error \frac{1/2} (single-argument fraction) caused latex2mathml to fail.
Pre-process with re.sub to split into proper \frac{1}{2} form.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -11,6 +11,7 @@ import fitz
|
|
| 11 |
import re
|
| 12 |
import numpy as np
|
| 13 |
import base64
|
|
|
|
| 14 |
import markdown as md_lib
|
| 15 |
import latex2mathml.converter
|
| 16 |
|
|
@@ -153,45 +154,59 @@ math[display="block"] { display: block; overflow-x: auto; max-width: 100%; }
|
|
| 153 |
|
| 154 |
def _to_mathml(latex: str, display: bool) -> str:
|
| 155 |
"""Convert a LaTeX string to MathML. Falls back to a code block on error."""
|
|
|
|
|
|
|
| 156 |
try:
|
| 157 |
mathml = latex2mathml.converter.convert(latex)
|
| 158 |
if display:
|
| 159 |
mathml = re.sub(r'<math\b', '<math display="block"', mathml, count=1)
|
| 160 |
return mathml
|
| 161 |
except Exception:
|
|
|
|
| 162 |
if display:
|
| 163 |
-
return f'<pre class="math-fallback"><code>{
|
| 164 |
-
return f'<code class="math-fallback">{
|
| 165 |
|
| 166 |
def to_math_html(text: str) -> str:
|
| 167 |
"""Convert model markdown output to HTML with server-side MathML rendering.
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
"""
|
| 173 |
if not text:
|
| 174 |
return ""
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
# treats it as a raw HTML block and leaves it untouched.
|
| 179 |
-
def display_block(m):
|
| 180 |
-
mathml = _to_mathml(m.group(1).strip(), display=True)
|
| 181 |
-
return f'\n\n<div class="math-display">{mathml}</div>\n\n'
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
text = re.sub(r'\\\[(.+?)\\\]', display_block, text, flags=re.DOTALL)
|
| 184 |
-
|
| 185 |
-
# Remove orphaned \[ with no matching \] (truncated model output).
|
| 186 |
text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
#
|
| 189 |
-
text = re.sub(r'\\\((.+?)\\\)',
|
| 190 |
-
lambda m: _to_mathml(m.group(1).strip(), display=False),
|
| 191 |
-
text)
|
| 192 |
-
|
| 193 |
-
# --- remaining text: standard markdown (tables, bold, headings, images…) ---
|
| 194 |
html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
return f'<div class="math-preview">{html}</div>'
|
| 196 |
|
| 197 |
def embed_images(markdown, crops):
|
|
|
|
| 11 |
import re
|
| 12 |
import numpy as np
|
| 13 |
import base64
|
| 14 |
+
import html as html_lib
|
| 15 |
import markdown as md_lib
|
| 16 |
import latex2mathml.converter
|
| 17 |
|
|
|
|
| 154 |
|
| 155 |
def _to_mathml(latex: str, display: bool) -> str:
|
| 156 |
"""Convert a LaTeX string to MathML. Falls back to a code block on error."""
|
| 157 |
+
# Fix OCR error: \frac{n/m} (single-argument fraction) → \frac{n}{m}
|
| 158 |
+
latex = re.sub(r'\\frac\{(\d+)/(\d+)\}(?!\s*\{)', r'\\frac{\1}{\2}', latex)
|
| 159 |
try:
|
| 160 |
mathml = latex2mathml.converter.convert(latex)
|
| 161 |
if display:
|
| 162 |
mathml = re.sub(r'<math\b', '<math display="block"', mathml, count=1)
|
| 163 |
return mathml
|
| 164 |
except Exception:
|
| 165 |
+
escaped = html_lib.escape(latex)
|
| 166 |
if display:
|
| 167 |
+
return f'<pre class="math-fallback"><code>{escaped}</code></pre>'
|
| 168 |
+
return f'<code class="math-fallback">{escaped}</code>'
|
| 169 |
|
| 170 |
def to_math_html(text: str) -> str:
    r"""Convert model markdown output to HTML with server-side MathML rendering.

    Uses a placeholder approach: each formula is rendered up front and replaced
    in the text by a unique ``ZZ...ZZ`` token, the markdown pass runs on the
    token-bearing text, and the tokens are swapped back for the rendered
    markup afterwards. This avoids Python-Markdown mishandling multi-line
    <div> blocks that contain blank lines.

    Args:
        text: Markdown that may contain ``\[...\]`` display math and
            ``\(...\)`` inline math.

    Returns:
        The rendered HTML wrapped in ``<div class="math-preview">``, or an
        empty string when *text* is empty.
    """
    if not text:
        return ""

    # token -> rendered markup; len(blocks) doubles as the running index, so
    # token names are unique across display and inline math.
    blocks: dict[str, str] = {}

    def stash(prefix, rendered):
        key = f'ZZ{prefix}MATH{len(blocks)}ZZ'
        blocks[key] = rendered
        return key

    def display_block(m):
        mathml = _to_mathml(m.group(1).strip(), display=True)
        key = stash('DISPLAY', f'<div class="math-display">{mathml}</div>')
        # Blank lines make markdown treat the token as its own paragraph.
        return f'\n\n{key}\n\n'

    def inline_math(m):
        return stash('INLINE', _to_mathml(m.group(1).strip(), display=False))

    # Replace display math \[...\] with placeholder tokens. The lookbehind
    # keeps a LaTeX line break with optional spacing (\\[2pt]) inside inline
    # math from being mistaken for a display-math opener.
    text = re.sub(r'(?<!\\)\\\[(.+?)\\\]', display_block, text, flags=re.DOTALL)
    # Remove orphaned \[ with no matching \] (truncated model output); this
    # intentionally drops everything from the orphan to the end of the text.
    text = re.sub(r'(?<!\\)\\\[.*', '', text, flags=re.DOTALL)
    # Replace inline math \(...\) with placeholder tokens (single-line only).
    text = re.sub(r'\\\((.+?)\\\)', inline_math, text)

    # Run markdown on text that now contains only safe placeholder tokens.
    html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists'])

    def restore(m):
        token = m.group(1) or m.group(2)
        # A look-alike token that was never issued is left as written.
        return blocks.get(token, m.group(0))

    # Swap placeholders back in one pass. A token that markdown wrapped as a
    # paragraph of its own (<p>KEY</p>) loses the wrapper, as before. A single
    # pass also means inserted markup is never rescanned for further tokens.
    html = re.sub(
        r'<p>(ZZ(?:DISPLAY|INLINE)MATH\d+ZZ)</p>|(ZZ(?:DISPLAY|INLINE)MATH\d+ZZ)',
        restore,
        html,
    )

    return f'<div class="math-preview">{html}</div>'
|
| 211 |
|
| 212 |
def embed_images(markdown, crops):
|