ricklon Claude Sonnet 4.6 committed on
Commit
9d4d37e
·
1 Parent(s): dc85ea9

Fix MathML rendering: placeholder approach + HTML escaping + \frac OCR fix

Browse files

Three bugs fixed:
- Python-Markdown stopped capturing multi-line <div> blocks at blank lines.
Switch to placeholder approach: extract math → replace with ZZKEY tokens
→ run markdown → swap keys back for MathML/HTML.
- Unescaped '&' in LaTeX fallback blocks broke HTML (e.g. \begin{aligned}&...).
Use html.escape() in the <code> fallback in _to_mathml.
- OCR error \frac{1/2} (single-argument fraction) caused latex2mathml to fail.
Pre-process with re.sub to split into proper \frac{1}{2} form.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +34 -19
app.py CHANGED
@@ -11,6 +11,7 @@ import fitz
11
  import re
12
  import numpy as np
13
  import base64
 
14
  import markdown as md_lib
15
  import latex2mathml.converter
16
 
@@ -153,45 +154,59 @@ math[display="block"] { display: block; overflow-x: auto; max-width: 100%; }
153
 
154
  def _to_mathml(latex: str, display: bool) -> str:
155
  """Convert a LaTeX string to MathML. Falls back to a code block on error."""
 
 
156
  try:
157
  mathml = latex2mathml.converter.convert(latex)
158
  if display:
159
  mathml = re.sub(r'<math\b', '<math display="block"', mathml, count=1)
160
  return mathml
161
  except Exception:
 
162
  if display:
163
- return f'<pre class="math-fallback"><code>{latex}</code></pre>'
164
- return f'<code class="math-fallback">{latex}</code>'
165
 
166
  def to_math_html(text: str) -> str:
167
  """Convert model markdown output to HTML with server-side MathML rendering.
168
 
169
- LaTeX is converted to MathML by latex2mathml (pure Python, no JS required).
170
- The MathML is embedded directly in the HTML before the markdown pass so the
171
- markdown engine never touches LaTeX backslashes or delimiters.
172
  """
173
  if not text:
174
  return ""
175
 
176
- # --- display math \[...\] block MathML wrapped in <div> ---
177
- # Insert as a proper block (blank lines around the div) so Python-Markdown
178
- # treats it as a raw HTML block and leaves it untouched.
179
- def display_block(m):
180
- mathml = _to_mathml(m.group(1).strip(), display=True)
181
- return f'\n\n<div class="math-display">{mathml}</div>\n\n'
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  text = re.sub(r'\\\[(.+?)\\\]', display_block, text, flags=re.DOTALL)
184
-
185
- # Remove orphaned \[ with no matching \] (truncated model output).
186
  text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
 
 
187
 
188
- # --- inline math \(...\) inline MathML ---
189
- text = re.sub(r'\\\((.+?)\\\)',
190
- lambda m: _to_mathml(m.group(1).strip(), display=False),
191
- text)
192
-
193
- # --- remaining text: standard markdown (tables, bold, headings, images…) ---
194
  html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists'])
 
 
 
 
 
 
195
  return f'<div class="math-preview">{html}</div>'
196
 
197
  def embed_images(markdown, crops):
 
11
  import re
12
  import numpy as np
13
  import base64
14
+ import html as html_lib
15
  import markdown as md_lib
16
  import latex2mathml.converter
17
 
 
154
 
155
  def _to_mathml(latex: str, display: bool) -> str:
156
  """Convert a LaTeX string to MathML. Falls back to a code block on error."""
157
+ # Fix OCR error: \frac{n/m} (single-argument fraction) → \frac{n}{m}
158
+ latex = re.sub(r'\\frac\{(\d+)/(\d+)\}(?!\s*\{)', r'\\frac{\1}{\2}', latex)
159
  try:
160
  mathml = latex2mathml.converter.convert(latex)
161
  if display:
162
  mathml = re.sub(r'<math\b', '<math display="block"', mathml, count=1)
163
  return mathml
164
  except Exception:
165
+ escaped = html_lib.escape(latex)
166
  if display:
167
+ return f'<pre class="math-fallback"><code>{escaped}</code></pre>'
168
+ return f'<code class="math-fallback">{escaped}</code>'
169
 
def to_math_html(text: str) -> str:
    """Convert model markdown output to HTML with server-side MathML rendering.

    Uses a placeholder approach: each math span is rendered up front, stored
    under a unique token, and replaced by that token before the markdown pass.
    After markdown has run, the tokens are swapped back for the stored markup,
    so the markdown engine never sees LaTeX or multi-line HTML blocks.

    Args:
        text: Markdown that may contain display math and inline math written
            with backslash-bracket and backslash-paren delimiters.

    Returns:
        The rendered HTML wrapped in ``<div class="math-preview">``, or an
        empty string when ``text`` is empty.
    """
    if not text:
        return ""

    # token -> rendered markup; the current size doubles as the next index,
    # so display and inline tokens share one running counter.
    blocks: dict[str, str] = {}

    def display_block(m):
        key = f'ZZDISPLAYMATH{len(blocks)}ZZ'
        rendered = _to_mathml(m.group(1).strip(), display=True)
        blocks[key] = f'<div class="math-display">{rendered}</div>'
        # Blank lines make markdown treat the token as its own paragraph.
        return f'\n\n{key}\n\n'

    def inline_math(m):
        key = f'ZZINLINEMATH{len(blocks)}ZZ'
        blocks[key] = _to_mathml(m.group(1).strip(), display=False)
        return key

    # Replace display math \[...\] with placeholder tokens.
    text = re.sub(r'\\\[(.+?)\\\]', display_block, text, flags=re.DOTALL)
    # Remove an orphaned \[ with no matching \] (truncated model output),
    # together with everything after it.
    text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
    # Replace inline math \(...\) with placeholder tokens (single-line only).
    text = re.sub(r'\\\((.+?)\\\)', inline_math, text)

    # Run markdown on text that now contains only safe placeholder tokens.
    html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists'])

    def restore(m):
        key = m.group(1) or m.group(2)
        # Token-shaped text that was never registered is left untouched.
        return blocks.get(key, m.group(0))

    # Swap placeholders back in one pass so restored markup is never rescanned.
    # Only a display token standing alone in a paragraph sheds its <p> wrapper
    # (a <div> must not sit inside <p>); an inline token keeps its paragraph.
    html = re.sub(
        r'<p>(ZZDISPLAYMATH\d+ZZ)</p>|(ZZ(?:DISPLAY|INLINE)MATH\d+ZZ)',
        restore,
        html,
    )

    return f'<div class="math-preview">{html}</div>'
211
 
212
  def embed_images(markdown, crops):