ricklon committed on
Commit
4d410ae
·
1 Parent(s): 5c1486c

Add spatial HTML preview mode using grounding boxes

Browse files
Files changed (1) hide show
  1. app.py +243 -53
app.py CHANGED
@@ -9,6 +9,7 @@ import shutil
9
  from PIL import Image, ImageDraw, ImageFont, ImageOps
10
  import fitz
11
  import re
 
12
  import numpy as np
13
  import base64
14
  import html as html_lib
@@ -148,10 +149,87 @@ math[display="block"] { display: block; overflow-x: auto; max-width: 100%; }
148
  .math-preview pre code { background: none; padding: 0; }
149
  .math-preview blockquote { border-left: 4px solid #ccc; margin: 0.8em 0; padding: 0.4em 1em; color: #555; background: #fafafa; }
150
  .math-preview img { max-width: 100%; height: auto; display: block; margin: 0.8em 0; }
 
151
  .math-fallback { color: #888; font-style: italic; }
152
  </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  """
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def _to_mathml(latex: str, display: bool) -> str:
156
  """Convert a LaTeX string to MathML. Falls back to a code block on error."""
157
  # Fix OCR error: \frac{n/m} (single-argument fraction) → \frac{n}{m}
@@ -203,6 +281,7 @@ def to_math_html(text: str) -> str:
203
  text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
204
  # Replace inline math \(...\) with placeholder tokens
205
  text = re.sub(r'\\\((.+?)\\\)', inline_math, text)
 
206
 
207
  # Run markdown on text that now contains only safe placeholder tokens
208
  html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
@@ -233,63 +312,170 @@ def to_math_html(text: str) -> str:
233
  for key, literal in literals.items():
234
  html = html.replace(key, html_lib.escape(literal))
235
 
 
 
236
  return f'<div class="math-preview">{html}</div>'
237
 
238
  def to_mathjax_html(text: str) -> str:
239
  """Render markdown to HTML and typeset math client-side with MathJax."""
240
  if not text:
241
  return ""
 
242
  html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
243
- return f"""<!doctype html>
244
- <html>
245
- <head>
246
- <meta charset="utf-8" />
247
- <style>
248
- body {{
249
- margin: 0;
250
- font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
251
- font-size: 15px;
252
- line-height: 1.8;
253
- color: #1a1a1a;
254
- }}
255
- .mathjax-preview {{
256
- padding: 1.5em;
257
- max-width: 100%;
258
- overflow-x: auto;
259
- }}
260
- .mathjax-preview h1 {{ font-size: 1.8em; font-weight: 700; margin: 1em 0 0.4em; border-bottom: 2px solid #e0e0e0; padding-bottom: 0.3em; }}
261
- .mathjax-preview h2 {{ font-size: 1.4em; font-weight: 600; margin: 1em 0 0.4em; border-bottom: 1px solid #e0e0e0; padding-bottom: 0.2em; }}
262
- .mathjax-preview h3 {{ font-size: 1.15em; font-weight: 600; margin: 0.9em 0 0.3em; }}
263
- .mathjax-preview p {{ margin: 0.6em 0; }}
264
- .mathjax-preview ul, .mathjax-preview ol {{ padding-left: 1.8em; margin: 0.5em 0; }}
265
- .mathjax-preview li {{ margin: 0.25em 0; }}
266
- .mathjax-preview table {{ border-collapse: collapse; width: 100%; margin: 1em 0; font-size: 0.95em; }}
267
- .mathjax-preview th, .mathjax-preview td {{ border: 1px solid #ccc; padding: 0.45em 0.75em; text-align: left; }}
268
- .mathjax-preview th {{ background: #f2f2f2; font-weight: 600; }}
269
- .mathjax-preview tr:nth-child(even) {{ background: #fafafa; }}
270
- .mathjax-preview code {{ background: #f4f4f4; padding: 0.15em 0.4em; border-radius: 3px; font-family: 'Courier New', monospace; font-size: 0.88em; }}
271
- .mathjax-preview pre {{ background: #f4f4f4; padding: 1em; border-radius: 5px; overflow-x: auto; margin: 0.8em 0; }}
272
- .mathjax-preview pre code {{ background: none; padding: 0; }}
273
- .mathjax-preview blockquote {{ border-left: 4px solid #ccc; margin: 0.8em 0; padding: 0.4em 1em; color: #555; background: #fafafa; }}
274
- .mathjax-preview img {{ max-width: 100%; height: auto; display: block; margin: 0.8em 0; }}
275
- </style>
276
- <script>
277
- window.MathJax = {{
278
- tex: {{
279
- inlineMath: [['\\\\(', '\\\\)'], ['$', '$']],
280
- displayMath: [['\\\\[', '\\\\]'], ['$$', '$$']]
281
- }},
282
- options: {{
283
- skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
284
- }}
285
- }};
286
- </script>
287
- <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
288
- </head>
289
- <body>
290
- <div class="mathjax-preview">{html}</div>
291
- </body>
292
- </html>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
  def embed_images(markdown, crops):
295
  if not crops:
@@ -461,6 +647,9 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
461
  with gr.Tab("HTML + MathJax", id="tab_html"):
462
  html_out = gr.HTML("")
463
  html_source_out = gr.Code(label="Generated HTML Source", language="html", lines=16)
 
 
 
464
  with gr.Tab("Boxes", id="tab_boxes"):
465
  img_out = gr.Image(type="pil", height=500, show_label=False)
466
  with gr.Tab("Cropped Images", id="tab_crops"):
@@ -520,7 +709,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
520
  elif image is not None:
521
  cleaned, markdown, raw, img_out, crops = process_image(image, task, custom_prompt)
522
  else:
523
- return "Error: Upload a file or image", "", "", "", "", None, [], gr.DownloadButton(visible=False)
524
 
525
  # Text tab: convert \[...\] → $$...$$ and \(...\) → $...$ for readability
526
  text_display = re.sub(r'\\\[(.+?)\\\]',
@@ -534,12 +723,13 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
534
  dl_tmp.close()
535
 
536
  mathjax_html = to_mathjax_html(markdown)
 
537
 
538
- return (text_display, to_math_html(markdown), mathjax_html, mathjax_html, raw, img_out, crops,
539
  gr.DownloadButton(value=dl_tmp.name, visible=True))
540
 
541
  submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
542
- [text_out, md_out, html_out, html_source_out, raw_out, img_out, gallery, download_btn])
543
  submit_event.then(select_boxes, [task], [tabs])
544
 
545
  if __name__ == "__main__":
 
9
  from PIL import Image, ImageDraw, ImageFont, ImageOps
10
  import fitz
11
  import re
12
+ import ast
13
  import numpy as np
14
  import base64
15
  import html as html_lib
 
149
  .math-preview pre code { background: none; padding: 0; }
150
  .math-preview blockquote { border-left: 4px solid #ccc; margin: 0.8em 0; padding: 0.4em 1em; color: #555; background: #fafafa; }
151
  .math-preview img { max-width: 100%; height: auto; display: block; margin: 0.8em 0; }
152
+ .math-preview .ocr-gap, .mathjax-preview .ocr-gap { width: 100%; }
153
  .math-fallback { color: #888; font-style: italic; }
154
  </style>
155
+ <script>
156
+ (() => {
157
+ if (window.__ocrMathJaxInit) return;
158
+ window.__ocrMathJaxInit = true;
159
+
160
+ if (!window.MathJax) {
161
+ window.MathJax = {
162
+ tex: {
163
+ inlineMath: [['\\\\(', '\\\\)'], ['$', '$']],
164
+ displayMath: [['\\\\[', '\\\\]'], ['$$', '$$']]
165
+ },
166
+ options: {
167
+ skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
168
+ }
169
+ };
170
+ }
171
+
172
+ const typeset = () => {
173
+ if (window.MathJax?.typesetPromise) {
174
+ const nodes = Array.from(document.querySelectorAll('.mathjax-preview, .spatial-preview'));
175
+ if (nodes.length) window.MathJax.typesetPromise(nodes).catch(() => {});
176
+ }
177
+ };
178
+ window.__typesetOcrMath = typeset;
179
+
180
+ const ensureScript = () => {
181
+ if (document.getElementById('mathjax-ocr-preview')) return;
182
+ const script = document.createElement('script');
183
+ script.id = 'mathjax-ocr-preview';
184
+ script.async = true;
185
+ script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js';
186
+ script.onload = () => setTimeout(typeset, 20);
187
+ document.head.appendChild(script);
188
+ };
189
+
190
+ ensureScript();
191
+ setTimeout(typeset, 100);
192
+
193
+ const observer = new MutationObserver((mutations) => {
194
+ for (const m of mutations) {
195
+ for (const n of m.addedNodes) {
196
+ if (n.nodeType !== 1) continue;
197
+ if (n.matches?.('.mathjax-preview, .spatial-preview') || n.querySelector?.('.mathjax-preview, .spatial-preview')) {
198
+ setTimeout(typeset, 30);
199
+ return;
200
+ }
201
+ }
202
+ }
203
+ });
204
+ observer.observe(document.body, { childList: true, subtree: true });
205
+ })();
206
+ </script>
207
  """
208
 
209
+ def _inject_spatial_gap_placeholders(text: str):
210
+ """Preserve runs of blank lines so OCR spacing is visible in preview."""
211
+ gaps: dict[str, int] = {}
212
+ counter = [0]
213
+
214
+ def repl(m):
215
+ key = f'ZZOCRGAP{counter[0]}ZZ'
216
+ counter[0] += 1
217
+ # Two newlines are a normal paragraph break; extras represent vertical spacing.
218
+ gaps[key] = max(1, len(m.group(0)) - 2)
219
+ return f'\n\n{key}\n\n'
220
+
221
+ return re.sub(r'\n{3,}', repl, text), gaps
222
+
223
+ def _restore_spatial_gap_placeholders(html: str, gaps: dict[str, int]) -> str:
224
+ if not gaps:
225
+ return html
226
+ for key, extra_lines in gaps.items():
227
+ gap_em = min(10.0, 0.9 * extra_lines)
228
+ block = f'<div class="ocr-gap" style="height:{gap_em:.2f}em"></div>'
229
+ html = html.replace(f'<p>{key}</p>', block)
230
+ html = html.replace(key, block)
231
+ return html
232
+
233
  def _to_mathml(latex: str, display: bool) -> str:
234
  """Convert a LaTeX string to MathML. Falls back to a code block on error."""
235
  # Fix OCR error: \frac{n/m} (single-argument fraction) → \frac{n}{m}
 
281
  text = re.sub(r'\\\[.*', '', text, flags=re.DOTALL)
282
  # Replace inline math \(...\) with placeholder tokens
283
  text = re.sub(r'\\\((.+?)\\\)', inline_math, text)
284
+ text, gaps = _inject_spatial_gap_placeholders(text)
285
 
286
  # Run markdown on text that now contains only safe placeholder tokens
287
  html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
 
312
  for key, literal in literals.items():
313
  html = html.replace(key, html_lib.escape(literal))
314
 
315
+ html = _restore_spatial_gap_placeholders(html, gaps)
316
+
317
  return f'<div class="math-preview">{html}</div>'
318
 
319
def to_mathjax_html(text: str) -> str:
    """Render markdown to HTML and typeset math client-side with MathJax.

    Delimited math (backslash-paren, backslash-bracket and double-dollar
    spans) is lifted out before the markdown pass and put back afterwards,
    HTML-escaped, so the delimiters and TeX source reach MathJax intact.

    Args:
        text: Markdown source, possibly containing TeX math.

    Returns:
        An HTML fragment wrapped in ``<div class="mathjax-preview">``, or an
        empty string when ``text`` is empty.
    """
    if not text:
        return ""

    # Python-Markdown treats a backslash followed by punctuation as an escape,
    # so "\(" "\[" "\\" "\{" would lose their backslash and break the math.
    stashed: dict[str, str] = {}

    def stash(m: re.Match) -> str:
        key = f'ZZOCRMJX{len(stashed)}ZZ'
        stashed[key] = m.group(0)
        return key

    # Display math may span several lines; inline \( ... \) stays on one line
    # so an unmatched opener cannot swallow the rest of the document.
    text = re.sub(r'(?s:\\\[.+?\\\])|(?s:\$\$.+?\$\$)|\\\(.+?\\\)', stash, text)

    text, gaps = _inject_spatial_gap_placeholders(text)
    html = md_lib.markdown(text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
    html = _restore_spatial_gap_placeholders(html, gaps)

    # Escape <, > and & so the browser hands MathJax the original TeX text.
    for key, literal in stashed.items():
        html = html.replace(key, html_lib.escape(literal, quote=False))

    return f'<div class="mathjax-preview">{html}</div>'
327
+
328
+ def _grounding_blocks_from_raw(raw_text: str):
329
+ if not raw_text:
330
+ return []
331
+
332
+ pattern = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
333
+ blocks = []
334
+ last_end = 0
335
+
336
+ for m in pattern.finditer(raw_text):
337
+ label = m.group(1).strip() or "text"
338
+ coord_text = m.group(2).strip()
339
+ text_chunk = raw_text[last_end:m.start()].strip()
340
+ last_end = m.end()
341
+
342
+ try:
343
+ coords = ast.literal_eval(coord_text)
344
+ except (SyntaxError, ValueError):
345
+ continue
346
+
347
+ if isinstance(coords, (tuple, list)) and coords and isinstance(coords[0], (int, float)):
348
+ coords = [coords]
349
+ if not isinstance(coords, list):
350
+ continue
351
+
352
+ boxes = [c for c in coords if isinstance(c, (list, tuple)) and len(c) >= 4]
353
+ if not boxes:
354
+ continue
355
+
356
+ x1 = max(0.0, min(float(c[0]) for c in boxes))
357
+ y1 = max(0.0, min(float(c[1]) for c in boxes))
358
+ x2 = min(999.0, max(float(c[2]) for c in boxes))
359
+ y2 = min(999.0, max(float(c[3]) for c in boxes))
360
+ if x2 <= x1 or y2 <= y1:
361
+ continue
362
+
363
+ blocks.append({
364
+ "label": label,
365
+ "text": text_chunk,
366
+ "x1": x1,
367
+ "y1": y1,
368
+ "x2": x2,
369
+ "y2": y2,
370
+ })
371
+
372
+ return blocks
373
+
374
# Accent colour per grounding label; unknown labels use the default grey.
_SPATIAL_BLOCK_COLORS = {
    "title": "#8b5cf6",
    "text": "#2563eb",
    "image": "#059669",
    "table": "#d97706",
    "formula": "#dc2626",
}
_SPATIAL_DEFAULT_COLOR = "#4b5563"

# Static stylesheet for the spatial preview; kept out of any f-string so the
# CSS braces do not need doubling.
_SPATIAL_STYLE = """
<style>
.spatial-preview {
padding: 1rem;
}
.spatial-canvas {
position: relative;
width: 100%;
min-height: 72vh;
aspect-ratio: 1 / 1.35;
background: linear-gradient(180deg, #fcfdff 0%, #f7f9fc 100%);
border: 1px solid #d8dee9;
border-radius: 8px;
overflow: auto;
}
.spatial-block {
position: absolute;
box-sizing: border-box;
border: 1px solid var(--block-color);
background: color-mix(in srgb, var(--block-color) 7%, white);
border-radius: 6px;
padding: 0.35rem 0.5rem;
overflow: hidden;
}
.spatial-block > header {
font-size: 11px;
font-weight: 700;
letter-spacing: 0.03em;
text-transform: uppercase;
color: var(--block-color);
margin-bottom: 0.25rem;
}
.spatial-block > section {
font-size: 13px;
line-height: 1.35;
}
.spatial-block p { margin: 0.2rem 0; }
.spatial-fallback {
margin-top: 1rem;
padding-top: 0.5rem;
border-top: 1px solid #d8dee9;
}
</style>
"""


def _render_spatial_block(block: dict) -> str:
    """Render one grounding block as an absolutely-positioned ``<article>``."""
    label = block["label"]
    color = _SPATIAL_BLOCK_COLORS.get(label.lower(), _SPATIAL_DEFAULT_COLOR)

    body = block["text"].strip()
    body_html = ""
    if body:
        # NOTE(review): markdown consumes backslash escapes and passes raw HTML
        # through, so TeX delimiters in the body may not survive and OCR text
        # is not sanitised here -- confirm whether that is acceptable.
        body_text, gaps = _inject_spatial_gap_placeholders(body)
        body_html = md_lib.markdown(body_text, extensions=['tables', 'fenced_code', 'sane_lists', 'nl2br'])
        body_html = _restore_spatial_gap_placeholders(body_html, gaps)
    if not body_html:
        # Blocks without text still get a visible box showing their label.
        body_html = f"<p><em>{html_lib.escape(label)}</em></p>"

    # Box coordinates are on a 0-999 scale; convert to percentages of the canvas.
    left = block["x1"] / 999.0 * 100.0
    top = block["y1"] / 999.0 * 100.0
    width = max(1.0, (block["x2"] - block["x1"]) / 999.0 * 100.0)
    height = max(1.2, (block["y2"] - block["y1"]) / 999.0 * 100.0)

    return (
        f'\n<article class="spatial-block" style="left:{left:.2f}%; top:{top:.2f}%; '
        f'width:{width:.2f}%; min-height:{height:.2f}%; --block-color:{color};">\n'
        f'<header>{html_lib.escape(label)}</header>\n'
        f'<section>{body_html}</section>\n'
        f'</article>\n'
    )


def to_spatial_html(raw_text: str, markdown_text: str) -> str:
    """Render OCR content using grounding boxes for spatially-positioned blocks.

    Args:
        raw_text: Raw model output; grounding blocks are extracted from it
            with ``_grounding_blocks_from_raw``.
        markdown_text: Linear markdown rendering of the same content.

    Returns:
        An HTML fragment: a stylesheet plus a canvas holding one positioned
        ``<article>`` per block. When no blocks are found, the result of
        ``to_mathjax_html(markdown_text)`` is returned instead. When the
        blocks carry little text compared with ``markdown_text``, a
        collapsible linear rendering is appended below the canvas.
    """
    blocks = _grounding_blocks_from_raw(raw_text)
    if not blocks:
        return to_mathjax_html(markdown_text)

    # Emit blocks top-to-bottom, then left-to-right.
    ordered = sorted(blocks, key=lambda b: (b["y1"], b["x1"]))
    rendered = "".join(_render_spatial_block(b) for b in ordered)
    used_text = sum(len(b["text"].strip()) for b in blocks)

    fallback = ""
    # Offer the linear rendering when the boxes cover under 40% of the
    # markdown (or fewer than 120 characters overall).
    if markdown_text and used_text < max(120, int(len(markdown_text) * 0.4)):
        fallback = (
            '\n<details class="spatial-fallback">\n'
            '<summary>Show full linear markdown rendering</summary>\n'
            f'{to_mathjax_html(markdown_text)}\n'
            '</details>\n'
        )

    return (
        f'{_SPATIAL_STYLE}'
        '<div class="spatial-preview mathjax-preview">\n'
        '<div class="spatial-canvas">\n'
        f'{rendered}\n'
        '</div>\n'
        f'{fallback}\n'
        '</div>\n'
    )
479
 
480
  def embed_images(markdown, crops):
481
  if not crops:
 
647
  with gr.Tab("HTML + MathJax", id="tab_html"):
648
  html_out = gr.HTML("")
649
  html_source_out = gr.Code(label="Generated HTML Source", language="html", lines=16)
650
+ with gr.Tab("Spatial HTML", id="tab_spatial"):
651
+ spatial_out = gr.HTML("")
652
+ spatial_source_out = gr.Code(label="Spatial HTML Source", language="html", lines=16)
653
  with gr.Tab("Boxes", id="tab_boxes"):
654
  img_out = gr.Image(type="pil", height=500, show_label=False)
655
  with gr.Tab("Cropped Images", id="tab_crops"):
 
709
  elif image is not None:
710
  cleaned, markdown, raw, img_out, crops = process_image(image, task, custom_prompt)
711
  else:
712
+ return "Error: Upload a file or image", "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False)
713
 
714
  # Text tab: convert \[...\] → $$...$$ and \(...\) → $...$ for readability
715
  text_display = re.sub(r'\\\[(.+?)\\\]',
 
723
  dl_tmp.close()
724
 
725
  mathjax_html = to_mathjax_html(markdown)
726
+ spatial_html = to_spatial_html(raw, markdown)
727
 
728
+ return (text_display, to_math_html(markdown), mathjax_html, mathjax_html, spatial_html, spatial_html, raw, img_out, crops,
729
  gr.DownloadButton(value=dl_tmp.name, visible=True))
730
 
731
  submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
732
+ [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn])
733
  submit_event.then(select_boxes, [task], [tabs])
734
 
735
  if __name__ == "__main__":