ricklon committed on
Commit
f76cb58
·
1 Parent(s): 474fd39

Refine equation grounding with zoom-in pass and per-box spatial blocks

Browse files
Files changed (2) hide show
  1. app.py +241 -72
  2. tests/test_spatial_blocks.py +56 -0
app.py CHANGED
@@ -44,6 +44,19 @@ model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation=_attn_impl, t
44
  BASE_SIZE = 1024
45
  IMAGE_SIZE = 768
46
  CROP_MODE = True
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  TASK_PROMPTS = {
49
  "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
@@ -54,8 +67,112 @@ TASK_PROMPTS = {
54
  }
55
 
56
  def extract_grounding_references(text):
57
- pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
58
- return re.findall(pattern, text, re.DOTALL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def draw_bounding_boxes(image, refs, extract_images=False):
61
  img_w, img_h = image.size
@@ -75,7 +192,7 @@ def draw_bounding_boxes(image, refs, extract_images=False):
75
  color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))
76
 
77
  color = color_map[label]
78
- coords = eval(ref[2])
79
  color_a = color + (60,)
80
 
81
  for box in coords:
@@ -326,48 +443,20 @@ def to_mathjax_html(text: str) -> str:
326
  return f'<div class="mathjax-preview">{html}</div>'
327
 
328
  def _grounding_blocks_from_raw(raw_text: str):
329
- if not raw_text:
330
- return []
331
-
332
- pattern = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
333
  blocks = []
334
- last_end = 0
335
-
336
- for m in pattern.finditer(raw_text):
337
- label = m.group(1).strip() or "text"
338
- coord_text = m.group(2).strip()
339
- text_chunk = raw_text[last_end:m.start()].strip()
340
- last_end = m.end()
341
-
342
- try:
343
- coords = ast.literal_eval(coord_text)
344
- except (SyntaxError, ValueError):
345
- continue
346
-
347
- if isinstance(coords, (tuple, list)) and coords and isinstance(coords[0], (int, float)):
348
- coords = [coords]
349
- if not isinstance(coords, list):
350
- continue
351
-
352
- boxes = [c for c in coords if isinstance(c, (list, tuple)) and len(c) >= 4]
353
- if not boxes:
354
- continue
355
-
356
- x1 = max(0.0, min(float(c[0]) for c in boxes))
357
- y1 = max(0.0, min(float(c[1]) for c in boxes))
358
- x2 = min(999.0, max(float(c[2]) for c in boxes))
359
- y2 = min(999.0, max(float(c[3]) for c in boxes))
360
- if x2 <= x1 or y2 <= y1:
361
- continue
362
-
363
- blocks.append({
364
- "label": label,
365
- "text": text_chunk,
366
- "x1": x1,
367
- "y1": y1,
368
- "x2": x2,
369
- "y2": y2,
370
- })
371
 
372
  return blocks
373
 
@@ -487,6 +576,106 @@ def embed_images(markdown, crops):
487
  markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
488
  return markdown
489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  @spaces.GPU(duration=90)
491
  def process_image(image, task, custom_prompt):
492
  model.cuda() # GPU is available here — works on ZeroGPU and locally
@@ -508,33 +697,7 @@ def process_image(image, task, custom_prompt):
508
  else:
509
  prompt = TASK_PROMPTS[task]["prompt"]
510
  has_grounding = TASK_PROMPTS[task]["has_grounding"]
511
-
512
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
513
- image.save(tmp.name, 'JPEG', quality=95)
514
- tmp.close()
515
- out_dir = tempfile.mkdtemp()
516
-
517
- stdout = sys.stdout
518
- sys.stdout = StringIO()
519
-
520
- model.infer(
521
- tokenizer=tokenizer,
522
- prompt=prompt,
523
- image_file=tmp.name,
524
- output_path=out_dir,
525
- base_size=BASE_SIZE,
526
- image_size=IMAGE_SIZE,
527
- crop_mode=CROP_MODE,
528
- save_results=False
529
- )
530
-
531
- debug_filters = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
532
- result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
533
- if l.strip() and not any(s in l for s in debug_filters)]).strip()
534
- sys.stdout = stdout
535
-
536
- os.unlink(tmp.name)
537
- shutil.rmtree(out_dir, ignore_errors=True)
538
 
539
  if not result:
540
  return "No text detected", "", "", None, []
@@ -544,15 +707,21 @@ def process_image(image, task, custom_prompt):
544
 
545
  img_out = None
546
  crops = []
 
547
 
548
  if has_grounding and '<|ref|>' in result:
549
  refs = extract_grounding_references(result)
 
 
550
  if refs:
551
  img_out, crops = draw_bounding_boxes(image, refs, True)
 
 
 
552
 
553
  markdown = embed_images(markdown, crops)
554
 
555
- return cleaned, markdown, result, img_out, crops
556
 
557
  @spaces.GPU(duration=90)
558
  def process_pdf(path, task, custom_prompt, page_num):
 
44
  BASE_SIZE = 1024
45
  IMAGE_SIZE = 768
46
  CROP_MODE = True
47
+ GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
48
+ INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
49
+ EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
50
+ EQUATION_ZOOM_MAX_CANDIDATES = 6
51
+ EQUATION_ZOOM_MIN_AREA = 0.05
52
+ EQUATION_ZOOM_MIN_DIM = 0.24
53
+ EQUATION_ZOOM_PADDING = 0.025
54
+ EQUATION_ZOOM_MAX_ASPECT = 12.0
55
+ EQUATION_DETAIL_MAX_BOXES = 24
56
+ EQUATION_DETAIL_IOU_DEDUPE = 0.7
57
+ MATH_LABEL_HINTS = ("formula", "equation", "math")
58
+ MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
59
+ MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
60
 
61
  TASK_PROMPTS = {
62
  "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
 
67
  }
68
 
69
  def extract_grounding_references(text):
70
+ refs = []
71
+ for entry in _extract_grounding_entries(text):
72
+ coord_text = repr(entry["coords"])
73
+ raw = f'<|ref|>{entry["label"]}<|/ref|><|det|>{coord_text}<|/det|>'
74
+ refs.append((raw, entry["label"], coord_text))
75
+ return refs
76
+
77
+ def _parse_coord_payload(payload):
78
+ if isinstance(payload, str):
79
+ try:
80
+ coords = ast.literal_eval(payload.strip())
81
+ except (SyntaxError, ValueError):
82
+ return []
83
+ else:
84
+ coords = payload
85
+
86
+ if isinstance(coords, (tuple, list)) and coords and isinstance(coords[0], (int, float)):
87
+ coords = [coords]
88
+ if not isinstance(coords, list):
89
+ return []
90
+
91
+ out = []
92
+ for c in coords:
93
+ if not isinstance(c, (list, tuple)) or len(c) < 4:
94
+ continue
95
+ x1, y1, x2, y2 = [float(v) for v in c[:4]]
96
+ x1, x2 = sorted((max(0.0, min(999.0, x1)), max(0.0, min(999.0, x2))))
97
+ y1, y2 = sorted((max(0.0, min(999.0, y1)), max(0.0, min(999.0, y2))))
98
+ if x2 <= x1 or y2 <= y1:
99
+ continue
100
+ out.append([x1, y1, x2, y2])
101
+ return out
102
+
103
def _extract_grounding_entries(raw_text: str):
    """Parse <|ref|>…<|/ref|><|det|>…<|/det|> tags from raw model output.

    Returns a list of dicts with keys "label", "coords" (list of
    [x1, y1, x2, y2] boxes) and "text" (the raw text preceding the tag).
    Tags whose coordinate payload cannot be parsed are skipped entirely.
    """
    if not raw_text:
        return []

    collected = []
    cursor = 0
    for match in GROUNDING_PATTERN.finditer(raw_text):
        name = match.group(1).strip() or "text"
        boxes = _parse_coord_payload(match.group(2))
        if not boxes:
            # Unparseable payload: leave the cursor alone so the bad tag's
            # text stays attached to the next entry's preceding chunk,
            # matching the original skip behavior.
            continue
        collected.append({
            "label": name,
            "coords": boxes,
            "text": raw_text[cursor:match.start()].strip(),
        })
        cursor = match.end()
    return collected
122
+
123
def _math_marker_score(text_chunk: str) -> int:
    """Heuristic "math-ness" score: strong LaTeX markers count 3, weak ones 1."""
    strong = sum(3 for marker in MATH_STRONG_MARKERS if marker in text_chunk)
    weak = sum(1 for marker in MATH_WEAK_MARKERS if marker in text_chunk)
    return strong + weak
132
+
133
+ def _box_iou(a, b):
134
+ ax1, ay1, ax2, ay2 = a
135
+ bx1, by1, bx2, by2 = b
136
+ inter_x1 = max(ax1, bx1)
137
+ inter_y1 = max(ay1, by1)
138
+ inter_x2 = min(ax2, bx2)
139
+ inter_y2 = min(ay2, by2)
140
+ if inter_x2 <= inter_x1 or inter_y2 <= inter_y1:
141
+ return 0.0
142
+ inter = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
143
+ area_a = max(1e-9, (ax2 - ax1) * (ay2 - ay1))
144
+ area_b = max(1e-9, (bx2 - bx1) * (by2 - by1))
145
+ union = area_a + area_b - inter
146
+ return inter / union if union > 0 else 0.0
147
+
148
def _dedupe_boxes(boxes, iou_threshold):
    """Greedy IoU de-duplication, considering smaller boxes first."""
    def area(b):
        return (b[2] - b[0]) * (b[3] - b[1])

    survivors = []
    for candidate in sorted(boxes, key=area):
        overlaps = any(_box_iou(candidate, kept) >= iou_threshold for kept in survivors)
        if not overlaps:
            survivors.append(candidate)
    return survivors
155
+
156
def _is_math_candidate(label: str, text_chunk: str, box):
    """Decide whether a grounded box looks like a math region worth zooming into.

    Accepts a box when it is not absurdly elongated, is large enough (by area
    or by either dimension), and either carries a math-like label or its
    preceding text scores as math-heavy.
    """
    # Coordinates live on the model's 0-999 grid; normalize to fractions.
    w = (box[2] - box[0]) / 999.0
    h = (box[3] - box[1]) / 999.0
    aspect = max(w / max(1e-9, h), h / max(1e-9, w))
    if aspect > EQUATION_ZOOM_MAX_ASPECT:
        return False
    big_enough = (w * h >= EQUATION_ZOOM_MIN_AREA
                  or w >= EQUATION_ZOOM_MIN_DIM
                  or h >= EQUATION_ZOOM_MIN_DIM)
    if not big_enough:
        return False
    lowered = label.lower()
    if any(hint in lowered for hint in MATH_LABEL_HINTS):
        return True
    return _math_marker_score(text_chunk) >= 3
166
+
167
def _map_crop_box_to_page(sub_box, crop_px, img_w, img_h):
    """Map a 0-999 box detected inside a crop back to 0-999 page coordinates.

    sub_box: [x1, y1, x2, y2] on the crop's 0-999 grid.
    crop_px: (x1, y1, x2, y2) of the crop in full-page pixels.
    img_w, img_h: full page size in pixels.

    Returns a clamped [x1, y1, x2, y2] on the page's 0-999 grid. When the
    mapped box degenerates (zero width or height after clamping), a zero-area
    box is returned so callers that filter by area drop it — previously
    `_parse_coord_payload(...)[0]` raised IndexError on that empty result.
    """
    crop_x1, crop_y1, crop_x2, crop_y2 = crop_px
    crop_w = max(1, crop_x2 - crop_x1)
    crop_h = max(1, crop_y2 - crop_y1)
    page_x1 = ((crop_x1 + (sub_box[0] / 999.0) * crop_w) / img_w) * 999.0
    page_y1 = ((crop_y1 + (sub_box[1] / 999.0) * crop_h) / img_h) * 999.0
    page_x2 = ((crop_x1 + (sub_box[2] / 999.0) * crop_w) / img_w) * 999.0
    page_y2 = ((crop_y1 + (sub_box[3] / 999.0) * crop_h) / img_h) * 999.0
    parsed = _parse_coord_payload([[page_x1, page_y1, page_x2, page_y2]])
    if parsed:
        return parsed[0]
    # Degenerate mapping: clamp/sort manually and return a zero-area box.
    x1, x2 = sorted((max(0.0, min(999.0, page_x1)), max(0.0, min(999.0, page_x2))))
    y1, y2 = sorted((max(0.0, min(999.0, page_y1)), max(0.0, min(999.0, page_y2))))
    return [x1, y1, x2, y2]
176
 
177
  def draw_bounding_boxes(image, refs, extract_images=False):
178
  img_w, img_h = image.size
 
192
  color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))
193
 
194
  color = color_map[label]
195
+ coords = _parse_coord_payload(ref[2])
196
  color_a = color + (60,)
197
 
198
  for box in coords:
 
443
  return f'<div class="mathjax-preview">{html}</div>'
444
 
445
def _grounding_blocks_from_raw(raw_text: str):
    """Flatten grounding entries into one spatial block per bounding box.

    The text chunk preceding an entry is attached only to that entry's first
    box; subsequent boxes get an empty text so the chunk is not duplicated.
    """
    spatial_blocks = []
    for entry in _extract_grounding_entries(raw_text):
        chunk = entry["text"].strip()
        for idx, (x1, y1, x2, y2) in enumerate(entry["coords"]):
            spatial_blocks.append({
                "label": entry["label"],
                "text": chunk if idx == 0 else "",
                "x1": x1,
                "y1": y1,
                "x2": x2,
                "y2": y2,
            })

    return spatial_blocks
462
 
 
576
  markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
577
  return markdown
578
 
579
def _infer_with_prompt(image, prompt):
    """Run model.infer on a PIL image with *prompt* and return the text output.

    The model's infer() writes its result to stdout, so stdout is captured
    into a StringIO for the duration of the call and then filtered: blank
    lines and known debug/progress chatter (INFER_DEBUG_FILTERS) are dropped.
    The image is staged as a temp JPEG because infer() takes a file path.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()

    stdout = sys.stdout
    capture = StringIO()
    sys.stdout = capture
    try:
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=tmp.name,
            output_path=out_dir,
            base_size=BASE_SIZE,
            image_size=IMAGE_SIZE,
            crop_mode=CROP_MODE,
            save_results=False
        )
    finally:
        # Always restore stdout and remove the temp artifacts, even when
        # infer() raises — otherwise later prints would vanish into capture.
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)

    # Keep only non-empty lines that contain no known debug marker.
    lines = [
        l for l in capture.getvalue().split('\n')
        if l.strip() and not any(s in l for s in INFER_DEBUG_FILTERS)
    ]
    return '\n'.join(lines).strip()
609
+
610
def _refine_equation_refs(image, raw_text):
    """Second-pass equation grounding: zoom into large math regions and re-OCR.

    For each grounded box in *raw_text* that looks like a math region
    (_is_math_candidate), crop that region (with padding) out of *image*,
    run the model again with EQUATION_ZOOM_PROMPT, map the sub-boxes back to
    page coordinates, and emit synthetic "equation_detail" refs in the same
    (raw_tag, label, coord_text) shape as extract_grounding_references.
    Returns [] when nothing qualifies.
    """
    entries = _extract_grounding_entries(raw_text)
    if not entries:
        return []

    img_w, img_h = image.size
    candidates = []
    for entry in entries:
        for box in entry["coords"]:
            if _is_math_candidate(entry["label"], entry["text"], box):
                # Area on the 0-999 grid; used only for ranking below.
                area = (box[2] - box[0]) * (box[3] - box[1])
                candidates.append((area, entry, box))

    if not candidates:
        return []

    # Largest regions first; cap how many zoom passes we pay for.
    candidates.sort(key=lambda x: x[0], reverse=True)
    refined_refs = []
    for _, entry, box in candidates[:EQUATION_ZOOM_MAX_CANDIDATES]:
        # Convert the 0-999 grid box to pixel coordinates on the page.
        x1 = int(box[0] / 999.0 * img_w)
        y1 = int(box[1] / 999.0 * img_h)
        x2 = int(box[2] / 999.0 * img_w)
        y2 = int(box[3] / 999.0 * img_h)
        box_w = max(1, x2 - x1)
        box_h = max(1, y2 - y1)
        # Pad the crop a little (min 8 px) so edge glyphs are not cut off.
        pad_x = max(8, int(box_w * EQUATION_ZOOM_PADDING))
        pad_y = max(8, int(box_h * EQUATION_ZOOM_PADDING))
        crop_x1 = max(0, x1 - pad_x)
        crop_y1 = max(0, y1 - pad_y)
        crop_x2 = min(img_w, x2 + pad_x)
        crop_y2 = min(img_h, y2 + pad_y)
        # Skip crops too small for the model to say anything useful about.
        if crop_x2 - crop_x1 < 32 or crop_y2 - crop_y1 < 32:
            continue

        crop = image.crop((crop_x1, crop_y1, crop_x2, crop_y2))
        sub_result = _infer_with_prompt(crop, EQUATION_ZOOM_PROMPT)
        sub_entries = _extract_grounding_entries(sub_result)
        if not sub_entries:
            continue

        mapped_boxes = []
        for sub in sub_entries:
            sub_label = sub["label"].lower()
            sub_text = sub["text"]
            # Keep only math-looking sub-detections; drop figures/tables.
            is_math_sub = any(hint in sub_label for hint in MATH_LABEL_HINTS) or _math_marker_score(sub_text) >= 3
            if sub_label in ("image", "table") or not is_math_sub:
                continue
            for sub_box in sub["coords"]:
                mapped = _map_crop_box_to_page(sub_box, (crop_x1, crop_y1, crop_x2, crop_y2), img_w, img_h)
                w = (mapped[2] - mapped[0]) / 999.0
                h = (mapped[3] - mapped[1]) / 999.0
                # Discard specks (< 0.04% of the page area).
                if w * h < 0.0004:
                    continue
                mapped_boxes.append(mapped)

        if not mapped_boxes:
            continue
        mapped_boxes = _dedupe_boxes(mapped_boxes, EQUATION_DETAIL_IOU_DEDUPE)
        # Reading order: top-to-bottom, then left-to-right; cap the count.
        mapped_boxes = sorted(mapped_boxes, key=lambda b: (b[1], b[0]))[:EQUATION_DETAIL_MAX_BOXES]
        # A single surviving box adds nothing over the original region.
        if len(mapped_boxes) < 2:
            continue

        merged_text = repr(mapped_boxes)
        label = "equation_detail"
        raw = f'<|ref|>{label}<|/ref|><|det|>{merged_text}<|/det|>'
        refined_refs.append((raw, label, merged_text))

    return refined_refs
678
+
679
  @spaces.GPU(duration=90)
680
  def process_image(image, task, custom_prompt):
681
  model.cuda() # GPU is available here — works on ZeroGPU and locally
 
697
  else:
698
  prompt = TASK_PROMPTS[task]["prompt"]
699
  has_grounding = TASK_PROMPTS[task]["has_grounding"]
700
+ result = _infer_with_prompt(image, prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
 
702
  if not result:
703
  return "No text detected", "", "", None, []
 
707
 
708
  img_out = None
709
  crops = []
710
+ result_for_layout = result
711
 
712
  if has_grounding and '<|ref|>' in result:
713
  refs = extract_grounding_references(result)
714
+ if task == "📋 Markdown":
715
+ refs.extend(_refine_equation_refs(image, result))
716
  if refs:
717
  img_out, crops = draw_bounding_boxes(image, refs, True)
718
+ synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
719
+ if synthetic:
720
+ result_for_layout = result + "\n" + "\n".join(synthetic)
721
 
722
  markdown = embed_images(markdown, crops)
723
 
724
+ return cleaned, markdown, result_for_layout, img_out, crops
725
 
726
  @spaces.GPU(duration=90)
727
  def process_pdf(path, task, custom_prompt, page_num):
tests/test_spatial_blocks.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import pathlib
3
+ import re
4
+ import unittest
5
+
6
+
7
def _load_grounding_blocks():
    """Compile just the grounding helpers out of app.py and return the entry point.

    app.py loads a model at import time, so instead of importing it the three
    pure functions under test are extracted from its AST and executed in a
    minimal namespace that supplies their module-level dependencies.
    """
    app_file = pathlib.Path(__file__).resolve().parents[1] / "app.py"
    tree = ast.parse(app_file.read_text(encoding="utf-8"), filename=str(app_file))

    targets = {
        "_parse_coord_payload",
        "_extract_grounding_entries",
        "_grounding_blocks_from_raw",
    }
    selected = sorted(
        (node for node in tree.body
         if isinstance(node, ast.FunctionDef) and node.name in targets),
        key=lambda node: node.lineno,
    )

    compiled = compile(
        ast.Module(body=selected, type_ignores=[]),
        filename=str(app_file),
        mode="exec",
    )

    namespace = {
        "ast": ast,
        "re": re,
        "GROUNDING_PATTERN": re.compile(r"<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>", re.DOTALL),
    }
    exec(compiled, namespace)
    return namespace["_grounding_blocks_from_raw"]
30
+
31
+
32
class SpatialBlockTests(unittest.TestCase):
    """Behavioral checks for app.py's _grounding_blocks_from_raw helper."""

    def test_multi_coord_refs_render_as_multiple_blocks(self):
        to_blocks = _load_grounding_blocks()
        raw_output = (
            "Equation cluster\n"
            "<|ref|>formula<|/ref|><|det|>[[100,100,600,220],[100,240,600,360]]<|/det|>\n"
            "Trailing text\n"
            "<|ref|>text<|/ref|><|det|>[[40,400,700,520]]<|/det|>"
        )

        result = to_blocks(raw_output)
        self.assertEqual(3, len(result))

        formulas = [block for block in result if block["label"] == "formula"]
        self.assertEqual(2, len(formulas))
        for block in formulas:
            self.assertEqual(100.0, block["x1"])
            self.assertEqual(600.0, block["x2"])
        # Only the first box of a multi-box ref carries the preceding text.
        self.assertEqual("Equation cluster", formulas[0]["text"])
        self.assertEqual("", formulas[1]["text"])
53
+
54
+
55
# Allow running this test file directly (outside a pytest/unittest runner).
if __name__ == "__main__":
    unittest.main()