Spaces:

ricklon
/

DeepSeek-OCR-2-Math

Running on Zero

App Files Files Community

ricklon committed on Mar 6

Commit

3edf7ce

1 Parent(s): 7e8815a

Populate full-page cropped gallery from detected refs

Browse files

Files changed (1) hide show

app.py +33 -3

app.py CHANGED Viewed

@@ -242,6 +242,32 @@ def draw_bounding_boxes(image, refs, extract_images=False):
     img_draw.paste(overlay, (0, 0), overlay)
     return img_draw, crops
 def clean_output(text, include_images=False):
     if not text:
         return ""
@@ -994,6 +1020,7 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
     img_out = None
     crops = []
     result_for_layout = result
     if has_grounding and '<|ref|>' in result:
@@ -1001,12 +1028,15 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
         if task == "📋 Markdown" and enable_equation_zoom:
             refs.extend(_refine_equation_refs(image, result))
         if refs:
-            img_out, crops = draw_bounding_boxes(image, refs, True)
             synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
             if synthetic:
                 result_for_layout = result + "\n" + "\n".join(synthetic)
-    markdown = embed_images(markdown, crops)
     return cleaned, markdown, result_for_layout, img_out, crops

     img_draw.paste(overlay, (0, 0), overlay)
     return img_draw, crops
def _extract_labeled_crops_from_refs(image, refs, max_items=24):
    """Crop each distinct, sufficiently large referenced box out of *image*.

    Args:
        image: Source image. Must expose ``size`` as ``(width, height)`` and a
            ``crop((x1, y1, x2, y2))`` method whose result has ``width`` and
            ``height`` attributes (presumably a PIL image -- confirm).
        refs: Iterable of indexable records; ``ref[1]`` is used as the label
            and ``ref[2]`` is passed to ``_parse_coord_payload`` to obtain the
            boxes for that reference.
        max_items: Maximum number of crops to return. Values below 1 yield an
            empty list.

    Returns:
        A list of ``(crop, caption)`` tuples in reference order, where the
        caption is ``"<label> (<width>x<height>)"``.
    """
    # Without this guard the cap was only checked after the first append, so
    # a non-positive limit still produced one item.
    if max_items <= 0:
        return []

    img_w, img_h = image.size

    def _to_pixel(value, extent):
        # Box values are scaled from a 0-999 grid to pixels, then clamped so
        # an out-of-range payload cannot push the crop outside the image.
        return min(max(int(value / 999.0 * extent), 0), extent)

    items = []
    seen = set()
    for ref in refs:
        label = str(ref[1])
        for box in _parse_coord_payload(ref[2]):
            x1 = _to_pixel(box[0], img_w)
            y1 = _to_pixel(box[1], img_h)
            x2 = _to_pixel(box[2], img_w)
            y2 = _to_pixel(box[3], img_h)
            # Skip boxes narrower or shorter than 8 px; this also rejects
            # inverted boxes, whose extent is negative.
            if x2 - x1 < 8 or y2 - y1 < 8:
                continue
            # De-duplicate on case-insensitive label plus pixel rectangle.
            key = (label.lower(), x1, y1, x2, y2)
            if key in seen:
                continue
            seen.add(key)
            crop = image.crop((x1, y1, x2, y2))
            items.append((crop, f"{label} ({crop.width}x{crop.height})"))
            if len(items) >= max_items:
                return items
    return items
 def clean_output(text, include_images=False):
     if not text:
         return ""
     img_out = None
     crops = []
+    figure_crops = []
     result_for_layout = result
     if has_grounding and '<|ref|>' in result:
         if task == "📋 Markdown" and enable_equation_zoom:
             refs.extend(_refine_equation_refs(image, result))
         if refs:
+            img_out, figure_crops = draw_bounding_boxes(image, refs, True)
+            crops = _extract_labeled_crops_from_refs(image, refs)
             synthetic = [r[0] for r in refs if r[1] == "equation_detail"]
             if synthetic:
                 result_for_layout = result + "\n" + "\n".join(synthetic)
+    markdown = embed_images(markdown, figure_crops)
+    if not crops and figure_crops:
+        crops = _label_gallery_items(figure_crops, prefix="Figure")
     return cleaned, markdown, result_for_layout, img_out, crops