Spaces:

ricklon
/

DeepSeek-OCR-2-Math

Running on Zero

App Files Files Community

ricklon committed on Mar 5

Commit

930195f

1 Parent(s): 3affffd

Label selected regions across boxes and cropped outputs

Browse files

Files changed (1) hide show

app.py +197 -69

app.py CHANGED Viewed

@@ -804,7 +804,66 @@ def _to_rgba_image(obj):
         return Image.fromarray(arr.astype(np.uint8), mode="RGBA")
     return None
-def _extract_selected_region(editor_value, base_size=None):
     """Extract a clean selected region from ImageEditor data.
     Strategy:
@@ -815,10 +874,11 @@ def _extract_selected_region(editor_value, base_size=None):
         return None
     if isinstance(editor_value, Image.Image):
         if base_size and tuple(editor_value.size) == tuple(base_size):
-            return None
-        return editor_value
     if not isinstance(editor_value, dict):
-        return None
     background = _to_rgba_image(editor_value.get("background"))
     composite = _to_rgba_image(editor_value.get("composite"))
@@ -826,14 +886,16 @@ def _extract_selected_region(editor_value, base_size=None):
     if background is None:
         if composite is None:
-            return None
         background = composite
     if not isinstance(layers, list) or not layers:
         # No annotation layers; treat as explicit crop only if size changed from base.
         if base_size and tuple(background.size) == tuple(base_size):
-            return None
-        return background.convert("RGB")
     alpha_acc = np.zeros((background.height, background.width), dtype=np.uint8)
     for layer in layers:
@@ -848,7 +910,7 @@ def _extract_selected_region(editor_value, base_size=None):
     ys, xs = np.where(alpha_acc > 0)
     if xs.size == 0 or ys.size == 0:
-        return None
     x1, y1 = int(xs.min()), int(ys.min())
     x2, y2 = int(xs.max()) + 1, int(ys.max()) + 1
@@ -859,9 +921,46 @@ def _extract_selected_region(editor_value, base_size=None):
     x2 = min(background.width, x2 + pad_x)
     y2 = min(background.height, y2 + pad_y)
     if x2 <= x1 or y2 <= y1:
-        return None
-    return background.crop((x1, y1, x2, y2)).convert("RGB")
 def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
     text_display = re.sub(
@@ -877,16 +976,9 @@ def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
     dl_tmp.close()
     markdown_html = to_math_html(markdown)
-    mathjax_html = to_mathjax_html(markdown)
-    spatial_html = to_spatial_html(raw, markdown)
     return (
         text_display,
         markdown_html,
-        mathjax_html,
-        mathjax_html,
-        spatial_html,
-        spatial_html,
         raw,
         img_out,
         gallery_items,
@@ -930,8 +1022,8 @@ def load_image(file_path, page_num=1):
 def load_image_with_size(file_path, page_num=1):
     img = load_image(file_path, page_num)
     if img is None:
-        return None, None
-    return img, (int(img.width), int(img.height))
 def update_page_selector(file_path):
     if not file_path:
@@ -954,25 +1046,12 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     region_editor = None
     workspace_base_size = gr.State(None)
     with gr.Row():
         with gr.Column(scale=1):
             file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
-            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
-            input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope")
-            equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
-            gr.Markdown(
-                """
-                **Quick use**
-                1. Load a page/image into the workspace below.
-                2. `Entire Page`: click **Extract**.
-                3. `Selected Region`: use the **Crop** tool for a rectangle selection, or draw/highlight freehand; then click **Extract**.
-                4. Freehand/highlight uses semi-transparent blue ink so text stays visible.
-                5. Check **Cropped Images** to confirm the selected region used for OCR.
-                """
-            )
-            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
-            btn = gr.Button("Extract", variant="primary", size="lg")
             gr.Markdown("**Image Workspace (full page + region selection)**")
             if HAS_IMAGE_EDITOR:
                 editor_kwargs = {}
@@ -1006,6 +1085,17 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             else:
                 gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
                 region_editor = gr.State(None)
         with gr.Column(scale=2):
             with gr.Tabs() as tabs:
@@ -1013,12 +1103,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
                     text_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
                 with gr.Tab("Markdown Preview", id="tab_markdown"):
                     md_out = gr.HTML("")
-                with gr.Tab("HTML + MathJax", id="tab_html"):
-                    html_out = gr.HTML("")
-                    html_source_out = gr.Code(label="Generated HTML Source", language="html", lines=16)
-                with gr.Tab("Spatial HTML", id="tab_spatial"):
-                    spatial_out = gr.HTML("")
-                    spatial_source_out = gr.Code(label="Spatial HTML Source", language="html", lines=16)
                 with gr.Tab("Boxes", id="tab_boxes"):
                     img_out = gr.Image(type="pil", height=500, show_label=False)
                 with gr.Tab("Cropped Images", id="tab_crops"):
@@ -1028,15 +1112,23 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
     with gr.Accordion("Image Examples", open=True):
-        gr.Examples(
-            examples=[
-                ["examples/2022-0922 Section 13 Notes.png", "📋 Markdown", ""],
-                ["examples/2022-0922 Section 14 Notes.png", "📋 Markdown", ""],
-                ["examples/2022-0922 Section 15 Notes.png", "📋 Markdown", ""],
-            ],
-            inputs=[file_in, task, prompt],
-            cache_examples=False
-        )
     with gr.Accordion("PDF Examples", open=True):
         gr.Examples(
@@ -1061,8 +1153,10 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
            - Rectangle selection: use the **Crop** tool.
            - Freehand selection: draw/highlight the target; app uses an automatic bounding box around your marks.
            - Freehand/highlight ink is semi-transparent so underlying content stays visible.
            Then click **Extract**.
-        4. Review **Cropped Images** to confirm the selected region used for OCR.
         ### Tasks
         - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
@@ -1091,24 +1185,58 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     task.change(toggle_prompt, [task], [prompt])
     task.change(select_boxes, [task], [tabs])
     if HAS_IMAGE_EDITOR and region_editor is not None:
-        file_in.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size])
-        page_selector.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size])
-    def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size):
-        selected_region = None
         if scope == "Selected Region":
-            selected_region = _extract_selected_region(region_value, base_size=base_size)
-            if selected_region is None:
-                msg = "Select Input Scope=Selected Region, then crop or annotate a target area in the Image Workspace first."
-                return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False))
-            cleaned, markdown, raw, img_out, crops = process_image(
-                selected_region,
-                task,
-                custom_prompt,
-                enable_equation_zoom=enable_equation_zoom,
-                infer_crop_mode=False,
-            )
-            crops = [selected_region] + (crops or [])
         elif (full_image := _extract_editor_background(region_value)) is not None:
             cleaned, markdown, raw, img_out, crops = process_image(
                 full_image,
@@ -1126,14 +1254,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             )
         else:
             msg = "Error: Upload a file or image"
-            return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False))
         return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
     submit_event = btn.click(
         run,
-        [file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size],
-        [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn]
     )
     submit_event.then(select_boxes, [task], [tabs])

         return Image.fromarray(arr.astype(np.uint8), mode="RGBA")
     return None
+def _locate_patch_bbox(base_image: Image.Image, patch_image: Image.Image):
+    """Estimate where ``patch_image`` sits inside ``base_image``.
+    Both images are converted to greyscale, shrunk so the base's longest side
+    is at most 320 px, and compared at every offset by mean squared
+    difference. Returns ``(x1, y1, x2, y2)`` in base-image pixels, clamped to
+    the base bounds, or ``None`` when an image is missing or the patch does
+    not fit inside the base.
+    """
+    if base_image is None or patch_image is None:
+        return None
+    base_gray = np.asarray(base_image.convert("L"), dtype=np.float32)
+    patch_gray = np.asarray(patch_image.convert("L"), dtype=np.float32)
+    base_h, base_w = base_gray.shape[:2]
+    patch_h, patch_w = patch_gray.shape[:2]
+    patch_is_empty = patch_h <= 0 or patch_w <= 0
+    if patch_is_empty or patch_h > base_h or patch_w > base_w:
+        return None
+    longest_side = max(base_h, base_w)
+    scale = min(1.0, 320.0 / longest_side) if longest_side > 0 else 1.0
+    downscaled = scale < 1.0
+    def _shrink(gray, width, height):
+        # Resize through PIL (uint8) and go back to float32 for the comparison.
+        target = (max(1, int(round(width * scale))), max(1, int(round(height * scale))))
+        resized = Image.fromarray(gray.astype(np.uint8)).resize(target, Image.Resampling.BILINEAR)
+        return np.asarray(resized, dtype=np.float32)
+    if downscaled:
+        search_area = _shrink(base_gray, base_w, base_h)
+        template = _shrink(patch_gray, patch_w, patch_h)
+    else:
+        search_area, template = base_gray, patch_gray
+    area_h, area_w = search_area.shape
+    tmpl_h, tmpl_w = template.shape
+    # Independent rounding of the two sizes can leave the template too large.
+    if tmpl_h > area_h or tmpl_w > area_w:
+        return None
+    lowest = float("inf")
+    hit_x = hit_y = 0
+    template_rows = template[:, None, :]
+    for top in range(area_h - tmpl_h + 1):
+        band = search_area[top:top + tmpl_h, :]
+        # candidates: (tmpl_h, area_w - tmpl_w + 1, tmpl_w)
+        candidates = np.lib.stride_tricks.sliding_window_view(band, tmpl_w, axis=1)
+        delta = candidates - template_rows
+        mse = np.mean(delta * delta, axis=(0, 2))
+        left = int(np.argmin(mse))
+        if float(mse[left]) < lowest:
+            lowest, hit_x, hit_y = float(mse[left]), left, top
+    right, bottom = hit_x + tmpl_w, hit_y + tmpl_h
+    if downscaled:
+        # Map the best match back to full-resolution coordinates.
+        x1, y1, x2, y2 = (int(round(v / scale)) for v in (hit_x, hit_y, right, bottom))
+    else:
+        x1, y1, x2, y2 = hit_x, hit_y, right, bottom
+    # Keep the box inside the base image and at least one pixel in each direction.
+    x1 = max(0, min(base_w - 1, x1))
+    y1 = max(0, min(base_h - 1, y1))
+    x2 = max(x1 + 1, min(base_w, x2))
+    y2 = max(y1 + 1, min(base_h, y2))
+    return (x1, y1, x2, y2)
+def _extract_selected_region(editor_value, base_size=None, base_image=None):
     """Extract a clean selected region from ImageEditor data.
     Strategy:
         return None
     if isinstance(editor_value, Image.Image):
         if base_size and tuple(editor_value.size) == tuple(base_size):
+            return None, None
+        bbox = _locate_patch_bbox(base_image, editor_value) if base_image is not None else None
+        return editor_value, bbox
     if not isinstance(editor_value, dict):
+        return None, None
     background = _to_rgba_image(editor_value.get("background"))
     composite = _to_rgba_image(editor_value.get("composite"))
     if background is None:
         if composite is None:
+            return None, None
         background = composite
     if not isinstance(layers, list) or not layers:
         # No annotation layers; treat as explicit crop only if size changed from base.
         if base_size and tuple(background.size) == tuple(base_size):
+            return None, None
+        patch = background.convert("RGB")
+        bbox = _locate_patch_bbox(base_image, patch) if base_image is not None else None
+        return patch, bbox
     alpha_acc = np.zeros((background.height, background.width), dtype=np.uint8)
     for layer in layers:
     ys, xs = np.where(alpha_acc > 0)
     if xs.size == 0 or ys.size == 0:
+        return None, None
     x1, y1 = int(xs.min()), int(ys.min())
     x2, y2 = int(xs.max()) + 1, int(ys.max()) + 1
     x2 = min(background.width, x2 + pad_x)
     y2 = min(background.height, y2 + pad_y)
     if x2 <= x1 or y2 <= y1:
+        return None, None
+    return background.crop((x1, y1, x2, y2)).convert("RGB"), (x1, y1, x2, y2)
+def _draw_selected_region_boxes(image, boxes):
+    """Draw numbered "Region N" boxes for ``boxes`` on ``image``.
+    Each ``(x1, y1, x2, y2)`` pixel box is rescaled to a 0-999 range, wrapped
+    in a ``<|ref|>…<|det|>`` string, and handed to ``draw_bounding_boxes``.
+    Returns the drawn image, or ``None`` when there is no image or no boxes.
+    """
+    if image is None or not boxes:
+        return None
+    width, height = image.size
+    # Guard the divisions below against a zero-sized image.
+    safe_w, safe_h = max(1, width), max(1, height)
+    def _norm(value, extent):
+        return max(0.0, min(999.0, value / extent * 999.0))
+    refs = []
+    for index, (left, top, right, bottom) in enumerate(boxes, 1):
+        coords = [[_norm(left, safe_w), _norm(top, safe_h), _norm(right, safe_w), _norm(bottom, safe_h)]]
+        coord_text = repr(coords)
+        raw = f'<|ref|>region_{index}<|/ref|><|det|>{coord_text}<|/det|>'
+        refs.append((raw, f"Region {index}", coord_text))
+    annotated, _ = draw_bounding_boxes(image, refs, extract_images=False)
+    return annotated
+def _region_gallery_items(regions):
+    """Build ``(image, "Region N")`` gallery tuples, numbering regions from 1."""
+    items = []
+    for number, region in enumerate(regions, start=1):
+        items.append((region["image"], f"Region {number}"))
+    return items
+def _reset_selected_regions():
+    """Return cleared values for the region state, the gallery, and the status text."""
+    empty_state = []
+    empty_gallery = []
+    return empty_state, empty_gallery, "No saved regions."
+def add_selected_region(editor_value, base_size, base_image, selected_regions):
+    """Append the current editor selection to the saved region list.
+    Returns ``(regions, gallery_items, status_message)``. When no region can
+    be extracted, the existing regions are returned unchanged together with
+    an explanatory message.
+    """
+    extracted = _extract_selected_region(editor_value, base_size=base_size, base_image=base_image)
+    # _extract_selected_region still has an early exit that returns a bare
+    # None rather than an (image, bbox) pair; normalise it so the unpacking
+    # below cannot raise TypeError.
+    region_img, bbox = extracted if extracted is not None else (None, None)
+    if region_img is None:
+        msg = "No region detected. Use Crop or draw/highlight a region first."
+        regions = selected_regions or []
+        return regions, _region_gallery_items(regions), msg
+    # Copy before appending so the incoming state list is never mutated in place.
+    regions = list(selected_regions or [])
+    regions.append({"image": region_img, "bbox": bbox})
+    return regions, _region_gallery_items(regions), f"{len(regions)} region(s) saved."
+def clear_selected_regions():
+    """Discard all saved regions by delegating to ``_reset_selected_regions``."""
+    cleared = _reset_selected_regions()
+    return cleared
 def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
     text_display = re.sub(
     dl_tmp.close()
     markdown_html = to_math_html(markdown)
     return (
         text_display,
         markdown_html,
         raw,
         img_out,
         gallery_items,
 def load_image_with_size(file_path, page_num=1):
     img = load_image(file_path, page_num)
     if img is None:
+        return None, None, None  # nothing loaded: every returned slot is None
+    return img, (int(img.width), int(img.height)), img  # image, its (width, height), and the same image again
 def update_page_selector(file_path):
     if not file_path:
     region_editor = None
     workspace_base_size = gr.State(None)
+    workspace_base_image = gr.State(None)
+    selected_regions_state = gr.State([])
     with gr.Row():
         with gr.Column(scale=1):
             file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
             gr.Markdown("**Image Workspace (full page + region selection)**")
             if HAS_IMAGE_EDITOR:
                 editor_kwargs = {}
             else:
                 gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
                 region_editor = gr.State(None)
+            input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope")
+            selection_controls = gr.Row()
+            with selection_controls:
+                add_region_btn = gr.Button("Add Region", variant="secondary")
+                clear_regions_btn = gr.Button("Clear Regions")
+            selection_status = gr.Textbox(label="Region Selection Status", value="No saved regions.", interactive=False)
+            selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
+            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
+            equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
+            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
+            btn = gr.Button("Extract", variant="primary", size="lg")
         with gr.Column(scale=2):
             with gr.Tabs() as tabs:
                     text_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
                 with gr.Tab("Markdown Preview", id="tab_markdown"):
                     md_out = gr.HTML("")
                 with gr.Tab("Boxes", id="tab_boxes"):
                     img_out = gr.Image(type="pil", height=500, show_label=False)
                 with gr.Tab("Cropped Images", id="tab_crops"):
             download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
     with gr.Accordion("Image Examples", open=True):
+        image_examples = [
+            ["examples/2022-0922 Section 13 Notes.png", "📋 Markdown", ""],
+            ["examples/2022-0922 Section 14 Notes.png", "📋 Markdown", ""],
+            ["examples/2022-0922 Section 15 Notes.png", "📋 Markdown", ""],
+        ]
+        if HAS_IMAGE_EDITOR and region_editor is not None:
+            gr.Examples(
+                examples=image_examples,
+                inputs=[region_editor, task, prompt],
+                cache_examples=False
+            )
+        else:
+            gr.Examples(
+                examples=image_examples,
+                inputs=[file_in, task, prompt],
+                cache_examples=False
+            )
     with gr.Accordion("PDF Examples", open=True):
         gr.Examples(
            - Rectangle selection: use the **Crop** tool.
            - Freehand selection: draw/highlight the target; app uses an automatic bounding box around your marks.
            - Freehand/highlight ink is semi-transparent so underlying content stays visible.
+           - Optional multi-select: click **Add Region** after each selection.
            Then click **Extract**.
+        4. Use **Clear Regions** to reset multi-select state.
+        5. Review **Cropped Images** and **Boxes**: both are labeled `Region 1`, `Region 2`, etc.
         ### Tasks
         - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
     task.change(toggle_prompt, [task], [prompt])
     task.change(select_boxes, [task], [tabs])
     if HAS_IMAGE_EDITOR and region_editor is not None:
+        file_in.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
+        page_selector.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
+        file_in.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
+        page_selector.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
+    add_region_btn.click(
+        add_selected_region,
+        [region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
+        [selected_regions_state, selected_regions_gallery, selection_status],
+    )
+    clear_regions_btn.click(
+        clear_selected_regions,
+        outputs=[selected_regions_state, selected_regions_gallery, selection_status],
+    )
+    def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size, base_image, selected_regions):
         if scope == "Selected Region":
+            regions = list(selected_regions or [])
+            if not regions:
+                selected_region, selected_bbox = _extract_selected_region(region_value, base_size=base_size, base_image=base_image)
+                if selected_region is None:
+                    msg = "Select Input Scope=Selected Region, then crop or annotate a target area in the Image Workspace first."
+                    return (msg, "", "", None, [], gr.DownloadButton(visible=False))
+                regions = [{"image": selected_region, "bbox": selected_bbox}]
+            cleaned_parts = []
+            markdown_parts = []
+            raw_parts = []
+            for i, r in enumerate(regions, 1):
+                cleaned_i, markdown_i, raw_i, _, _ = process_image(
+                    r["image"],
+                    task,
+                    custom_prompt,
+                    enable_equation_zoom=enable_equation_zoom,
+                    infer_crop_mode=False,
+                )
+                if len(regions) > 1:
+                    cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
+                    markdown_parts.append(f"## Region {i}\n\n{markdown_i}")
+                    raw_parts.append(f"## Region {i}\n\n{raw_i}")
+                else:
+                    cleaned_parts.append(cleaned_i)
+                    markdown_parts.append(markdown_i)
+                    raw_parts.append(raw_i)
+            cleaned = "\n\n".join(cleaned_parts).strip()
+            markdown = "\n\n".join(markdown_parts).strip()
+            raw = "\n\n".join(raw_parts).strip()
+            crops = _region_gallery_items(regions)
+            full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
+            region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
+            img_out = _draw_selected_region_boxes(full_img, region_boxes)
         elif (full_image := _extract_editor_background(region_value)) is not None:
             cleaned, markdown, raw, img_out, crops = process_image(
                 full_image,
             )
         else:
             msg = "Error: Upload a file or image"
+            return (msg, "", "", None, [], gr.DownloadButton(visible=False))
         return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
     submit_event = btn.click(
         run,
+        [file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
+        [text_out, md_out, raw_out, img_out, gallery, download_btn]
     )
     submit_event.then(select_boxes, [task], [tabs])