Spaces:

ricklon
/

DeepSeek-OCR-2-Math

Running on Zero

App Files Files Community

ricklon committed 11 days ago

Commit

d6afca6

1 Parent(s): f76cb58

Add region OCR UI and clarify Free OCR vs Locate behavior

Browse files

Files changed (1) hide show

app.py +52 -2

app.py CHANGED Viewed

@@ -18,6 +18,8 @@ import latex2mathml.converter
 from io import StringIO, BytesIO
 # Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
 #
 # Full precision BF16 (~8GB VRAM) — original, highest accuracy
@@ -745,6 +747,27 @@ def process_file(path, task, custom_prompt, page_num):
     else:
         return process_image(Image.open(path), task, custom_prompt)
 def toggle_prompt(task):
     if task == "✏️ Custom":
         return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
@@ -798,6 +821,8 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
     """)
     with gr.Row():
         with gr.Column(scale=1):
             file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
@@ -806,6 +831,16 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
         with gr.Column(scale=2):
             with gr.Tabs() as tabs:
@@ -825,6 +860,9 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
                     gallery = gr.Gallery(show_label=False, columns=3, height=400)
                 with gr.Tab("Raw Text", id="tab_raw"):
                     raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
             download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
     with gr.Accordion("Image Examples", open=True):
@@ -854,10 +892,17 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
         ### Tasks
         - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
-        - **Free OCR**: Simple text extraction without layout
-        - **Locate**: Find and highlight specific text/elements in image (grounding ✅)
         - **Describe**: General image description
         - **Custom**: Your own prompt
         ### Special Tokens
         - `<image>` - Placeholder where visual tokens are inserted
@@ -871,6 +916,11 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     page_selector.change(load_image, [file_in, page_selector], [input_img])
     task.change(toggle_prompt, [task], [prompt])
     task.change(select_boxes, [task], [tabs])
     def run(image, file_path, task, custom_prompt, page_num):
         if file_path:

 from io import StringIO, BytesIO
+HAS_IMAGE_EDITOR = hasattr(gr, "ImageEditor")
 # Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
 #
 # Full precision BF16 (~8GB VRAM) — original, highest accuracy
     else:
         return process_image(Image.open(path), task, custom_prompt)
+def _extract_editor_image(editor_value):
+    if editor_value is None:
+        return None
+    if isinstance(editor_value, Image.Image):
+        return editor_value
+    if isinstance(editor_value, dict):
+        composite = editor_value.get("composite")
+        if isinstance(composite, Image.Image):
+            return composite
+        background = editor_value.get("background")
+        if isinstance(background, Image.Image):
+            return background
+    return None
def process_region_ocr(editor_value):
    """Run the "Free OCR" task on the image held by the region editor.

    Args:
        editor_value: Raw value of the region editor component; it is
            reduced to a single image via ``_extract_editor_image``.

    Returns:
        A ``(text, html)`` pair. When no image can be extracted, ``text`` is
        a hint asking the user to draw/crop a region and ``html`` is empty.
        Otherwise ``text`` is the first value returned by ``process_image``
        and ``html`` is its second value (markdown) passed through
        ``to_math_html``.
    """
    image = _extract_editor_image(editor_value)
    if image is None:
        return "Draw/crop a region first, then click OCR Region.", ""
    # process_image is unpacked as a 5-tuple; only the first two values are
    # used here, the remaining three are discarded.
    text, markdown, _, _, _ = process_image(image, "📝 Free OCR", "")
    return text, to_math_html(markdown)
 def toggle_prompt(task):
     if task == "✏️ Custom":
         return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
     This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
     """)
+    region_editor = None
+    region_btn = None
     with gr.Row():
         with gr.Column(scale=1):
             file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
+            with gr.Accordion("Region OCR (Draw/Crop)", open=False):
+                if HAS_IMAGE_EDITOR:
+                    region_editor = gr.ImageEditor(
+                        label="Draw a box and crop to the target area, then click OCR Region",
+                        type="pil",
+                        height=300,
+                    )
+                    region_btn = gr.Button("OCR Region", variant="secondary")
+                else:
+                    gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
         with gr.Column(scale=2):
             with gr.Tabs() as tabs:
                     gallery = gr.Gallery(show_label=False, columns=3, height=400)
                 with gr.Tab("Raw Text", id="tab_raw"):
                     raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
+                with gr.Tab("Region OCR", id="tab_region"):
+                    region_text_out = gr.Textbox(lines=12, buttons=["copy"], label="Region OCR Text")
+                    region_html_out = gr.HTML("")
             download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
     with gr.Accordion("Image Examples", open=True):
         ### Tasks
         - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
+        - **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
+        - **Locate**: Find and highlight where specific text appears (grounding ✅)
         - **Describe**: General image description
         - **Custom**: Your own prompt
+        - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
+        ### Free OCR vs Locate (important)
+        - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
+        - If you want OCR for one area only, crop that area first, then run **Free OCR** on the cropped image.
+        - If you want to keep the full page but highlight where text appears, use **Locate** and enter the text to search.
+        - For advanced region workflows, use **Custom** with `<|grounding|>` in the prompt.
         ### Special Tokens
         - `<image>` - Placeholder where visual tokens are inserted
     page_selector.change(load_image, [file_in, page_selector], [input_img])
     task.change(toggle_prompt, [task], [prompt])
     task.change(select_boxes, [task], [tabs])
+    if HAS_IMAGE_EDITOR and region_editor is not None and region_btn is not None:
+        file_in.change(load_image, [file_in, page_selector], [region_editor])
+        page_selector.change(load_image, [file_in, page_selector], [region_editor])
+        input_img.change(lambda img: img, [input_img], [region_editor])
+        region_btn.click(process_region_ocr, [region_editor], [region_text_out, region_html_out])
     def run(image, file_path, task, custom_prompt, page_num):
         if file_path: