Spaces:

ricklon
/

DeepSeek-OCR-2-Math

Running on Zero

App Files Files Community

ricklon committed 7 days ago

Commit

c0f56fe

1 Parent(s): 0da0e18

Make equation multipass optional and integrate region OCR across all outputs

Browse files

Files changed (1) hide show

app.py +129 -37

app.py CHANGED Viewed

@@ -578,7 +578,9 @@ def embed_images(markdown, crops):
         markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
     return markdown
-def _infer_with_prompt(image, prompt):
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
     image.save(tmp.name, 'JPEG', quality=95)
     tmp.close()
@@ -595,7 +597,7 @@ def _infer_with_prompt(image, prompt):
             output_path=out_dir,
             base_size=BASE_SIZE,
             image_size=IMAGE_SIZE,
-            crop_mode=CROP_MODE,
             save_results=False
         )
     finally:
@@ -679,7 +681,7 @@ def _refine_equation_refs(image, raw_text):
     return refined_refs
 @spaces.GPU(duration=90)
-def process_image(image, task, custom_prompt):
     model.cuda()  # GPU is available here — works on ZeroGPU and locally
     if image is None:
         return "Error: Upload an image", "", "", None, []
@@ -699,7 +701,7 @@ def process_image(image, task, custom_prompt):
     else:
         prompt = TASK_PROMPTS[task]["prompt"]
         has_grounding = TASK_PROMPTS[task]["has_grounding"]
-    result = _infer_with_prompt(image, prompt)
     if not result:
         return "No text detected", "", "", None, []
@@ -713,7 +715,7 @@ def process_image(image, task, custom_prompt):
     if has_grounding and '<|ref|>' in result:
         refs = extract_grounding_references(result)
-        if task == "📋 Markdown":
             refs.extend(_refine_equation_refs(image, result))
         if refs:
             img_out, crops = draw_bounding_boxes(image, refs, True)
@@ -726,7 +728,7 @@ def process_image(image, task, custom_prompt):
     return cleaned, markdown, result_for_layout, img_out, crops
 @spaces.GPU(duration=90)
-def process_pdf(path, task, custom_prompt, page_num):
     doc = fitz.open(path)
     total_pages = len(doc)
     if page_num < 1 or page_num > total_pages:
@@ -737,15 +739,34 @@ def process_pdf(path, task, custom_prompt, page_num):
     img = Image.open(BytesIO(pix.tobytes("png")))
     doc.close()
-    return process_image(img, task, custom_prompt)
-def process_file(path, task, custom_prompt, page_num):
     if not path:
         return "Error: Upload a file", "", "", None, []
     if path.lower().endswith('.pdf'):
-        return process_pdf(path, task, custom_prompt, page_num)
     else:
-        return process_image(Image.open(path), task, custom_prompt)
 def _extract_editor_image(editor_value):
     if editor_value is None:
@@ -761,12 +782,77 @@ def _extract_editor_image(editor_value):
             return background
     return None
-def process_region_ocr(editor_value):
     image = _extract_editor_image(editor_value)
     if image is None:
-        return "Draw/crop a region first, then click OCR Region.", ""
-    text, markdown, _, _, _ = process_image(image, "📝 Free OCR", "")
-    return text, to_math_html(markdown)
 def toggle_prompt(task):
     if task == "✏️ Custom":
@@ -829,6 +915,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             input_img = gr.Image(label="Input Image", type="pil", height=300)
             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
             with gr.Accordion("Region OCR (Draw/Crop)", open=False):
@@ -897,6 +984,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
         - **Describe**: General image description
         - **Custom**: Your own prompt
         - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
         ### Free OCR vs Locate (important)
         - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
@@ -920,35 +1008,39 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
         file_in.change(load_image, [file_in, page_selector], [region_editor])
         page_selector.change(load_image, [file_in, page_selector], [region_editor])
         input_img.change(lambda img: img, [input_img], [region_editor])
-        region_btn.click(process_region_ocr, [region_editor], [region_text_out, region_html_out])
-    def run(image, file_path, task, custom_prompt, page_num):
         if file_path:
-            cleaned, markdown, raw, img_out, crops = process_file(file_path, task, custom_prompt, int(page_num))
         elif image is not None:
-            cleaned, markdown, raw, img_out, crops = process_image(image, task, custom_prompt)
         else:
-            return "Error: Upload a file or image", "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False)
-        # Text tab: convert \[...\] → $$...$$ and \(...\) → $...$ for readability
-        text_display = re.sub(r'\\\[(.+?)\\\]',
-                              lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
-                              cleaned, flags=re.DOTALL)
-        text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
-        # Download file: write cleaned markdown to a temp .md file
-        dl_tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8')
-        dl_tmp.write(cleaned)
-        dl_tmp.close()
-        mathjax_html = to_mathjax_html(markdown)
-        spatial_html = to_spatial_html(raw, markdown)
-        return (text_display, to_math_html(markdown), mathjax_html, mathjax_html, spatial_html, spatial_html, raw, img_out, crops,
-                gr.DownloadButton(value=dl_tmp.name, visible=True))
-    submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
-                             [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn])
     submit_event.then(select_boxes, [task], [tabs])
 if __name__ == "__main__":

         markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
     return markdown
+def _infer_with_prompt(image, prompt, crop_mode=None):
+    if crop_mode is None:
+        crop_mode = CROP_MODE
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
     image.save(tmp.name, 'JPEG', quality=95)
     tmp.close()
             output_path=out_dir,
             base_size=BASE_SIZE,
             image_size=IMAGE_SIZE,
+            crop_mode=crop_mode,
             save_results=False
         )
     finally:
     return refined_refs
 @spaces.GPU(duration=90)
+def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None):
     model.cuda()  # GPU is available here — works on ZeroGPU and locally
     if image is None:
         return "Error: Upload an image", "", "", None, []
     else:
         prompt = TASK_PROMPTS[task]["prompt"]
         has_grounding = TASK_PROMPTS[task]["has_grounding"]
+    result = _infer_with_prompt(image, prompt, crop_mode=infer_crop_mode)
     if not result:
         return "No text detected", "", "", None, []
     if has_grounding and '<|ref|>' in result:
         refs = extract_grounding_references(result)
+        if task == "📋 Markdown" and enable_equation_zoom:
             refs.extend(_refine_equation_refs(image, result))
         if refs:
             img_out, crops = draw_bounding_boxes(image, refs, True)
     return cleaned, markdown, result_for_layout, img_out, crops
 @spaces.GPU(duration=90)
+def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
     doc = fitz.open(path)
     total_pages = len(doc)
     if page_num < 1 or page_num > total_pages:
     img = Image.open(BytesIO(pix.tobytes("png")))
     doc.close()
+    return process_image(
+        img,
+        task,
+        custom_prompt,
+        enable_equation_zoom=enable_equation_zoom,
+        infer_crop_mode=infer_crop_mode,
+    )
+def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
     if not path:
         return "Error: Upload a file", "", "", None, []
     if path.lower().endswith('.pdf'):
+        return process_pdf(
+            path,
+            task,
+            custom_prompt,
+            page_num,
+            enable_equation_zoom=enable_equation_zoom,
+            infer_crop_mode=infer_crop_mode,
+        )
     else:
+        return process_image(
+            Image.open(path),
+            task,
+            custom_prompt,
+            enable_equation_zoom=enable_equation_zoom,
+            infer_crop_mode=infer_crop_mode,
+        )
 def _extract_editor_image(editor_value):
     if editor_value is None:
             return background
     return None
+def _dedupe_consecutive_lines(text: str) -> str:
+    if not text:
+        return text
+    out = []
+    prev = None
+    blank_count = 0
+    for line in text.splitlines():
+        if not line.strip():
+            blank_count += 1
+            if blank_count <= 2:
+                out.append("")
+            continue
+        blank_count = 0
+        norm = re.sub(r'\s+', ' ', line).strip()
+        if norm and norm == prev:
+            continue
+        out.append(line)
+        prev = norm
+    return "\n".join(out).strip()
+def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
+    text_display = re.sub(
+        r'\\\[(.+?)\\\]',
+        lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
+        cleaned,
+        flags=re.DOTALL
+    )
+    text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
+    dl_tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8')
+    dl_tmp.write(cleaned)
+    dl_tmp.close()
+    markdown_html = to_math_html(markdown)
+    mathjax_html = to_mathjax_html(markdown)
+    spatial_html = to_spatial_html(raw, markdown)
+    return (
+        text_display,
+        markdown_html,
+        mathjax_html,
+        mathjax_html,
+        spatial_html,
+        spatial_html,
+        raw,
+        img_out,
+        gallery_items,
+        gr.DownloadButton(value=dl_tmp.name, visible=True),
+        text_display,
+        markdown_html,
+    )
+def run_region(editor_value, task, custom_prompt, enable_equation_zoom):
     image = _extract_editor_image(editor_value)
     if image is None:
+        msg = "Draw/crop a region first, then click OCR Region."
+        return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")
+    cleaned, markdown, raw, img_out, crops = process_image(
+        image,
+        task,
+        custom_prompt,
+        enable_equation_zoom=enable_equation_zoom,
+        infer_crop_mode=False,
+    )
+    # Region workflows are single-area; collapse obvious duplicate lines.
+    cleaned = _dedupe_consecutive_lines(cleaned)
+    markdown = _dedupe_consecutive_lines(markdown)
+    gallery_items = [image] + (crops or [])
+    return _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items)
 def toggle_prompt(task):
     if task == "✏️ Custom":
             input_img = gr.Image(label="Input Image", type="pil", height=300)
             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
+            equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
             with gr.Accordion("Region OCR (Draw/Crop)", open=False):
         - **Describe**: General image description
         - **Custom**: Your own prompt
         - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
+        - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
         ### Free OCR vs Locate (important)
         - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
         file_in.change(load_image, [file_in, page_selector], [region_editor])
         page_selector.change(load_image, [file_in, page_selector], [region_editor])
         input_img.change(lambda img: img, [input_img], [region_editor])
+        region_btn.click(
+            run_region,
+            [region_editor, task, prompt, equation_zoom],
+            [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
+        )
+    def run(image, file_path, task, custom_prompt, page_num, enable_equation_zoom):
         if file_path:
+            cleaned, markdown, raw, img_out, crops = process_file(
+                file_path,
+                task,
+                custom_prompt,
+                int(page_num),
+                enable_equation_zoom=enable_equation_zoom,
+            )
         elif image is not None:
+            cleaned, markdown, raw, img_out, crops = process_image(
+                image,
+                task,
+                custom_prompt,
+                enable_equation_zoom=enable_equation_zoom,
+            )
         else:
+            msg = "Error: Upload a file or image"
+            return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")
+        return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
+    submit_event = btn.click(
+        run,
+        [input_img, file_in, task, prompt, page_selector, equation_zoom],
+        [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
+    )
     submit_event.then(select_boxes, [task], [tabs])
 if __name__ == "__main__":