Spaces:

ricklon
/

DeepSeek-OCR-2-Math

Running on Zero

App Files Files Community

ricklon committed 9 days ago

Commit

0ef2109

1 Parent(s): 930195f

Add equation-line separate OCR mode and freehand-first guidance

Browse files

Files changed (1) hide show

app.py +135 -11

app.py CHANGED Viewed

@@ -51,6 +51,7 @@ CROP_MODE = True
 GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
 INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
 EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
 EQUATION_ZOOM_MAX_CANDIDATES = 6
 EQUATION_ZOOM_MIN_AREA = 0.05
 EQUATION_ZOOM_MIN_DIM = 0.24
@@ -58,6 +59,11 @@ EQUATION_ZOOM_PADDING = 0.025
 EQUATION_ZOOM_MAX_ASPECT = 12.0
 EQUATION_DETAIL_MAX_BOXES = 24
 EQUATION_DETAIL_IOU_DEDUPE = 0.7
 MATH_LABEL_HINTS = ("formula", "equation", "math")
 MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
 MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
@@ -682,17 +688,112 @@ def _refine_equation_refs(image, raw_text):
     return refined_refs
 @spaces.GPU(duration=90)
-def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None):
     model.cuda()  # GPU is available here — works on ZeroGPU and locally
     if image is None:
         return "Error: Upload an image", "", "", None, []
-    if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
         return "Please enter a prompt", "", "", None, []
     if image.mode in ('RGBA', 'LA', 'P'):
         image = image.convert('RGB')
     image = ImageOps.exif_transpose(image)
     if task == "✏️ Custom":
         prompt = f"<image>\n{custom_prompt.strip()}"
@@ -730,7 +831,7 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
     return cleaned, markdown, result_for_layout, img_out, crops
 @spaces.GPU(duration=90)
-def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
     doc = fitz.open(path)
     total_pages = len(doc)
     if page_num < 1 or page_num > total_pages:
@@ -747,9 +848,10 @@ def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True,
         custom_prompt,
         enable_equation_zoom=enable_equation_zoom,
         infer_crop_mode=infer_crop_mode,
     )
-def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
     if not path:
         return "Error: Upload a file", "", "", None, []
     if path.lower().endswith('.pdf'):
@@ -760,6 +862,7 @@ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True,
             page_num,
             enable_equation_zoom=enable_equation_zoom,
             infer_crop_mode=infer_crop_mode,
         )
     else:
         return process_image(
@@ -768,6 +871,7 @@ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True,
             custom_prompt,
             enable_equation_zoom=enable_equation_zoom,
             infer_crop_mode=infer_crop_mode,
         )
 def _extract_editor_background(editor_value):
@@ -946,6 +1050,18 @@ def _draw_selected_region_boxes(image, boxes):
 def _region_gallery_items(regions):
     return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
 def _reset_selected_regions():
     return [], [], "No saved regions."
@@ -1077,7 +1193,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
                     except TypeError:
                         editor_kwargs["eraser"] = gr.Eraser()
                 region_editor = gr.ImageEditor(
-                    label="Main image workspace. Rectangle selection uses the Crop tool. Freehand/highlight uses a translucent overlay so you can still read content beneath.",
                     type="pil",
                     height=300,
                     **editor_kwargs,
@@ -1094,6 +1210,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
             equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
@@ -1150,8 +1267,8 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
            - `Entire Page` for the full page.
            - `Selected Region` for a specific area.
         3. For `Selected Region`, use the **Image Workspace**:
-           - Rectangle selection: use the **Crop** tool.
-           - Freehand selection: draw/highlight the target; app uses an automatic bounding box around your marks.
            - Freehand/highlight ink is semi-transparent so underlying content stays visible.
            - Optional multi-select: click **Add Region** after each selection.
            Then click **Extract**.
@@ -1167,6 +1284,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
         - **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
         - **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
         - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
         ### Free OCR vs Locate (important)
         - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
@@ -1200,7 +1318,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
         outputs=[selected_regions_state, selected_regions_gallery, selection_status],
     )
-    def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size, base_image, selected_regions):
         if scope == "Selected Region":
             regions = list(selected_regions or [])
             if not regions:
@@ -1213,13 +1331,15 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             cleaned_parts = []
             markdown_parts = []
             raw_parts = []
             for i, r in enumerate(regions, 1):
-                cleaned_i, markdown_i, raw_i, _, _ = process_image(
                     r["image"],
                     task,
                     custom_prompt,
                     enable_equation_zoom=enable_equation_zoom,
                     infer_crop_mode=False,
                 )
                 if len(regions) > 1:
                     cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
@@ -1229,11 +1349,13 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
                     cleaned_parts.append(cleaned_i)
                     markdown_parts.append(markdown_i)
                     raw_parts.append(raw_i)
             cleaned = "\n\n".join(cleaned_parts).strip()
             markdown = "\n\n".join(markdown_parts).strip()
             raw = "\n\n".join(raw_parts).strip()
-            crops = _region_gallery_items(regions)
             full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
             region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
             img_out = _draw_selected_region_boxes(full_img, region_boxes)
@@ -1243,6 +1365,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
                 task,
                 custom_prompt,
                 enable_equation_zoom=enable_equation_zoom,
             )
         elif file_path:
             cleaned, markdown, raw, img_out, crops = process_file(
@@ -1251,6 +1374,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
                 custom_prompt,
                 int(page_num),
                 enable_equation_zoom=enable_equation_zoom,
             )
         else:
             msg = "Error: Upload a file or image"
@@ -1260,7 +1384,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     submit_event = btn.click(
         run,
-        [file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
         [text_out, md_out, raw_out, img_out, gallery, download_btn]
     )
     submit_event.then(select_boxes, [task], [tabs])

 GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
 INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
 EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
+EQUATION_LINE_OCR_PROMPT = "<image>\nRead the math expression exactly as written. Return only the equation text."
 EQUATION_ZOOM_MAX_CANDIDATES = 6
 EQUATION_ZOOM_MIN_AREA = 0.05
 EQUATION_ZOOM_MIN_DIM = 0.24
 EQUATION_ZOOM_MAX_ASPECT = 12.0
 EQUATION_DETAIL_MAX_BOXES = 24
 EQUATION_DETAIL_IOU_DEDUPE = 0.7
+EQUATION_LINE_IOU_DEDUPE = 0.55
+EQUATION_LINE_MIN_AREA = 0.0008
+EQUATION_LINE_MIN_W = 0.03
+EQUATION_LINE_MIN_H = 0.01
+EQUATION_LINE_MAX_ASPECT = 30.0
 MATH_LABEL_HINTS = ("formula", "equation", "math")
 MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
 MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
     return refined_refs
+def _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.0):
+    x1 = int(box[0] / 999.0 * img_w)
+    y1 = int(box[1] / 999.0 * img_h)
+    x2 = int(box[2] / 999.0 * img_w)
+    y2 = int(box[3] / 999.0 * img_h)
+    if pad_ratio > 0:
+        pad_x = max(1, int((x2 - x1) * pad_ratio))
+        pad_y = max(1, int((y2 - y1) * pad_ratio))
+        x1 -= pad_x
+        y1 -= pad_y
+        x2 += pad_x
+        y2 += pad_y
+    x1 = max(0, min(img_w - 1, x1))
+    y1 = max(0, min(img_h - 1, y1))
+    x2 = max(x1 + 1, min(img_w, x2))
+    y2 = max(y1 + 1, min(img_h, y2))
+    return (x1, y1, x2, y2)
def _detect_equation_line_boxes(image, infer_crop_mode=None):
    """Run the equation grounding prompt and keep boxes that look like equation lines.

    Args:
        image: Image handed to ``_infer_with_prompt``.
        infer_crop_mode: Forwarded as ``crop_mode`` to the inference call.

    Returns:
        Tuple ``(boxes, detect_raw)``. ``boxes`` is the filtered, de-duplicated
        list of boxes (coordinates on the 0-999 scale used by the size checks
        below), ordered top-to-bottom and then left-to-right. ``detect_raw`` is
        the raw model output of the detection pass.
    """
    detect_raw = _infer_with_prompt(image, EQUATION_ZOOM_PROMPT, crop_mode=infer_crop_mode)
    entries = _extract_grounding_entries(detect_raw)
    if not entries:
        return [], detect_raw

    boxes = []
    for entry in entries:
        label_l = entry["label"].lower()
        if label_l in ("image", "table"):
            continue
        coords = entry["coords"]
        if not coords:
            continue

        # Depends only on the entry, so evaluate it once instead of per box.
        looks_math = (
            any(hint in label_l for hint in MATH_LABEL_HINTS)
            or _math_marker_score(entry["text"]) >= 2
        )

        for box in coords:
            w = (box[2] - box[0]) / 999.0
            h = (box[3] - box[1]) / 999.0
            area = w * h
            # Elongation in either direction; the epsilon avoids dividing by zero.
            aspect = max(w / max(1e-9, h), h / max(1e-9, w))
            if area < EQUATION_LINE_MIN_AREA or w < EQUATION_LINE_MIN_W or h < EQUATION_LINE_MIN_H:
                continue
            if aspect > EQUATION_LINE_MAX_ASPECT:
                continue
            # Boxes without any math signal must clear a larger area threshold.
            if not looks_math and area < 0.004:
                continue
            boxes.append(box)

    boxes = _dedupe_boxes(boxes, EQUATION_LINE_IOU_DEDUPE)
    boxes = sorted(boxes, key=lambda b: (round(b[1], 3), b[0]))
    return boxes, detect_raw
def _process_equation_lines_separately(image, infer_crop_mode=None):
    """OCR every detected equation line on its own crop and merge the results.

    Detects equation-line boxes on ``image``, crops each one with 1% padding,
    and runs ``EQUATION_LINE_OCR_PROMPT`` on the crop with crop mode disabled.

    Args:
        image: Image to process; must provide ``.size`` and ``.crop``.
        infer_crop_mode: Forwarded to the detection pass only.

    Returns:
        ``None`` when no boxes are detected or every crop yields empty text.
        Otherwise a 5-tuple ``(cleaned, markdown, raw, img_out, crops)``:
        one ``Eq N: text`` line per equation, a markdown document with one
        ``### Eq N`` section per equation, the raw detection and per-line model
        output, the image returned by ``draw_bounding_boxes``, and a list of
        ``(crop, label)`` pairs.
    """
    boxes, detect_raw = _detect_equation_line_boxes(image, infer_crop_mode=infer_crop_mode)
    if not boxes:
        return None

    img_w, img_h = image.size
    # Text that already carries one of these delimiters is not wrapped again.
    math_delimiters = ("$$", "\\[", "\\(")

    cleaned_parts = []
    markdown_parts = []
    raw_parts = [f"## Detection\n\n{detect_raw}".strip()]
    refs = []
    crops = []
    for i, box in enumerate(boxes, 1):
        x1, y1, x2, y2 = _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.01)
        crop = image.crop((x1, y1, x2, y2))
        line_raw = _infer_with_prompt(crop, EQUATION_LINE_OCR_PROMPT, crop_mode=False)
        line_clean = clean_output(line_raw, False).strip()
        if not line_clean:
            # Labels keep the detection index, so skipped lines leave gaps
            # in the "Eq N" numbering.
            continue

        line_label = f"Eq {i}"
        line_markdown = line_clean
        if not any(delim in line_markdown for delim in math_delimiters):
            line_markdown = f"$$\n{line_markdown}\n$$"

        cleaned_parts.append(f"{line_label}: {line_clean}")
        markdown_parts.append(f"### {line_label}\n\n{line_markdown}")
        raw_parts.append(f"## {line_label}\n\n{line_raw}")

        # Rebuild a grounding-style reference so the box can be drawn on the
        # full image with the existing drawing helper.
        coord_text = repr([box])
        raw_ref = f'<|ref|>eq_line_{i}<|/ref|><|det|>{coord_text}<|/det|>'
        refs.append((raw_ref, line_label, coord_text))
        crops.append((crop, line_label))

    if not cleaned_parts:
        return None

    img_out, _ = draw_bounding_boxes(image, refs, extract_images=False)
    cleaned = "\n".join(cleaned_parts).strip()
    markdown = "\n\n".join(markdown_parts).strip()
    raw = "\n\n".join(raw_parts).strip()
    return cleaned, markdown, raw, img_out, crops
 @spaces.GPU(duration=90)
+def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
     model.cuda()  # GPU is available here — works on ZeroGPU and locally
     if image is None:
         return "Error: Upload an image", "", "", None, []
+    if not separate_equation_lines and task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
         return "Please enter a prompt", "", "", None, []
     if image.mode in ('RGBA', 'LA', 'P'):
         image = image.convert('RGB')
     image = ImageOps.exif_transpose(image)
+    if separate_equation_lines:
+        separate_result = _process_equation_lines_separately(image, infer_crop_mode=infer_crop_mode)
+        if separate_result is not None:
+            return separate_result
+        msg = "No separate equation lines detected. Try Selected Region + freehand highlight around the equation steps."
+        return msg, msg, msg, None, []
     if task == "✏️ Custom":
         prompt = f"<image>\n{custom_prompt.strip()}"
     return cleaned, markdown, result_for_layout, img_out, crops
 @spaces.GPU(duration=90)
+def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
     doc = fitz.open(path)
     total_pages = len(doc)
     if page_num < 1 or page_num > total_pages:
         custom_prompt,
         enable_equation_zoom=enable_equation_zoom,
         infer_crop_mode=infer_crop_mode,
+        separate_equation_lines=separate_equation_lines,
     )
+def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
     if not path:
         return "Error: Upload a file", "", "", None, []
     if path.lower().endswith('.pdf'):
             page_num,
             enable_equation_zoom=enable_equation_zoom,
             infer_crop_mode=infer_crop_mode,
+            separate_equation_lines=separate_equation_lines,
         )
     else:
         return process_image(
             custom_prompt,
             enable_equation_zoom=enable_equation_zoom,
             infer_crop_mode=infer_crop_mode,
+            separate_equation_lines=separate_equation_lines,
         )
 def _extract_editor_background(editor_value):
 def _region_gallery_items(regions):
     return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
+def _label_gallery_items(items, prefix=None):
+    labeled = []
+    for i, item in enumerate(items, 1):
+        if isinstance(item, tuple) and len(item) >= 2:
+            img, label = item[0], str(item[1])
+        else:
+            img, label = item, f"Item {i}"
+        if prefix:
+            label = f"{prefix} - {label}"
+        labeled.append((img, label))
+    return labeled
 def _reset_selected_regions():
     return [], [], "No saved regions."
                     except TypeError:
                         editor_kwargs["eraser"] = gr.Eraser()
                 region_editor = gr.ImageEditor(
+                    label="Main image workspace. Recommended: freehand/highlight the target area, then click Add Region. (Crop tool for rectangles is optional.)",
                     type="pil",
                     height=300,
                     **editor_kwargs,
             selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
             task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
             equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
+            separate_eq_lines = gr.Checkbox(label="Detect Equation Lines Separately", value=False)
             prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
             btn = gr.Button("Extract", variant="primary", size="lg")
            - `Entire Page` for the full page.
            - `Selected Region` for a specific area.
         3. For `Selected Region`, use the **Image Workspace**:
+           - Recommended: freehand selection (draw/highlight target); app uses an automatic bounding box around your marks.
+           - Optional rectangle selection: use the **Crop** tool.
            - Freehand/highlight ink is semi-transparent so underlying content stays visible.
            - Optional multi-select: click **Add Region** after each selection.
            Then click **Extract**.
         - **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
         - **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
         - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
+        - **Detect Equation Lines Separately**: Detects likely equation-line boxes and OCRs each line independently to reduce merged multi-step equations.
         ### Free OCR vs Locate (important)
         - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
         outputs=[selected_regions_state, selected_regions_gallery, selection_status],
     )
+    def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, detect_eq_lines, scope, region_value, base_size, base_image, selected_regions):
         if scope == "Selected Region":
             regions = list(selected_regions or [])
             if not regions:
             cleaned_parts = []
             markdown_parts = []
             raw_parts = []
+            line_crops = []
             for i, r in enumerate(regions, 1):
+                cleaned_i, markdown_i, raw_i, _, crops_i = process_image(
                     r["image"],
                     task,
                     custom_prompt,
                     enable_equation_zoom=enable_equation_zoom,
                     infer_crop_mode=False,
+                    separate_equation_lines=detect_eq_lines,
                 )
                 if len(regions) > 1:
                     cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
                     cleaned_parts.append(cleaned_i)
                     markdown_parts.append(markdown_i)
                     raw_parts.append(raw_i)
+                if detect_eq_lines and crops_i:
+                    line_crops.extend(_label_gallery_items(crops_i, prefix=f"Region {i}" if len(regions) > 1 else None))
             cleaned = "\n\n".join(cleaned_parts).strip()
             markdown = "\n\n".join(markdown_parts).strip()
             raw = "\n\n".join(raw_parts).strip()
+            crops = line_crops if line_crops else _region_gallery_items(regions)
             full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
             region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
             img_out = _draw_selected_region_boxes(full_img, region_boxes)
                 task,
                 custom_prompt,
                 enable_equation_zoom=enable_equation_zoom,
+                separate_equation_lines=detect_eq_lines,
             )
         elif file_path:
             cleaned, markdown, raw, img_out, crops = process_file(
                 custom_prompt,
                 int(page_num),
                 enable_equation_zoom=enable_equation_zoom,
+                separate_equation_lines=detect_eq_lines,
             )
         else:
             msg = "Error: Upload a file or image"
     submit_event = btn.click(
         run,
+        [file_in, task, prompt, page_selector, equation_zoom, separate_eq_lines, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
         [text_out, md_out, raw_out, img_out, gallery, download_btn]
     )
     submit_event.then(select_boxes, [task], [tabs])