ricklon committed on
Commit
0ef2109
·
1 Parent(s): 930195f

Add equation-line separate OCR mode and freehand-first guidance

Browse files
Files changed (1) hide show
  1. app.py +135 -11
app.py CHANGED
@@ -51,6 +51,7 @@ CROP_MODE = True
51
  GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
52
  INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
53
  EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
 
54
  EQUATION_ZOOM_MAX_CANDIDATES = 6
55
  EQUATION_ZOOM_MIN_AREA = 0.05
56
  EQUATION_ZOOM_MIN_DIM = 0.24
@@ -58,6 +59,11 @@ EQUATION_ZOOM_PADDING = 0.025
58
  EQUATION_ZOOM_MAX_ASPECT = 12.0
59
  EQUATION_DETAIL_MAX_BOXES = 24
60
  EQUATION_DETAIL_IOU_DEDUPE = 0.7
 
 
 
 
 
61
  MATH_LABEL_HINTS = ("formula", "equation", "math")
62
  MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
63
  MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
@@ -682,17 +688,112 @@ def _refine_equation_refs(image, raw_text):
682
 
683
  return refined_refs
684
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
  @spaces.GPU(duration=90)
686
- def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None):
687
  model.cuda() # GPU is available here — works on ZeroGPU and locally
688
  if image is None:
689
  return "Error: Upload an image", "", "", None, []
690
- if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
691
  return "Please enter a prompt", "", "", None, []
692
 
693
  if image.mode in ('RGBA', 'LA', 'P'):
694
  image = image.convert('RGB')
695
  image = ImageOps.exif_transpose(image)
 
 
 
 
 
 
 
696
 
697
  if task == "✏️ Custom":
698
  prompt = f"<image>\n{custom_prompt.strip()}"
@@ -730,7 +831,7 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
730
  return cleaned, markdown, result_for_layout, img_out, crops
731
 
732
  @spaces.GPU(duration=90)
733
- def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
734
  doc = fitz.open(path)
735
  total_pages = len(doc)
736
  if page_num < 1 or page_num > total_pages:
@@ -747,9 +848,10 @@ def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True,
747
  custom_prompt,
748
  enable_equation_zoom=enable_equation_zoom,
749
  infer_crop_mode=infer_crop_mode,
 
750
  )
751
 
752
- def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
753
  if not path:
754
  return "Error: Upload a file", "", "", None, []
755
  if path.lower().endswith('.pdf'):
@@ -760,6 +862,7 @@ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True,
760
  page_num,
761
  enable_equation_zoom=enable_equation_zoom,
762
  infer_crop_mode=infer_crop_mode,
 
763
  )
764
  else:
765
  return process_image(
@@ -768,6 +871,7 @@ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True,
768
  custom_prompt,
769
  enable_equation_zoom=enable_equation_zoom,
770
  infer_crop_mode=infer_crop_mode,
 
771
  )
772
 
773
  def _extract_editor_background(editor_value):
@@ -946,6 +1050,18 @@ def _draw_selected_region_boxes(image, boxes):
946
  def _region_gallery_items(regions):
947
  return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
948
 
 
 
 
 
 
 
 
 
 
 
 
 
949
  def _reset_selected_regions():
950
  return [], [], "No saved regions."
951
 
@@ -1077,7 +1193,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1077
  except TypeError:
1078
  editor_kwargs["eraser"] = gr.Eraser()
1079
  region_editor = gr.ImageEditor(
1080
- label="Main image workspace. Rectangle selection uses the Crop tool. Freehand/highlight uses a translucent overlay so you can still read content beneath.",
1081
  type="pil",
1082
  height=300,
1083
  **editor_kwargs,
@@ -1094,6 +1210,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1094
  selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
1095
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
1096
  equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
 
1097
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
1098
  btn = gr.Button("Extract", variant="primary", size="lg")
1099
 
@@ -1150,8 +1267,8 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1150
  - `Entire Page` for the full page.
1151
  - `Selected Region` for a specific area.
1152
  3. For `Selected Region`, use the **Image Workspace**:
1153
- - Rectangle selection: use the **Crop** tool.
1154
- - Freehand selection: draw/highlight the target; app uses an automatic bounding box around your marks.
1155
  - Freehand/highlight ink is semi-transparent so underlying content stays visible.
1156
  - Optional multi-select: click **Add Region** after each selection.
1157
  Then click **Extract**.
@@ -1167,6 +1284,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1167
  - **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
1168
  - **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
1169
  - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
 
1170
 
1171
  ### Free OCR vs Locate (important)
1172
  - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
@@ -1200,7 +1318,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1200
  outputs=[selected_regions_state, selected_regions_gallery, selection_status],
1201
  )
1202
 
1203
- def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size, base_image, selected_regions):
1204
  if scope == "Selected Region":
1205
  regions = list(selected_regions or [])
1206
  if not regions:
@@ -1213,13 +1331,15 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1213
  cleaned_parts = []
1214
  markdown_parts = []
1215
  raw_parts = []
 
1216
  for i, r in enumerate(regions, 1):
1217
- cleaned_i, markdown_i, raw_i, _, _ = process_image(
1218
  r["image"],
1219
  task,
1220
  custom_prompt,
1221
  enable_equation_zoom=enable_equation_zoom,
1222
  infer_crop_mode=False,
 
1223
  )
1224
  if len(regions) > 1:
1225
  cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
@@ -1229,11 +1349,13 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1229
  cleaned_parts.append(cleaned_i)
1230
  markdown_parts.append(markdown_i)
1231
  raw_parts.append(raw_i)
 
 
1232
 
1233
  cleaned = "\n\n".join(cleaned_parts).strip()
1234
  markdown = "\n\n".join(markdown_parts).strip()
1235
  raw = "\n\n".join(raw_parts).strip()
1236
- crops = _region_gallery_items(regions)
1237
  full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
1238
  region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
1239
  img_out = _draw_selected_region_boxes(full_img, region_boxes)
@@ -1243,6 +1365,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1243
  task,
1244
  custom_prompt,
1245
  enable_equation_zoom=enable_equation_zoom,
 
1246
  )
1247
  elif file_path:
1248
  cleaned, markdown, raw, img_out, crops = process_file(
@@ -1251,6 +1374,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1251
  custom_prompt,
1252
  int(page_num),
1253
  enable_equation_zoom=enable_equation_zoom,
 
1254
  )
1255
  else:
1256
  msg = "Error: Upload a file or image"
@@ -1260,7 +1384,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1260
 
1261
  submit_event = btn.click(
1262
  run,
1263
- [file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
1264
  [text_out, md_out, raw_out, img_out, gallery, download_btn]
1265
  )
1266
  submit_event.then(select_boxes, [task], [tabs])
 
51
  GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
52
  INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
53
  EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
54
+ EQUATION_LINE_OCR_PROMPT = "<image>\nRead the math expression exactly as written. Return only the equation text."
55
  EQUATION_ZOOM_MAX_CANDIDATES = 6
56
  EQUATION_ZOOM_MIN_AREA = 0.05
57
  EQUATION_ZOOM_MIN_DIM = 0.24
 
59
  EQUATION_ZOOM_MAX_ASPECT = 12.0
60
  EQUATION_DETAIL_MAX_BOXES = 24
61
  EQUATION_DETAIL_IOU_DEDUPE = 0.7
62
+ EQUATION_LINE_IOU_DEDUPE = 0.55
63
+ EQUATION_LINE_MIN_AREA = 0.0008
64
+ EQUATION_LINE_MIN_W = 0.03
65
+ EQUATION_LINE_MIN_H = 0.01
66
+ EQUATION_LINE_MAX_ASPECT = 30.0
67
  MATH_LABEL_HINTS = ("formula", "equation", "math")
68
  MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
69
  MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
 
688
 
689
  return refined_refs
690
 
691
+ def _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.0):
692
+ x1 = int(box[0] / 999.0 * img_w)
693
+ y1 = int(box[1] / 999.0 * img_h)
694
+ x2 = int(box[2] / 999.0 * img_w)
695
+ y2 = int(box[3] / 999.0 * img_h)
696
+ if pad_ratio > 0:
697
+ pad_x = max(1, int((x2 - x1) * pad_ratio))
698
+ pad_y = max(1, int((y2 - y1) * pad_ratio))
699
+ x1 -= pad_x
700
+ y1 -= pad_y
701
+ x2 += pad_x
702
+ y2 += pad_y
703
+ x1 = max(0, min(img_w - 1, x1))
704
+ y1 = max(0, min(img_h - 1, y1))
705
+ x2 = max(x1 + 1, min(img_w, x2))
706
+ y2 = max(y1 + 1, min(img_h, y2))
707
+ return (x1, y1, x2, y2)
708
+
709
def _detect_equation_line_boxes(image, infer_crop_mode=None):
    """Run the equation-locating prompt and keep boxes that look like math lines.

    Args:
        image: Image handed to ``_infer_with_prompt``.
        infer_crop_mode: Forwarded unchanged as ``crop_mode`` to
            ``_infer_with_prompt``.

    Returns:
        ``(boxes, detect_raw)``. ``boxes`` holds the surviving boxes after
        size/aspect filtering and IoU de-duplication, ordered top-to-bottom
        and then left-to-right. ``detect_raw`` is the raw detection output.
        ``boxes`` is empty when no grounding entries were parsed.
    """
    detect_raw = _infer_with_prompt(image, EQUATION_ZOOM_PROMPT, crop_mode=infer_crop_mode)
    entries = _extract_grounding_entries(detect_raw)
    if not entries:
        return [], detect_raw

    boxes = []
    for entry in entries:
        label_l = entry["label"].lower()
        if label_l in ("image", "table"):
            continue

        # Whether the entry reads as math depends only on its label and text,
        # so it is evaluated once per entry instead of once per box.
        looks_math = (
            any(hint in label_l for hint in MATH_LABEL_HINTS)
            or _math_marker_score(entry["text"]) >= 2
        )

        for box in entry["coords"]:
            # Box coordinates are on the 0-999 grid; work in 0-1 fractions.
            w = (box[2] - box[0]) / 999.0
            h = (box[3] - box[1]) / 999.0
            area = w * h
            if area < EQUATION_LINE_MIN_AREA or w < EQUATION_LINE_MIN_W or h < EQUATION_LINE_MIN_H:
                continue
            # The 1e-9 floor only guards the division; degenerate boxes were
            # already rejected by the minimum width/height checks above.
            aspect = max(w / max(1e-9, h), h / max(1e-9, w))
            if aspect > EQUATION_LINE_MAX_ASPECT:
                continue
            # Small boxes are kept only when the entry looks like math.
            if not looks_math and area < 0.004:
                continue
            boxes.append(box)

    boxes = _dedupe_boxes(boxes, EQUATION_LINE_IOU_DEDUPE)
    boxes = sorted(boxes, key=lambda b: (round(b[1], 3), b[0]))
    return boxes, detect_raw
738
+
739
def _process_equation_lines_separately(image, infer_crop_mode=None):
    """OCR each detected equation line on its own crop and merge the results.

    Args:
        image: Source image; its ``size`` and ``crop`` are used here.
        infer_crop_mode: Forwarded to ``_detect_equation_line_boxes`` for the
            detection pass. Per-line OCR always runs with ``crop_mode=False``.

    Returns:
        ``None`` when no boxes are detected or every line OCRs to empty text.
        Otherwise ``(cleaned, markdown, raw, img_out, crops)``, where
        ``crops`` is a list of ``(crop_image, label)`` pairs and ``img_out``
        is the first value returned by ``draw_bounding_boxes``.
    """
    detected_boxes, detect_raw = _detect_equation_line_boxes(image, infer_crop_mode=infer_crop_mode)
    if not detected_boxes:
        return None

    width, height = image.size
    math_delimiters = ("$$", "\\[", "\\(")

    plain_lines = []
    md_sections = []
    raw_sections = [f"## Detection\n\n{detect_raw}".strip()]
    grounding_refs = []
    gallery = []

    for index, region in enumerate(detected_boxes, 1):
        pixel_box = _norm_box_to_pixels(region, width, height, pad_ratio=0.01)
        snippet = image.crop(pixel_box)
        ocr_raw = _infer_with_prompt(snippet, EQUATION_LINE_OCR_PROMPT, crop_mode=False)
        ocr_text = clean_output(ocr_raw, False).strip()
        if not ocr_text:
            # The label keeps the detection index, so numbering may skip.
            continue

        tag = f"Eq {index}"
        # Wrap as display math only when the text has no math delimiters yet.
        if any(token in ocr_text for token in math_delimiters):
            rendered = ocr_text
        else:
            rendered = f"$$\n{ocr_text}\n$$"

        plain_lines.append(f"{tag}: {ocr_text}")
        md_sections.append(f"### {tag}\n\n{rendered}")
        raw_sections.append(f"## {tag}\n\n{ocr_raw}")

        coords_repr = repr([region])
        ref_markup = f'<|ref|>eq_line_{index}<|/ref|><|det|>{coords_repr}<|/det|>'
        grounding_refs.append((ref_markup, tag, coords_repr))
        gallery.append((snippet, tag))

    if not plain_lines:
        return None

    annotated, _ = draw_bounding_boxes(image, grounding_refs, extract_images=False)
    return (
        "\n".join(plain_lines).strip(),
        "\n\n".join(md_sections).strip(),
        "\n\n".join(raw_sections).strip(),
        annotated,
        gallery,
    )
778
+
779
  @spaces.GPU(duration=90)
780
+ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
781
  model.cuda() # GPU is available here — works on ZeroGPU and locally
782
  if image is None:
783
  return "Error: Upload an image", "", "", None, []
784
+ if not separate_equation_lines and task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
785
  return "Please enter a prompt", "", "", None, []
786
 
787
  if image.mode in ('RGBA', 'LA', 'P'):
788
  image = image.convert('RGB')
789
  image = ImageOps.exif_transpose(image)
790
+
791
+ if separate_equation_lines:
792
+ separate_result = _process_equation_lines_separately(image, infer_crop_mode=infer_crop_mode)
793
+ if separate_result is not None:
794
+ return separate_result
795
+ msg = "No separate equation lines detected. Try Selected Region + freehand highlight around the equation steps."
796
+ return msg, msg, msg, None, []
797
 
798
  if task == "✏️ Custom":
799
  prompt = f"<image>\n{custom_prompt.strip()}"
 
831
  return cleaned, markdown, result_for_layout, img_out, crops
832
 
833
  @spaces.GPU(duration=90)
834
+ def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
835
  doc = fitz.open(path)
836
  total_pages = len(doc)
837
  if page_num < 1 or page_num > total_pages:
 
848
  custom_prompt,
849
  enable_equation_zoom=enable_equation_zoom,
850
  infer_crop_mode=infer_crop_mode,
851
+ separate_equation_lines=separate_equation_lines,
852
  )
853
 
854
+ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
855
  if not path:
856
  return "Error: Upload a file", "", "", None, []
857
  if path.lower().endswith('.pdf'):
 
862
  page_num,
863
  enable_equation_zoom=enable_equation_zoom,
864
  infer_crop_mode=infer_crop_mode,
865
+ separate_equation_lines=separate_equation_lines,
866
  )
867
  else:
868
  return process_image(
 
871
  custom_prompt,
872
  enable_equation_zoom=enable_equation_zoom,
873
  infer_crop_mode=infer_crop_mode,
874
+ separate_equation_lines=separate_equation_lines,
875
  )
876
 
877
  def _extract_editor_background(editor_value):
 
1050
  def _region_gallery_items(regions):
1051
  return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
1052
 
1053
+ def _label_gallery_items(items, prefix=None):
1054
+ labeled = []
1055
+ for i, item in enumerate(items, 1):
1056
+ if isinstance(item, tuple) and len(item) >= 2:
1057
+ img, label = item[0], str(item[1])
1058
+ else:
1059
+ img, label = item, f"Item {i}"
1060
+ if prefix:
1061
+ label = f"{prefix} - {label}"
1062
+ labeled.append((img, label))
1063
+ return labeled
1064
+
1065
  def _reset_selected_regions():
1066
  return [], [], "No saved regions."
1067
 
 
1193
  except TypeError:
1194
  editor_kwargs["eraser"] = gr.Eraser()
1195
  region_editor = gr.ImageEditor(
1196
+ label="Main image workspace. Recommended: freehand/highlight the target area, then click Add Region. (Crop tool for rectangles is optional.)",
1197
  type="pil",
1198
  height=300,
1199
  **editor_kwargs,
 
1210
  selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
1211
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
1212
  equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
1213
+ separate_eq_lines = gr.Checkbox(label="Detect Equation Lines Separately", value=False)
1214
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
1215
  btn = gr.Button("Extract", variant="primary", size="lg")
1216
 
 
1267
  - `Entire Page` for the full page.
1268
  - `Selected Region` for a specific area.
1269
  3. For `Selected Region`, use the **Image Workspace**:
1270
+ - Recommended: freehand selection (draw/highlight target); app uses an automatic bounding box around your marks.
1271
+ - Optional rectangle selection: use the **Crop** tool.
1272
  - Freehand/highlight ink is semi-transparent so underlying content stays visible.
1273
  - Optional multi-select: click **Add Region** after each selection.
1274
  Then click **Extract**.
 
1284
  - **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
1285
  - **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
1286
  - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
1287
+ - **Detect Equation Lines Separately**: Detects likely equation-line boxes and OCRs each line independently to reduce merged multi-step equations.
1288
 
1289
  ### Free OCR vs Locate (important)
1290
  - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
 
1318
  outputs=[selected_regions_state, selected_regions_gallery, selection_status],
1319
  )
1320
 
1321
+ def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, detect_eq_lines, scope, region_value, base_size, base_image, selected_regions):
1322
  if scope == "Selected Region":
1323
  regions = list(selected_regions or [])
1324
  if not regions:
 
1331
  cleaned_parts = []
1332
  markdown_parts = []
1333
  raw_parts = []
1334
+ line_crops = []
1335
  for i, r in enumerate(regions, 1):
1336
+ cleaned_i, markdown_i, raw_i, _, crops_i = process_image(
1337
  r["image"],
1338
  task,
1339
  custom_prompt,
1340
  enable_equation_zoom=enable_equation_zoom,
1341
  infer_crop_mode=False,
1342
+ separate_equation_lines=detect_eq_lines,
1343
  )
1344
  if len(regions) > 1:
1345
  cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
 
1349
  cleaned_parts.append(cleaned_i)
1350
  markdown_parts.append(markdown_i)
1351
  raw_parts.append(raw_i)
1352
+ if detect_eq_lines and crops_i:
1353
+ line_crops.extend(_label_gallery_items(crops_i, prefix=f"Region {i}" if len(regions) > 1 else None))
1354
 
1355
  cleaned = "\n\n".join(cleaned_parts).strip()
1356
  markdown = "\n\n".join(markdown_parts).strip()
1357
  raw = "\n\n".join(raw_parts).strip()
1358
+ crops = line_crops if line_crops else _region_gallery_items(regions)
1359
  full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
1360
  region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
1361
  img_out = _draw_selected_region_boxes(full_img, region_boxes)
 
1365
  task,
1366
  custom_prompt,
1367
  enable_equation_zoom=enable_equation_zoom,
1368
+ separate_equation_lines=detect_eq_lines,
1369
  )
1370
  elif file_path:
1371
  cleaned, markdown, raw, img_out, crops = process_file(
 
1374
  custom_prompt,
1375
  int(page_num),
1376
  enable_equation_zoom=enable_equation_zoom,
1377
+ separate_equation_lines=detect_eq_lines,
1378
  )
1379
  else:
1380
  msg = "Error: Upload a file or image"
 
1384
 
1385
  submit_event = btn.click(
1386
  run,
1387
+ [file_in, task, prompt, page_selector, equation_zoom, separate_eq_lines, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
1388
  [text_out, md_out, raw_out, img_out, gallery, download_btn]
1389
  )
1390
  submit_event.then(select_boxes, [task], [tabs])