Spaces:
Running on Zero
Running on Zero
Add equation-line separate OCR mode and freehand-first guidance
Browse files
app.py
CHANGED
|
@@ -51,6 +51,7 @@ CROP_MODE = True
|
|
| 51 |
GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
|
| 52 |
INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
|
| 53 |
EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
|
|
|
|
| 54 |
EQUATION_ZOOM_MAX_CANDIDATES = 6
|
| 55 |
EQUATION_ZOOM_MIN_AREA = 0.05
|
| 56 |
EQUATION_ZOOM_MIN_DIM = 0.24
|
|
@@ -58,6 +59,11 @@ EQUATION_ZOOM_PADDING = 0.025
|
|
| 58 |
EQUATION_ZOOM_MAX_ASPECT = 12.0
|
| 59 |
EQUATION_DETAIL_MAX_BOXES = 24
|
| 60 |
EQUATION_DETAIL_IOU_DEDUPE = 0.7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
MATH_LABEL_HINTS = ("formula", "equation", "math")
|
| 62 |
MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
|
| 63 |
MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
|
|
@@ -682,17 +688,112 @@ def _refine_equation_refs(image, raw_text):
|
|
| 682 |
|
| 683 |
return refined_refs
|
| 684 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
@spaces.GPU(duration=90)
|
| 686 |
-
def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None):
|
| 687 |
model.cuda() # GPU is available here — works on ZeroGPU and locally
|
| 688 |
if image is None:
|
| 689 |
return "Error: Upload an image", "", "", None, []
|
| 690 |
-
if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
|
| 691 |
return "Please enter a prompt", "", "", None, []
|
| 692 |
|
| 693 |
if image.mode in ('RGBA', 'LA', 'P'):
|
| 694 |
image = image.convert('RGB')
|
| 695 |
image = ImageOps.exif_transpose(image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 696 |
|
| 697 |
if task == "✏️ Custom":
|
| 698 |
prompt = f"<image>\n{custom_prompt.strip()}"
|
|
@@ -730,7 +831,7 @@ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_c
|
|
| 730 |
return cleaned, markdown, result_for_layout, img_out, crops
|
| 731 |
|
| 732 |
@spaces.GPU(duration=90)
|
| 733 |
-
def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
|
| 734 |
doc = fitz.open(path)
|
| 735 |
total_pages = len(doc)
|
| 736 |
if page_num < 1 or page_num > total_pages:
|
|
@@ -747,9 +848,10 @@ def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True,
|
|
| 747 |
custom_prompt,
|
| 748 |
enable_equation_zoom=enable_equation_zoom,
|
| 749 |
infer_crop_mode=infer_crop_mode,
|
|
|
|
| 750 |
)
|
| 751 |
|
| 752 |
-
def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
|
| 753 |
if not path:
|
| 754 |
return "Error: Upload a file", "", "", None, []
|
| 755 |
if path.lower().endswith('.pdf'):
|
|
@@ -760,6 +862,7 @@ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True,
|
|
| 760 |
page_num,
|
| 761 |
enable_equation_zoom=enable_equation_zoom,
|
| 762 |
infer_crop_mode=infer_crop_mode,
|
|
|
|
| 763 |
)
|
| 764 |
else:
|
| 765 |
return process_image(
|
|
@@ -768,6 +871,7 @@ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True,
|
|
| 768 |
custom_prompt,
|
| 769 |
enable_equation_zoom=enable_equation_zoom,
|
| 770 |
infer_crop_mode=infer_crop_mode,
|
|
|
|
| 771 |
)
|
| 772 |
|
| 773 |
def _extract_editor_background(editor_value):
|
|
@@ -946,6 +1050,18 @@ def _draw_selected_region_boxes(image, boxes):
|
|
| 946 |
def _region_gallery_items(regions):
|
| 947 |
return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
|
| 948 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
def _reset_selected_regions():
|
| 950 |
return [], [], "No saved regions."
|
| 951 |
|
|
@@ -1077,7 +1193,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1077 |
except TypeError:
|
| 1078 |
editor_kwargs["eraser"] = gr.Eraser()
|
| 1079 |
region_editor = gr.ImageEditor(
|
| 1080 |
-
label="Main image workspace.
|
| 1081 |
type="pil",
|
| 1082 |
height=300,
|
| 1083 |
**editor_kwargs,
|
|
@@ -1094,6 +1210,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1094 |
selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
|
| 1095 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 1096 |
equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
|
|
|
|
| 1097 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 1098 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 1099 |
|
|
@@ -1150,8 +1267,8 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1150 |
- `Entire Page` for the full page.
|
| 1151 |
- `Selected Region` for a specific area.
|
| 1152 |
3. For `Selected Region`, use the **Image Workspace**:
|
| 1153 |
-
-
|
| 1154 |
-
-
|
| 1155 |
- Freehand/highlight ink is semi-transparent so underlying content stays visible.
|
| 1156 |
- Optional multi-select: click **Add Region** after each selection.
|
| 1157 |
Then click **Extract**.
|
|
@@ -1167,6 +1284,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1167 |
- **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
|
| 1168 |
- **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
|
| 1169 |
- **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
|
|
|
|
| 1170 |
|
| 1171 |
### Free OCR vs Locate (important)
|
| 1172 |
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
|
|
@@ -1200,7 +1318,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1200 |
outputs=[selected_regions_state, selected_regions_gallery, selection_status],
|
| 1201 |
)
|
| 1202 |
|
| 1203 |
-
def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size, base_image, selected_regions):
|
| 1204 |
if scope == "Selected Region":
|
| 1205 |
regions = list(selected_regions or [])
|
| 1206 |
if not regions:
|
|
@@ -1213,13 +1331,15 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1213 |
cleaned_parts = []
|
| 1214 |
markdown_parts = []
|
| 1215 |
raw_parts = []
|
|
|
|
| 1216 |
for i, r in enumerate(regions, 1):
|
| 1217 |
-
cleaned_i, markdown_i, raw_i, _,
|
| 1218 |
r["image"],
|
| 1219 |
task,
|
| 1220 |
custom_prompt,
|
| 1221 |
enable_equation_zoom=enable_equation_zoom,
|
| 1222 |
infer_crop_mode=False,
|
|
|
|
| 1223 |
)
|
| 1224 |
if len(regions) > 1:
|
| 1225 |
cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
|
|
@@ -1229,11 +1349,13 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1229 |
cleaned_parts.append(cleaned_i)
|
| 1230 |
markdown_parts.append(markdown_i)
|
| 1231 |
raw_parts.append(raw_i)
|
|
|
|
|
|
|
| 1232 |
|
| 1233 |
cleaned = "\n\n".join(cleaned_parts).strip()
|
| 1234 |
markdown = "\n\n".join(markdown_parts).strip()
|
| 1235 |
raw = "\n\n".join(raw_parts).strip()
|
| 1236 |
-
crops = _region_gallery_items(regions)
|
| 1237 |
full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
|
| 1238 |
region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
|
| 1239 |
img_out = _draw_selected_region_boxes(full_img, region_boxes)
|
|
@@ -1243,6 +1365,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1243 |
task,
|
| 1244 |
custom_prompt,
|
| 1245 |
enable_equation_zoom=enable_equation_zoom,
|
|
|
|
| 1246 |
)
|
| 1247 |
elif file_path:
|
| 1248 |
cleaned, markdown, raw, img_out, crops = process_file(
|
|
@@ -1251,6 +1374,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1251 |
custom_prompt,
|
| 1252 |
int(page_num),
|
| 1253 |
enable_equation_zoom=enable_equation_zoom,
|
|
|
|
| 1254 |
)
|
| 1255 |
else:
|
| 1256 |
msg = "Error: Upload a file or image"
|
|
@@ -1260,7 +1384,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1260 |
|
| 1261 |
submit_event = btn.click(
|
| 1262 |
run,
|
| 1263 |
-
[file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
|
| 1264 |
[text_out, md_out, raw_out, img_out, gallery, download_btn]
|
| 1265 |
)
|
| 1266 |
submit_event.then(select_boxes, [task], [tabs])
|
|
|
|
| 51 |
GROUNDING_PATTERN = re.compile(r'<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>', re.DOTALL)
|
| 52 |
INFER_DEBUG_FILTERS = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
|
| 53 |
EQUATION_ZOOM_PROMPT = "<image>\n<|grounding|>Locate each individual equation or math line."
|
| 54 |
+
EQUATION_LINE_OCR_PROMPT = "<image>\nRead the math expression exactly as written. Return only the equation text."
|
| 55 |
EQUATION_ZOOM_MAX_CANDIDATES = 6
|
| 56 |
EQUATION_ZOOM_MIN_AREA = 0.05
|
| 57 |
EQUATION_ZOOM_MIN_DIM = 0.24
|
|
|
|
| 59 |
EQUATION_ZOOM_MAX_ASPECT = 12.0
|
| 60 |
EQUATION_DETAIL_MAX_BOXES = 24
|
| 61 |
EQUATION_DETAIL_IOU_DEDUPE = 0.7
|
| 62 |
+
EQUATION_LINE_IOU_DEDUPE = 0.55
|
| 63 |
+
EQUATION_LINE_MIN_AREA = 0.0008
|
| 64 |
+
EQUATION_LINE_MIN_W = 0.03
|
| 65 |
+
EQUATION_LINE_MIN_H = 0.01
|
| 66 |
+
EQUATION_LINE_MAX_ASPECT = 30.0
|
| 67 |
MATH_LABEL_HINTS = ("formula", "equation", "math")
|
| 68 |
MATH_STRONG_MARKERS = ("\\(", "\\[", "\\frac", "\\sum", "\\int", "\\sqrt", "\\lim", "\\begin{")
|
| 69 |
MATH_WEAK_MARKERS = ("^", "_", "=", "+", "\\cdot", "\\times")
|
|
|
|
| 688 |
|
| 689 |
return refined_refs
|
| 690 |
|
| 691 |
+
def _norm_box_to_pixels(box, img_w, img_h, pad_ratio=0.0):
|
| 692 |
+
x1 = int(box[0] / 999.0 * img_w)
|
| 693 |
+
y1 = int(box[1] / 999.0 * img_h)
|
| 694 |
+
x2 = int(box[2] / 999.0 * img_w)
|
| 695 |
+
y2 = int(box[3] / 999.0 * img_h)
|
| 696 |
+
if pad_ratio > 0:
|
| 697 |
+
pad_x = max(1, int((x2 - x1) * pad_ratio))
|
| 698 |
+
pad_y = max(1, int((y2 - y1) * pad_ratio))
|
| 699 |
+
x1 -= pad_x
|
| 700 |
+
y1 -= pad_y
|
| 701 |
+
x2 += pad_x
|
| 702 |
+
y2 += pad_y
|
| 703 |
+
x1 = max(0, min(img_w - 1, x1))
|
| 704 |
+
y1 = max(0, min(img_h - 1, y1))
|
| 705 |
+
x2 = max(x1 + 1, min(img_w, x2))
|
| 706 |
+
y2 = max(y1 + 1, min(img_h, y2))
|
| 707 |
+
return (x1, y1, x2, y2)
|
| 708 |
+
|
| 709 |
+
def _detect_equation_line_boxes(image, infer_crop_mode=None):
    """Run the equation-locating prompt and keep plausible line boxes.

    Args:
        image: Image passed through to ``_infer_with_prompt``.
        infer_crop_mode: Forwarded as ``crop_mode`` to ``_infer_with_prompt``.

    Returns:
        ``(boxes, detect_raw)`` where ``detect_raw`` is the raw output of the
        detection pass and ``boxes`` is the filtered, de-duplicated list of
        boxes sorted by their second coordinate, then their first. ``boxes``
        is empty when no grounding entries were extracted.
    """
    detect_raw = _infer_with_prompt(image, EQUATION_ZOOM_PROMPT, crop_mode=infer_crop_mode)
    grounded = _extract_grounding_entries(detect_raw)
    if not grounded:
        return [], detect_raw

    kept = []
    for item in grounded:
        label = item["label"].lower()
        snippet = item["text"]
        # Entries labelled as images or tables are never equation lines.
        if label in ("image", "table"):
            continue

        for candidate in item["coords"]:
            # Box sides as fractions of the 0..999 coordinate range.
            width = (candidate[2] - candidate[0]) / 999.0
            height = (candidate[3] - candidate[1]) / 999.0
            area = width * height
            aspect = max(width / max(1e-9, height), height / max(1e-9, width))

            has_math_label = any(hint in label for hint in MATH_LABEL_HINTS)
            looks_math = has_math_label or _math_marker_score(snippet) >= 2

            too_small = (
                area < EQUATION_LINE_MIN_AREA
                or width < EQUATION_LINE_MIN_W
                or height < EQUATION_LINE_MIN_H
            )
            too_elongated = aspect > EQUATION_LINE_MAX_ASPECT
            # Boxes without math evidence must clear a larger area bar.
            weak_non_math = not looks_math and area < 0.004
            if too_small or too_elongated or weak_non_math:
                continue

            kept.append(candidate)

    deduped = _dedupe_boxes(kept, EQUATION_LINE_IOU_DEDUPE)
    ordered = sorted(deduped, key=lambda b: (round(b[1], 3), b[0]))
    return ordered, detect_raw
|
| 738 |
+
|
| 739 |
+
def _process_equation_lines_separately(image, infer_crop_mode=None):
    """OCR each detected equation-line box on its own crop.

    Args:
        image: Source image; must expose ``size`` and ``crop``.
        infer_crop_mode: Forwarded to ``_detect_equation_line_boxes`` for the
            detection pass. Per-line OCR always runs with ``crop_mode=False``.

    Returns:
        ``None`` when no boxes are detected or every line cleans to an empty
        string. Otherwise a 5-tuple ``(cleaned, markdown, raw, img_out,
        crops)``: joined text outputs, the first value returned by
        ``draw_bounding_boxes``, and a list of ``(crop, label)`` pairs.
    """
    line_boxes, detect_raw = _detect_equation_line_boxes(image, infer_crop_mode=infer_crop_mode)
    if not line_boxes:
        return None

    width, height = image.size
    math_delimiters = ("$$", "\\[", "\\(")

    plain_lines = []
    md_sections = []
    raw_sections = [f"## Detection\n\n{detect_raw}".strip()]
    ref_entries = []
    gallery = []

    for number, line_box in enumerate(line_boxes, 1):
        region = _norm_box_to_pixels(line_box, width, height, pad_ratio=0.01)
        snippet = image.crop(region)
        ocr_raw = _infer_with_prompt(snippet, EQUATION_LINE_OCR_PROMPT, crop_mode=False)
        ocr_text = clean_output(ocr_raw, False).strip()
        if not ocr_text:
            # Labels keep the detection index, so skipped lines leave gaps.
            continue

        tag = f"Eq {number}"
        # Wrap in display-math fences only when no math delimiter is present.
        if any(delim in ocr_text for delim in math_delimiters):
            rendered = ocr_text
        else:
            rendered = f"$$\n{ocr_text}\n$$"

        plain_lines.append(f"{tag}: {ocr_text}")
        md_sections.append(f"### {tag}\n\n{rendered}")
        raw_sections.append(f"## {tag}\n\n{ocr_raw}")

        coords_repr = repr([line_box])
        grounding_ref = f'<|ref|>eq_line_{number}<|/ref|><|det|>{coords_repr}<|/det|>'
        ref_entries.append((grounding_ref, tag, coords_repr))
        gallery.append((snippet, tag))

    if not plain_lines:
        return None

    annotated, _ = draw_bounding_boxes(image, ref_entries, extract_images=False)
    return (
        "\n".join(plain_lines).strip(),
        "\n\n".join(md_sections).strip(),
        "\n\n".join(raw_sections).strip(),
        annotated,
        gallery,
    )
|
| 778 |
+
|
| 779 |
@spaces.GPU(duration=90)
|
| 780 |
+
def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
|
| 781 |
model.cuda() # GPU is available here — works on ZeroGPU and locally
|
| 782 |
if image is None:
|
| 783 |
return "Error: Upload an image", "", "", None, []
|
| 784 |
+
if not separate_equation_lines and task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
|
| 785 |
return "Please enter a prompt", "", "", None, []
|
| 786 |
|
| 787 |
if image.mode in ('RGBA', 'LA', 'P'):
|
| 788 |
image = image.convert('RGB')
|
| 789 |
image = ImageOps.exif_transpose(image)
|
| 790 |
+
|
| 791 |
+
if separate_equation_lines:
|
| 792 |
+
separate_result = _process_equation_lines_separately(image, infer_crop_mode=infer_crop_mode)
|
| 793 |
+
if separate_result is not None:
|
| 794 |
+
return separate_result
|
| 795 |
+
msg = "No separate equation lines detected. Try Selected Region + freehand highlight around the equation steps."
|
| 796 |
+
return msg, msg, msg, None, []
|
| 797 |
|
| 798 |
if task == "✏️ Custom":
|
| 799 |
prompt = f"<image>\n{custom_prompt.strip()}"
|
|
|
|
| 831 |
return cleaned, markdown, result_for_layout, img_out, crops
|
| 832 |
|
| 833 |
@spaces.GPU(duration=90)
|
| 834 |
+
def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
|
| 835 |
doc = fitz.open(path)
|
| 836 |
total_pages = len(doc)
|
| 837 |
if page_num < 1 or page_num > total_pages:
|
|
|
|
| 848 |
custom_prompt,
|
| 849 |
enable_equation_zoom=enable_equation_zoom,
|
| 850 |
infer_crop_mode=infer_crop_mode,
|
| 851 |
+
separate_equation_lines=separate_equation_lines,
|
| 852 |
)
|
| 853 |
|
| 854 |
+
def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None, separate_equation_lines=False):
|
| 855 |
if not path:
|
| 856 |
return "Error: Upload a file", "", "", None, []
|
| 857 |
if path.lower().endswith('.pdf'):
|
|
|
|
| 862 |
page_num,
|
| 863 |
enable_equation_zoom=enable_equation_zoom,
|
| 864 |
infer_crop_mode=infer_crop_mode,
|
| 865 |
+
separate_equation_lines=separate_equation_lines,
|
| 866 |
)
|
| 867 |
else:
|
| 868 |
return process_image(
|
|
|
|
| 871 |
custom_prompt,
|
| 872 |
enable_equation_zoom=enable_equation_zoom,
|
| 873 |
infer_crop_mode=infer_crop_mode,
|
| 874 |
+
separate_equation_lines=separate_equation_lines,
|
| 875 |
)
|
| 876 |
|
| 877 |
def _extract_editor_background(editor_value):
|
|
|
|
| 1050 |
def _region_gallery_items(regions):
|
| 1051 |
return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
|
| 1052 |
|
| 1053 |
+
def _label_gallery_items(items, prefix=None):
|
| 1054 |
+
labeled = []
|
| 1055 |
+
for i, item in enumerate(items, 1):
|
| 1056 |
+
if isinstance(item, tuple) and len(item) >= 2:
|
| 1057 |
+
img, label = item[0], str(item[1])
|
| 1058 |
+
else:
|
| 1059 |
+
img, label = item, f"Item {i}"
|
| 1060 |
+
if prefix:
|
| 1061 |
+
label = f"{prefix} - {label}"
|
| 1062 |
+
labeled.append((img, label))
|
| 1063 |
+
return labeled
|
| 1064 |
+
|
| 1065 |
def _reset_selected_regions():
|
| 1066 |
return [], [], "No saved regions."
|
| 1067 |
|
|
|
|
| 1193 |
except TypeError:
|
| 1194 |
editor_kwargs["eraser"] = gr.Eraser()
|
| 1195 |
region_editor = gr.ImageEditor(
|
| 1196 |
+
label="Main image workspace. Recommended: freehand/highlight the target area, then click Add Region. (Crop tool for rectangles is optional.)",
|
| 1197 |
type="pil",
|
| 1198 |
height=300,
|
| 1199 |
**editor_kwargs,
|
|
|
|
| 1210 |
selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
|
| 1211 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 1212 |
equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
|
| 1213 |
+
separate_eq_lines = gr.Checkbox(label="Detect Equation Lines Separately", value=False)
|
| 1214 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 1215 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 1216 |
|
|
|
|
| 1267 |
- `Entire Page` for the full page.
|
| 1268 |
- `Selected Region` for a specific area.
|
| 1269 |
3. For `Selected Region`, use the **Image Workspace**:
|
| 1270 |
+
- Recommended: freehand selection (draw/highlight target); app uses an automatic bounding box around your marks.
|
| 1271 |
+
- Optional rectangle selection: use the **Crop** tool.
|
| 1272 |
- Freehand/highlight ink is semi-transparent so underlying content stays visible.
|
| 1273 |
- Optional multi-select: click **Add Region** after each selection.
|
| 1274 |
Then click **Extract**.
|
|
|
|
| 1284 |
- **Region selection**: Use **Input Scope=Selected Region**, draw/crop in the Image Workspace, then click **Extract**
|
| 1285 |
- **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the workspace crop as main input)
|
| 1286 |
- **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
|
| 1287 |
+
- **Detect Equation Lines Separately**: Detects likely equation-line boxes and OCRs each line independently to reduce merged multi-step equations.
|
| 1288 |
|
| 1289 |
### Free OCR vs Locate (important)
|
| 1290 |
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
|
|
|
|
| 1318 |
outputs=[selected_regions_state, selected_regions_gallery, selection_status],
|
| 1319 |
)
|
| 1320 |
|
| 1321 |
+
def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, detect_eq_lines, scope, region_value, base_size, base_image, selected_regions):
|
| 1322 |
if scope == "Selected Region":
|
| 1323 |
regions = list(selected_regions or [])
|
| 1324 |
if not regions:
|
|
|
|
| 1331 |
cleaned_parts = []
|
| 1332 |
markdown_parts = []
|
| 1333 |
raw_parts = []
|
| 1334 |
+
line_crops = []
|
| 1335 |
for i, r in enumerate(regions, 1):
|
| 1336 |
+
cleaned_i, markdown_i, raw_i, _, crops_i = process_image(
|
| 1337 |
r["image"],
|
| 1338 |
task,
|
| 1339 |
custom_prompt,
|
| 1340 |
enable_equation_zoom=enable_equation_zoom,
|
| 1341 |
infer_crop_mode=False,
|
| 1342 |
+
separate_equation_lines=detect_eq_lines,
|
| 1343 |
)
|
| 1344 |
if len(regions) > 1:
|
| 1345 |
cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
|
|
|
|
| 1349 |
cleaned_parts.append(cleaned_i)
|
| 1350 |
markdown_parts.append(markdown_i)
|
| 1351 |
raw_parts.append(raw_i)
|
| 1352 |
+
if detect_eq_lines and crops_i:
|
| 1353 |
+
line_crops.extend(_label_gallery_items(crops_i, prefix=f"Region {i}" if len(regions) > 1 else None))
|
| 1354 |
|
| 1355 |
cleaned = "\n\n".join(cleaned_parts).strip()
|
| 1356 |
markdown = "\n\n".join(markdown_parts).strip()
|
| 1357 |
raw = "\n\n".join(raw_parts).strip()
|
| 1358 |
+
crops = line_crops if line_crops else _region_gallery_items(regions)
|
| 1359 |
full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
|
| 1360 |
region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
|
| 1361 |
img_out = _draw_selected_region_boxes(full_img, region_boxes)
|
|
|
|
| 1365 |
task,
|
| 1366 |
custom_prompt,
|
| 1367 |
enable_equation_zoom=enable_equation_zoom,
|
| 1368 |
+
separate_equation_lines=detect_eq_lines,
|
| 1369 |
)
|
| 1370 |
elif file_path:
|
| 1371 |
cleaned, markdown, raw, img_out, crops = process_file(
|
|
|
|
| 1374 |
custom_prompt,
|
| 1375 |
int(page_num),
|
| 1376 |
enable_equation_zoom=enable_equation_zoom,
|
| 1377 |
+
separate_equation_lines=detect_eq_lines,
|
| 1378 |
)
|
| 1379 |
else:
|
| 1380 |
msg = "Error: Upload a file or image"
|
|
|
|
| 1384 |
|
| 1385 |
submit_event = btn.click(
|
| 1386 |
run,
|
| 1387 |
+
[file_in, task, prompt, page_selector, equation_zoom, separate_eq_lines, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
|
| 1388 |
[text_out, md_out, raw_out, img_out, gallery, download_btn]
|
| 1389 |
)
|
| 1390 |
submit_event.then(select_boxes, [task], [tabs])
|