Spaces:
Running on Zero
Running on Zero
Label selected regions across boxes and cropped outputs
Browse files
app.py
CHANGED
|
@@ -804,7 +804,66 @@ def _to_rgba_image(obj):
|
|
| 804 |
return Image.fromarray(arr.astype(np.uint8), mode="RGBA")
|
| 805 |
return None
|
| 806 |
|
| 807 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
"""Extract a clean selected region from ImageEditor data.
|
| 809 |
|
| 810 |
Strategy:
|
|
@@ -815,10 +874,11 @@ def _extract_selected_region(editor_value, base_size=None):
|
|
| 815 |
return None
|
| 816 |
if isinstance(editor_value, Image.Image):
|
| 817 |
if base_size and tuple(editor_value.size) == tuple(base_size):
|
| 818 |
-
return None
|
| 819 |
-
|
|
|
|
| 820 |
if not isinstance(editor_value, dict):
|
| 821 |
-
return None
|
| 822 |
|
| 823 |
background = _to_rgba_image(editor_value.get("background"))
|
| 824 |
composite = _to_rgba_image(editor_value.get("composite"))
|
|
@@ -826,14 +886,16 @@ def _extract_selected_region(editor_value, base_size=None):
|
|
| 826 |
|
| 827 |
if background is None:
|
| 828 |
if composite is None:
|
| 829 |
-
return None
|
| 830 |
background = composite
|
| 831 |
|
| 832 |
if not isinstance(layers, list) or not layers:
|
| 833 |
# No annotation layers; treat as explicit crop only if size changed from base.
|
| 834 |
if base_size and tuple(background.size) == tuple(base_size):
|
| 835 |
-
return None
|
| 836 |
-
|
|
|
|
|
|
|
| 837 |
|
| 838 |
alpha_acc = np.zeros((background.height, background.width), dtype=np.uint8)
|
| 839 |
for layer in layers:
|
|
@@ -848,7 +910,7 @@ def _extract_selected_region(editor_value, base_size=None):
|
|
| 848 |
|
| 849 |
ys, xs = np.where(alpha_acc > 0)
|
| 850 |
if xs.size == 0 or ys.size == 0:
|
| 851 |
-
return None
|
| 852 |
|
| 853 |
x1, y1 = int(xs.min()), int(ys.min())
|
| 854 |
x2, y2 = int(xs.max()) + 1, int(ys.max()) + 1
|
|
@@ -859,9 +921,46 @@ def _extract_selected_region(editor_value, base_size=None):
|
|
| 859 |
x2 = min(background.width, x2 + pad_x)
|
| 860 |
y2 = min(background.height, y2 + pad_y)
|
| 861 |
if x2 <= x1 or y2 <= y1:
|
| 862 |
-
return None
|
| 863 |
|
| 864 |
-
return background.crop((x1, y1, x2, y2)).convert("RGB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 865 |
|
| 866 |
def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
|
| 867 |
text_display = re.sub(
|
|
@@ -877,16 +976,9 @@ def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
|
|
| 877 |
dl_tmp.close()
|
| 878 |
|
| 879 |
markdown_html = to_math_html(markdown)
|
| 880 |
-
mathjax_html = to_mathjax_html(markdown)
|
| 881 |
-
spatial_html = to_spatial_html(raw, markdown)
|
| 882 |
-
|
| 883 |
return (
|
| 884 |
text_display,
|
| 885 |
markdown_html,
|
| 886 |
-
mathjax_html,
|
| 887 |
-
mathjax_html,
|
| 888 |
-
spatial_html,
|
| 889 |
-
spatial_html,
|
| 890 |
raw,
|
| 891 |
img_out,
|
| 892 |
gallery_items,
|
|
@@ -930,8 +1022,8 @@ def load_image(file_path, page_num=1):
|
|
| 930 |
def load_image_with_size(file_path, page_num=1):
|
| 931 |
img = load_image(file_path, page_num)
|
| 932 |
if img is None:
|
| 933 |
-
return None, None
|
| 934 |
-
return img, (int(img.width), int(img.height))
|
| 935 |
|
| 936 |
def update_page_selector(file_path):
|
| 937 |
if not file_path:
|
|
@@ -954,25 +1046,12 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 954 |
|
| 955 |
region_editor = None
|
| 956 |
workspace_base_size = gr.State(None)
|
|
|
|
|
|
|
| 957 |
with gr.Row():
|
| 958 |
with gr.Column(scale=1):
|
| 959 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
| 960 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
| 961 |
-
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 962 |
-
input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope")
|
| 963 |
-
equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
|
| 964 |
-
gr.Markdown(
|
| 965 |
-
"""
|
| 966 |
-
**Quick use**
|
| 967 |
-
1. Load a page/image into the workspace below.
|
| 968 |
-
2. `Entire Page`: click **Extract**.
|
| 969 |
-
3. `Selected Region`: use the **Crop** tool for a rectangle selection, or draw/highlight freehand; then click **Extract**.
|
| 970 |
-
4. Freehand/highlight uses semi-transparent blue ink so text stays visible.
|
| 971 |
-
5. Check **Cropped Images** to confirm the selected region used for OCR.
|
| 972 |
-
"""
|
| 973 |
-
)
|
| 974 |
-
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 975 |
-
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 976 |
gr.Markdown("**Image Workspace (full page + region selection)**")
|
| 977 |
if HAS_IMAGE_EDITOR:
|
| 978 |
editor_kwargs = {}
|
|
@@ -1006,6 +1085,17 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1006 |
else:
|
| 1007 |
gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
|
| 1008 |
region_editor = gr.State(None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1009 |
|
| 1010 |
with gr.Column(scale=2):
|
| 1011 |
with gr.Tabs() as tabs:
|
|
@@ -1013,12 +1103,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1013 |
text_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
|
| 1014 |
with gr.Tab("Markdown Preview", id="tab_markdown"):
|
| 1015 |
md_out = gr.HTML("")
|
| 1016 |
-
with gr.Tab("HTML + MathJax", id="tab_html"):
|
| 1017 |
-
html_out = gr.HTML("")
|
| 1018 |
-
html_source_out = gr.Code(label="Generated HTML Source", language="html", lines=16)
|
| 1019 |
-
with gr.Tab("Spatial HTML", id="tab_spatial"):
|
| 1020 |
-
spatial_out = gr.HTML("")
|
| 1021 |
-
spatial_source_out = gr.Code(label="Spatial HTML Source", language="html", lines=16)
|
| 1022 |
with gr.Tab("Boxes", id="tab_boxes"):
|
| 1023 |
img_out = gr.Image(type="pil", height=500, show_label=False)
|
| 1024 |
with gr.Tab("Cropped Images", id="tab_crops"):
|
|
@@ -1028,15 +1112,23 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1028 |
download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
|
| 1029 |
|
| 1030 |
with gr.Accordion("Image Examples", open=True):
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1040 |
|
| 1041 |
with gr.Accordion("PDF Examples", open=True):
|
| 1042 |
gr.Examples(
|
|
@@ -1061,8 +1153,10 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1061 |
- Rectangle selection: use the **Crop** tool.
|
| 1062 |
- Freehand selection: draw/highlight the target; app uses an automatic bounding box around your marks.
|
| 1063 |
- Freehand/highlight ink is semi-transparent so underlying content stays visible.
|
|
|
|
| 1064 |
Then click **Extract**.
|
| 1065 |
-
4.
|
|
|
|
| 1066 |
|
| 1067 |
### Tasks
|
| 1068 |
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
|
|
@@ -1091,24 +1185,58 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1091 |
task.change(toggle_prompt, [task], [prompt])
|
| 1092 |
task.change(select_boxes, [task], [tabs])
|
| 1093 |
if HAS_IMAGE_EDITOR and region_editor is not None:
|
| 1094 |
-
file_in.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size])
|
| 1095 |
-
page_selector.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1096 |
|
| 1097 |
-
def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size):
|
| 1098 |
-
selected_region = None
|
| 1099 |
if scope == "Selected Region":
|
| 1100 |
-
|
| 1101 |
-
if
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1112 |
elif (full_image := _extract_editor_background(region_value)) is not None:
|
| 1113 |
cleaned, markdown, raw, img_out, crops = process_image(
|
| 1114 |
full_image,
|
|
@@ -1126,14 +1254,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 1126 |
)
|
| 1127 |
else:
|
| 1128 |
msg = "Error: Upload a file or image"
|
| 1129 |
-
return (msg, "", "",
|
| 1130 |
|
| 1131 |
return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
|
| 1132 |
|
| 1133 |
submit_event = btn.click(
|
| 1134 |
run,
|
| 1135 |
-
[file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size],
|
| 1136 |
-
[text_out, md_out,
|
| 1137 |
)
|
| 1138 |
submit_event.then(select_boxes, [task], [tabs])
|
| 1139 |
|
|
|
|
| 804 |
return Image.fromarray(arr.astype(np.uint8), mode="RGBA")
|
| 805 |
return None
|
| 806 |
|
| 807 |
+
def _locate_patch_bbox(base_image: Image.Image, patch_image: Image.Image):
|
| 808 |
+
"""Approximate patch location in base image using downscaled SSD search."""
|
| 809 |
+
if base_image is None or patch_image is None:
|
| 810 |
+
return None
|
| 811 |
+
base = np.asarray(base_image.convert("L"), dtype=np.float32)
|
| 812 |
+
patch = np.asarray(patch_image.convert("L"), dtype=np.float32)
|
| 813 |
+
bh, bw = base.shape[:2]
|
| 814 |
+
ph, pw = patch.shape[:2]
|
| 815 |
+
if ph <= 0 or pw <= 0 or ph > bh or pw > bw:
|
| 816 |
+
return None
|
| 817 |
+
|
| 818 |
+
max_dim = max(bh, bw)
|
| 819 |
+
scale = min(1.0, 320.0 / max_dim) if max_dim > 0 else 1.0
|
| 820 |
+
if scale < 1.0:
|
| 821 |
+
new_bw = max(1, int(round(bw * scale)))
|
| 822 |
+
new_bh = max(1, int(round(bh * scale)))
|
| 823 |
+
new_pw = max(1, int(round(pw * scale)))
|
| 824 |
+
new_ph = max(1, int(round(ph * scale)))
|
| 825 |
+
base_small = np.asarray(Image.fromarray(base.astype(np.uint8)).resize((new_bw, new_bh), Image.Resampling.BILINEAR), dtype=np.float32)
|
| 826 |
+
patch_small = np.asarray(Image.fromarray(patch.astype(np.uint8)).resize((new_pw, new_ph), Image.Resampling.BILINEAR), dtype=np.float32)
|
| 827 |
+
else:
|
| 828 |
+
base_small = base
|
| 829 |
+
patch_small = patch
|
| 830 |
+
|
| 831 |
+
sbh, sbw = base_small.shape
|
| 832 |
+
sph, spw = patch_small.shape
|
| 833 |
+
if sph > sbh or spw > sbw:
|
| 834 |
+
return None
|
| 835 |
+
|
| 836 |
+
best_score = float("inf")
|
| 837 |
+
best_x = 0
|
| 838 |
+
best_y = 0
|
| 839 |
+
for y in range(sbh - sph + 1):
|
| 840 |
+
row = base_small[y:y + sph, :]
|
| 841 |
+
windows = np.lib.stride_tricks.sliding_window_view(row, spw, axis=1)
|
| 842 |
+
# windows: (sph, sbw-spw+1, spw)
|
| 843 |
+
diff = windows - patch_small[:, None, :]
|
| 844 |
+
scores = np.mean(diff * diff, axis=(0, 2))
|
| 845 |
+
x = int(np.argmin(scores))
|
| 846 |
+
score = float(scores[x])
|
| 847 |
+
if score < best_score:
|
| 848 |
+
best_score = score
|
| 849 |
+
best_x = x
|
| 850 |
+
best_y = y
|
| 851 |
+
|
| 852 |
+
if scale < 1.0:
|
| 853 |
+
x1 = int(round(best_x / scale))
|
| 854 |
+
y1 = int(round(best_y / scale))
|
| 855 |
+
x2 = int(round((best_x + spw) / scale))
|
| 856 |
+
y2 = int(round((best_y + sph) / scale))
|
| 857 |
+
else:
|
| 858 |
+
x1, y1, x2, y2 = best_x, best_y, best_x + spw, best_y + sph
|
| 859 |
+
|
| 860 |
+
x1 = max(0, min(bw - 1, x1))
|
| 861 |
+
y1 = max(0, min(bh - 1, y1))
|
| 862 |
+
x2 = max(x1 + 1, min(bw, x2))
|
| 863 |
+
y2 = max(y1 + 1, min(bh, y2))
|
| 864 |
+
return (x1, y1, x2, y2)
|
| 865 |
+
|
| 866 |
+
def _extract_selected_region(editor_value, base_size=None, base_image=None):
|
| 867 |
"""Extract a clean selected region from ImageEditor data.
|
| 868 |
|
| 869 |
Strategy:
|
|
|
|
| 874 |
return None
|
| 875 |
if isinstance(editor_value, Image.Image):
|
| 876 |
if base_size and tuple(editor_value.size) == tuple(base_size):
|
| 877 |
+
return None, None
|
| 878 |
+
bbox = _locate_patch_bbox(base_image, editor_value) if base_image is not None else None
|
| 879 |
+
return editor_value, bbox
|
| 880 |
if not isinstance(editor_value, dict):
|
| 881 |
+
return None, None
|
| 882 |
|
| 883 |
background = _to_rgba_image(editor_value.get("background"))
|
| 884 |
composite = _to_rgba_image(editor_value.get("composite"))
|
|
|
|
| 886 |
|
| 887 |
if background is None:
|
| 888 |
if composite is None:
|
| 889 |
+
return None, None
|
| 890 |
background = composite
|
| 891 |
|
| 892 |
if not isinstance(layers, list) or not layers:
|
| 893 |
# No annotation layers; treat as explicit crop only if size changed from base.
|
| 894 |
if base_size and tuple(background.size) == tuple(base_size):
|
| 895 |
+
return None, None
|
| 896 |
+
patch = background.convert("RGB")
|
| 897 |
+
bbox = _locate_patch_bbox(base_image, patch) if base_image is not None else None
|
| 898 |
+
return patch, bbox
|
| 899 |
|
| 900 |
alpha_acc = np.zeros((background.height, background.width), dtype=np.uint8)
|
| 901 |
for layer in layers:
|
|
|
|
| 910 |
|
| 911 |
ys, xs = np.where(alpha_acc > 0)
|
| 912 |
if xs.size == 0 or ys.size == 0:
|
| 913 |
+
return None, None
|
| 914 |
|
| 915 |
x1, y1 = int(xs.min()), int(ys.min())
|
| 916 |
x2, y2 = int(xs.max()) + 1, int(ys.max()) + 1
|
|
|
|
| 921 |
x2 = min(background.width, x2 + pad_x)
|
| 922 |
y2 = min(background.height, y2 + pad_y)
|
| 923 |
if x2 <= x1 or y2 <= y1:
|
| 924 |
+
return None, None
|
| 925 |
|
| 926 |
+
return background.crop((x1, y1, x2, y2)).convert("RGB"), (x1, y1, x2, y2)
|
| 927 |
+
|
| 928 |
+
def _draw_selected_region_boxes(image, boxes):
|
| 929 |
+
if image is None or not boxes:
|
| 930 |
+
return None
|
| 931 |
+
refs = []
|
| 932 |
+
w, h = image.size
|
| 933 |
+
for i, b in enumerate(boxes, 1):
|
| 934 |
+
x1, y1, x2, y2 = b
|
| 935 |
+
nx1 = max(0.0, min(999.0, x1 / max(1, w) * 999.0))
|
| 936 |
+
ny1 = max(0.0, min(999.0, y1 / max(1, h) * 999.0))
|
| 937 |
+
nx2 = max(0.0, min(999.0, x2 / max(1, w) * 999.0))
|
| 938 |
+
ny2 = max(0.0, min(999.0, y2 / max(1, h) * 999.0))
|
| 939 |
+
label = f"Region {i}"
|
| 940 |
+
coord_text = repr([[nx1, ny1, nx2, ny2]])
|
| 941 |
+
raw = f'<|ref|>region_{i}<|/ref|><|det|>{coord_text}<|/det|>'
|
| 942 |
+
refs.append((raw, label, coord_text))
|
| 943 |
+
img_out, _ = draw_bounding_boxes(image, refs, extract_images=False)
|
| 944 |
+
return img_out
|
| 945 |
+
|
| 946 |
+
def _region_gallery_items(regions):
|
| 947 |
+
return [(r["image"], f"Region {i}") for i, r in enumerate(regions, 1)]
|
| 948 |
+
|
| 949 |
+
def _reset_selected_regions():
|
| 950 |
+
return [], [], "No saved regions."
|
| 951 |
+
|
| 952 |
+
def add_selected_region(editor_value, base_size, base_image, selected_regions):
    """Append the editor's current selection to the saved-region list.

    Args:
        editor_value: Current value of the image editor component.
        base_size: Size of the loaded page, forwarded to the extractor.
        base_image: The loaded page image, forwarded to the extractor.
        selected_regions: Previously saved region dicts, or ``None``.

    Returns:
        ``(regions, gallery_items, status_message)``. When no region could be
        extracted the existing regions are returned unchanged together with a
        hint message; otherwise a new list including the added region.
    """
    extracted = _extract_selected_region(editor_value, base_size=base_size, base_image=base_image)
    # _extract_selected_region still has an early exit that returns a bare
    # None instead of a (image, bbox) pair; normalize so unpacking cannot
    # raise TypeError on that path.
    region_img, bbox = extracted if extracted is not None else (None, None)

    if region_img is None:
        msg = "No region detected. Use Crop or draw/highlight a region first."
        regions = selected_regions or []
        return regions, _region_gallery_items(regions), msg

    # Copy so the incoming state list is never mutated in place.
    regions = list(selected_regions or [])
    regions.append({"image": region_img, "bbox": bbox})
    return regions, _region_gallery_items(regions), f"{len(regions)} region(s) saved."
|
| 961 |
+
|
| 962 |
+
def clear_selected_regions():
    """Click handler for "Clear Regions": hand back the empty selection state."""
    regions, gallery_items, status = _reset_selected_regions()
    return regions, gallery_items, status
|
| 964 |
|
| 965 |
def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
|
| 966 |
text_display = re.sub(
|
|
|
|
| 976 |
dl_tmp.close()
|
| 977 |
|
| 978 |
markdown_html = to_math_html(markdown)
|
|
|
|
|
|
|
|
|
|
| 979 |
return (
|
| 980 |
text_display,
|
| 981 |
markdown_html,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 982 |
raw,
|
| 983 |
img_out,
|
| 984 |
gallery_items,
|
|
|
|
| 1022 |
def load_image_with_size(file_path, page_num=1):
    """Load a page via ``load_image`` and report it with its pixel size.

    Returns ``(image, (width, height), image)``; the image appears twice
    because one copy feeds the editor and the other is kept as the untouched
    base image. All three values are ``None`` when loading yields nothing.
    """
    page_image = load_image(file_path, page_num)
    if page_image is not None:
        dimensions = (int(page_image.width), int(page_image.height))
        return page_image, dimensions, page_image
    return None, None, None
|
| 1027 |
|
| 1028 |
def update_page_selector(file_path):
|
| 1029 |
if not file_path:
|
|
|
|
| 1046 |
|
| 1047 |
region_editor = None
|
| 1048 |
workspace_base_size = gr.State(None)
|
| 1049 |
+
workspace_base_image = gr.State(None)
|
| 1050 |
+
selected_regions_state = gr.State([])
|
| 1051 |
with gr.Row():
|
| 1052 |
with gr.Column(scale=1):
|
| 1053 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
| 1054 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1055 |
gr.Markdown("**Image Workspace (full page + region selection)**")
|
| 1056 |
if HAS_IMAGE_EDITOR:
|
| 1057 |
editor_kwargs = {}
|
|
|
|
| 1085 |
else:
|
| 1086 |
gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
|
| 1087 |
region_editor = gr.State(None)
|
| 1088 |
+
input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope")
|
| 1089 |
+
selection_controls = gr.Row()
|
| 1090 |
+
with selection_controls:
|
| 1091 |
+
add_region_btn = gr.Button("Add Region", variant="secondary")
|
| 1092 |
+
clear_regions_btn = gr.Button("Clear Regions")
|
| 1093 |
+
selection_status = gr.Textbox(label="Region Selection Status", value="No saved regions.", interactive=False)
|
| 1094 |
+
selected_regions_gallery = gr.Gallery(label="Selected Regions", show_label=True, columns=3, height=170)
|
| 1095 |
+
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 1096 |
+
equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
|
| 1097 |
+
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 1098 |
+
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 1099 |
|
| 1100 |
with gr.Column(scale=2):
|
| 1101 |
with gr.Tabs() as tabs:
|
|
|
|
| 1103 |
text_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
|
| 1104 |
with gr.Tab("Markdown Preview", id="tab_markdown"):
|
| 1105 |
md_out = gr.HTML("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1106 |
with gr.Tab("Boxes", id="tab_boxes"):
|
| 1107 |
img_out = gr.Image(type="pil", height=500, show_label=False)
|
| 1108 |
with gr.Tab("Cropped Images", id="tab_crops"):
|
|
|
|
| 1112 |
download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
|
| 1113 |
|
| 1114 |
with gr.Accordion("Image Examples", open=True):
|
| 1115 |
+
image_examples = [
|
| 1116 |
+
["examples/2022-0922 Section 13 Notes.png", "📋 Markdown", ""],
|
| 1117 |
+
["examples/2022-0922 Section 14 Notes.png", "📋 Markdown", ""],
|
| 1118 |
+
["examples/2022-0922 Section 15 Notes.png", "📋 Markdown", ""],
|
| 1119 |
+
]
|
| 1120 |
+
if HAS_IMAGE_EDITOR and region_editor is not None:
|
| 1121 |
+
gr.Examples(
|
| 1122 |
+
examples=image_examples,
|
| 1123 |
+
inputs=[region_editor, task, prompt],
|
| 1124 |
+
cache_examples=False
|
| 1125 |
+
)
|
| 1126 |
+
else:
|
| 1127 |
+
gr.Examples(
|
| 1128 |
+
examples=image_examples,
|
| 1129 |
+
inputs=[file_in, task, prompt],
|
| 1130 |
+
cache_examples=False
|
| 1131 |
+
)
|
| 1132 |
|
| 1133 |
with gr.Accordion("PDF Examples", open=True):
|
| 1134 |
gr.Examples(
|
|
|
|
| 1153 |
- Rectangle selection: use the **Crop** tool.
|
| 1154 |
- Freehand selection: draw/highlight the target; app uses an automatic bounding box around your marks.
|
| 1155 |
- Freehand/highlight ink is semi-transparent so underlying content stays visible.
|
| 1156 |
+
- Optional multi-select: click **Add Region** after each selection.
|
| 1157 |
Then click **Extract**.
|
| 1158 |
+
4. Use **Clear Regions** to reset multi-select state.
|
| 1159 |
+
5. Review **Cropped Images** and **Boxes**: both are labeled `Region 1`, `Region 2`, etc.
|
| 1160 |
|
| 1161 |
### Tasks
|
| 1162 |
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
|
|
|
|
| 1185 |
task.change(toggle_prompt, [task], [prompt])
|
| 1186 |
task.change(select_boxes, [task], [tabs])
|
| 1187 |
if HAS_IMAGE_EDITOR and region_editor is not None:
|
| 1188 |
+
file_in.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
|
| 1189 |
+
page_selector.change(load_image_with_size, [file_in, page_selector], [region_editor, workspace_base_size, workspace_base_image])
|
| 1190 |
+
file_in.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
|
| 1191 |
+
page_selector.change(_reset_selected_regions, outputs=[selected_regions_state, selected_regions_gallery, selection_status])
|
| 1192 |
+
|
| 1193 |
+
add_region_btn.click(
|
| 1194 |
+
add_selected_region,
|
| 1195 |
+
[region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
|
| 1196 |
+
[selected_regions_state, selected_regions_gallery, selection_status],
|
| 1197 |
+
)
|
| 1198 |
+
clear_regions_btn.click(
|
| 1199 |
+
clear_selected_regions,
|
| 1200 |
+
outputs=[selected_regions_state, selected_regions_gallery, selection_status],
|
| 1201 |
+
)
|
| 1202 |
|
| 1203 |
+
def run(file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value, base_size, base_image, selected_regions):
|
|
|
|
| 1204 |
if scope == "Selected Region":
|
| 1205 |
+
regions = list(selected_regions or [])
|
| 1206 |
+
if not regions:
|
| 1207 |
+
selected_region, selected_bbox = _extract_selected_region(region_value, base_size=base_size, base_image=base_image)
|
| 1208 |
+
if selected_region is None:
|
| 1209 |
+
msg = "Select Input Scope=Selected Region, then crop or annotate a target area in the Image Workspace first."
|
| 1210 |
+
return (msg, "", "", None, [], gr.DownloadButton(visible=False))
|
| 1211 |
+
regions = [{"image": selected_region, "bbox": selected_bbox}]
|
| 1212 |
+
|
| 1213 |
+
cleaned_parts = []
|
| 1214 |
+
markdown_parts = []
|
| 1215 |
+
raw_parts = []
|
| 1216 |
+
for i, r in enumerate(regions, 1):
|
| 1217 |
+
cleaned_i, markdown_i, raw_i, _, _ = process_image(
|
| 1218 |
+
r["image"],
|
| 1219 |
+
task,
|
| 1220 |
+
custom_prompt,
|
| 1221 |
+
enable_equation_zoom=enable_equation_zoom,
|
| 1222 |
+
infer_crop_mode=False,
|
| 1223 |
+
)
|
| 1224 |
+
if len(regions) > 1:
|
| 1225 |
+
cleaned_parts.append(f"## Region {i}\n\n{cleaned_i}")
|
| 1226 |
+
markdown_parts.append(f"## Region {i}\n\n{markdown_i}")
|
| 1227 |
+
raw_parts.append(f"## Region {i}\n\n{raw_i}")
|
| 1228 |
+
else:
|
| 1229 |
+
cleaned_parts.append(cleaned_i)
|
| 1230 |
+
markdown_parts.append(markdown_i)
|
| 1231 |
+
raw_parts.append(raw_i)
|
| 1232 |
+
|
| 1233 |
+
cleaned = "\n\n".join(cleaned_parts).strip()
|
| 1234 |
+
markdown = "\n\n".join(markdown_parts).strip()
|
| 1235 |
+
raw = "\n\n".join(raw_parts).strip()
|
| 1236 |
+
crops = _region_gallery_items(regions)
|
| 1237 |
+
full_img = base_image if isinstance(base_image, Image.Image) else _extract_editor_background(region_value)
|
| 1238 |
+
region_boxes = [r["bbox"] for r in regions if r.get("bbox") is not None]
|
| 1239 |
+
img_out = _draw_selected_region_boxes(full_img, region_boxes)
|
| 1240 |
elif (full_image := _extract_editor_background(region_value)) is not None:
|
| 1241 |
cleaned, markdown, raw, img_out, crops = process_image(
|
| 1242 |
full_image,
|
|
|
|
| 1254 |
)
|
| 1255 |
else:
|
| 1256 |
msg = "Error: Upload a file or image"
|
| 1257 |
+
return (msg, "", "", None, [], gr.DownloadButton(visible=False))
|
| 1258 |
|
| 1259 |
return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
|
| 1260 |
|
| 1261 |
submit_event = btn.click(
|
| 1262 |
run,
|
| 1263 |
+
[file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor, workspace_base_size, workspace_base_image, selected_regions_state],
|
| 1264 |
+
[text_out, md_out, raw_out, img_out, gallery, download_btn]
|
| 1265 |
)
|
| 1266 |
submit_event.then(select_boxes, [task], [tabs])
|
| 1267 |
|