Spaces:
Running on Zero
Running on Zero
Add region OCR UI and clarify Free OCR vs Locate behavior
Browse files
app.py
CHANGED
|
@@ -18,6 +18,8 @@ import latex2mathml.converter
|
|
| 18 |
|
| 19 |
from io import StringIO, BytesIO
|
| 20 |
|
|
|
|
|
|
|
| 21 |
# Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
|
| 22 |
#
|
| 23 |
# Full precision BF16 (~8GB VRAM) — original, highest accuracy
|
|
@@ -745,6 +747,27 @@ def process_file(path, task, custom_prompt, page_num):
|
|
| 745 |
else:
|
| 746 |
return process_image(Image.open(path), task, custom_prompt)
|
| 747 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 748 |
def toggle_prompt(task):
|
| 749 |
if task == "✏️ Custom":
|
| 750 |
return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
|
|
@@ -798,6 +821,8 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 798 |
This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
|
| 799 |
""")
|
| 800 |
|
|
|
|
|
|
|
| 801 |
with gr.Row():
|
| 802 |
with gr.Column(scale=1):
|
| 803 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
|
@@ -806,6 +831,16 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 806 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 807 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 808 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
|
| 810 |
with gr.Column(scale=2):
|
| 811 |
with gr.Tabs() as tabs:
|
|
@@ -825,6 +860,9 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 825 |
gallery = gr.Gallery(show_label=False, columns=3, height=400)
|
| 826 |
with gr.Tab("Raw Text", id="tab_raw"):
|
| 827 |
raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
|
|
|
|
|
|
|
|
|
|
| 828 |
download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
|
| 829 |
|
| 830 |
with gr.Accordion("Image Examples", open=True):
|
|
@@ -854,10 +892,17 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 854 |
|
| 855 |
### Tasks
|
| 856 |
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
|
| 857 |
-
- **Free OCR**:
|
| 858 |
-
- **Locate**: Find and highlight specific text
|
| 859 |
- **Describe**: General image description
|
| 860 |
- **Custom**: Your own prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
|
| 862 |
### Special Tokens
|
| 863 |
- `<image>` - Placeholder where visual tokens are inserted
|
|
@@ -871,6 +916,11 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 871 |
page_selector.change(load_image, [file_in, page_selector], [input_img])
|
| 872 |
task.change(toggle_prompt, [task], [prompt])
|
| 873 |
task.change(select_boxes, [task], [tabs])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 874 |
|
| 875 |
def run(image, file_path, task, custom_prompt, page_num):
|
| 876 |
if file_path:
|
|
|
|
| 18 |
|
| 19 |
from io import StringIO, BytesIO
|
| 20 |
|
| 21 |
+
HAS_IMAGE_EDITOR = hasattr(gr, "ImageEditor")
|
| 22 |
+
|
| 23 |
# Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
|
| 24 |
#
|
| 25 |
# Full precision BF16 (~8GB VRAM) — original, highest accuracy
|
|
|
|
| 747 |
else:
|
| 748 |
return process_image(Image.open(path), task, custom_prompt)
|
| 749 |
|
| 750 |
+
def _extract_editor_image(editor_value):
|
| 751 |
+
if editor_value is None:
|
| 752 |
+
return None
|
| 753 |
+
if isinstance(editor_value, Image.Image):
|
| 754 |
+
return editor_value
|
| 755 |
+
if isinstance(editor_value, dict):
|
| 756 |
+
composite = editor_value.get("composite")
|
| 757 |
+
if isinstance(composite, Image.Image):
|
| 758 |
+
return composite
|
| 759 |
+
background = editor_value.get("background")
|
| 760 |
+
if isinstance(background, Image.Image):
|
| 761 |
+
return background
|
| 762 |
+
return None
|
| 763 |
+
|
| 764 |
+
def process_region_ocr(editor_value):
    """Run the "Free OCR" task on the image held by the region editor.

    Args:
        editor_value: The region editor's current value; it is passed to
            ``_extract_editor_image`` to obtain the image to process.

    Returns:
        A ``(text, html)`` pair. When no image can be extracted, ``text`` is
        a hint telling the user to draw/crop a region first and ``html`` is
        an empty string. Otherwise ``text`` is the first value returned by
        ``process_image`` and ``html`` is its second value (the markdown)
        passed through ``to_math_html``.
    """
    image = _extract_editor_image(editor_value)
    if image is None:
        return "Draw/crop a region first, then click OCR Region.", ""

    # process_image yields five values; only the first two are used here.
    text, markdown, _, _, _ = process_image(image, "📝 Free OCR", "")
    return text, to_math_html(markdown)
|
| 770 |
+
|
| 771 |
def toggle_prompt(task):
|
| 772 |
if task == "✏️ Custom":
|
| 773 |
return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
|
|
|
|
| 821 |
This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
|
| 822 |
""")
|
| 823 |
|
| 824 |
+
region_editor = None
|
| 825 |
+
region_btn = None
|
| 826 |
with gr.Row():
|
| 827 |
with gr.Column(scale=1):
|
| 828 |
file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
|
|
|
|
| 831 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 832 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 833 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 834 |
+
with gr.Accordion("Region OCR (Draw/Crop)", open=False):
|
| 835 |
+
if HAS_IMAGE_EDITOR:
|
| 836 |
+
region_editor = gr.ImageEditor(
|
| 837 |
+
label="Draw a box and crop to the target area, then click OCR Region",
|
| 838 |
+
type="pil",
|
| 839 |
+
height=300,
|
| 840 |
+
)
|
| 841 |
+
region_btn = gr.Button("OCR Region", variant="secondary")
|
| 842 |
+
else:
|
| 843 |
+
gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
|
| 844 |
|
| 845 |
with gr.Column(scale=2):
|
| 846 |
with gr.Tabs() as tabs:
|
|
|
|
| 860 |
gallery = gr.Gallery(show_label=False, columns=3, height=400)
|
| 861 |
with gr.Tab("Raw Text", id="tab_raw"):
|
| 862 |
raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
|
| 863 |
+
with gr.Tab("Region OCR", id="tab_region"):
|
| 864 |
+
region_text_out = gr.Textbox(lines=12, buttons=["copy"], label="Region OCR Text")
|
| 865 |
+
region_html_out = gr.HTML("")
|
| 866 |
download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
|
| 867 |
|
| 868 |
with gr.Accordion("Image Examples", open=True):
|
|
|
|
| 892 |
|
| 893 |
### Tasks
|
| 894 |
- **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
|
| 895 |
+
- **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
|
| 896 |
+
- **Locate**: Find and highlight where specific text appears (grounding ✅)
|
| 897 |
- **Describe**: General image description
|
| 898 |
- **Custom**: Your own prompt
|
| 899 |
+
- **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
|
| 900 |
+
|
| 901 |
+
### Free OCR vs Locate (important)
|
| 902 |
+
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
|
| 903 |
+
- If you want OCR for one area only, crop that area first, then run **Free OCR** on the cropped image.
|
| 904 |
+
- If you want to keep the full page but highlight where text appears, use **Locate** and enter the text to search.
|
| 905 |
+
- For advanced region workflows, use **Custom** with `<|grounding|>` in the prompt.
|
| 906 |
|
| 907 |
### Special Tokens
|
| 908 |
- `<image>` - Placeholder where visual tokens are inserted
|
|
|
|
| 916 |
page_selector.change(load_image, [file_in, page_selector], [input_img])
|
| 917 |
task.change(toggle_prompt, [task], [prompt])
|
| 918 |
task.change(select_boxes, [task], [tabs])
|
| 919 |
+
if HAS_IMAGE_EDITOR and region_editor is not None and region_btn is not None:
|
| 920 |
+
file_in.change(load_image, [file_in, page_selector], [region_editor])
|
| 921 |
+
page_selector.change(load_image, [file_in, page_selector], [region_editor])
|
| 922 |
+
input_img.change(lambda img: img, [input_img], [region_editor])
|
| 923 |
+
region_btn.click(process_region_ocr, [region_editor], [region_text_out, region_html_out])
|
| 924 |
|
| 925 |
def run(image, file_path, task, custom_prompt, page_num):
|
| 926 |
if file_path:
|