Spaces:
Running on Zero
Running on Zero
Make equation multipass optional and integrate region OCR across all outputs
Browse files
app.py
CHANGED
|
@@ -578,7 +578,9 @@ def embed_images(markdown, crops):
|
|
| 578 |
markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n\n\n', 1)
|
| 579 |
return markdown
|
| 580 |
|
| 581 |
-
def _infer_with_prompt(image, prompt):
|
|
|
|
|
|
|
| 582 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
|
| 583 |
image.save(tmp.name, 'JPEG', quality=95)
|
| 584 |
tmp.close()
|
|
@@ -595,7 +597,7 @@ def _infer_with_prompt(image, prompt):
|
|
| 595 |
output_path=out_dir,
|
| 596 |
base_size=BASE_SIZE,
|
| 597 |
image_size=IMAGE_SIZE,
|
| 598 |
-
crop_mode=
|
| 599 |
save_results=False
|
| 600 |
)
|
| 601 |
finally:
|
|
@@ -679,7 +681,7 @@ def _refine_equation_refs(image, raw_text):
|
|
| 679 |
return refined_refs
|
| 680 |
|
| 681 |
@spaces.GPU(duration=90)
|
| 682 |
-
def process_image(image, task, custom_prompt):
|
| 683 |
model.cuda() # GPU is available here — works on ZeroGPU and locally
|
| 684 |
if image is None:
|
| 685 |
return "Error: Upload an image", "", "", None, []
|
|
@@ -699,7 +701,7 @@ def process_image(image, task, custom_prompt):
|
|
| 699 |
else:
|
| 700 |
prompt = TASK_PROMPTS[task]["prompt"]
|
| 701 |
has_grounding = TASK_PROMPTS[task]["has_grounding"]
|
| 702 |
-
result = _infer_with_prompt(image, prompt)
|
| 703 |
|
| 704 |
if not result:
|
| 705 |
return "No text detected", "", "", None, []
|
|
@@ -713,7 +715,7 @@ def process_image(image, task, custom_prompt):
|
|
| 713 |
|
| 714 |
if has_grounding and '<|ref|>' in result:
|
| 715 |
refs = extract_grounding_references(result)
|
| 716 |
-
if task == "📋 Markdown":
|
| 717 |
refs.extend(_refine_equation_refs(image, result))
|
| 718 |
if refs:
|
| 719 |
img_out, crops = draw_bounding_boxes(image, refs, True)
|
|
@@ -726,7 +728,7 @@ def process_image(image, task, custom_prompt):
|
|
| 726 |
return cleaned, markdown, result_for_layout, img_out, crops
|
| 727 |
|
| 728 |
@spaces.GPU(duration=90)
|
| 729 |
-
def process_pdf(path, task, custom_prompt, page_num):
|
| 730 |
doc = fitz.open(path)
|
| 731 |
total_pages = len(doc)
|
| 732 |
if page_num < 1 or page_num > total_pages:
|
|
@@ -737,15 +739,34 @@ def process_pdf(path, task, custom_prompt, page_num):
|
|
| 737 |
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 738 |
doc.close()
|
| 739 |
|
| 740 |
-
return process_image(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
|
| 742 |
-
def process_file(path, task, custom_prompt, page_num):
|
| 743 |
if not path:
|
| 744 |
return "Error: Upload a file", "", "", None, []
|
| 745 |
if path.lower().endswith('.pdf'):
|
| 746 |
-
return process_pdf(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
else:
|
| 748 |
-
return process_image(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
|
| 750 |
def _extract_editor_image(editor_value):
|
| 751 |
if editor_value is None:
|
|
@@ -761,12 +782,77 @@ def _extract_editor_image(editor_value):
|
|
| 761 |
return background
|
| 762 |
return None
|
| 763 |
|
| 764 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 765 |
image = _extract_editor_image(editor_value)
|
| 766 |
if image is None:
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
|
| 771 |
def toggle_prompt(task):
|
| 772 |
if task == "✏️ Custom":
|
|
@@ -829,6 +915,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 829 |
input_img = gr.Image(label="Input Image", type="pil", height=300)
|
| 830 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
| 831 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
|
|
|
| 832 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 833 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 834 |
with gr.Accordion("Region OCR (Draw/Crop)", open=False):
|
|
@@ -897,6 +984,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 897 |
- **Describe**: General image description
|
| 898 |
- **Custom**: Your own prompt
|
| 899 |
- **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
|
|
|
|
| 900 |
|
| 901 |
### Free OCR vs Locate (important)
|
| 902 |
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
|
|
@@ -920,35 +1008,39 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 920 |
file_in.change(load_image, [file_in, page_selector], [region_editor])
|
| 921 |
page_selector.change(load_image, [file_in, page_selector], [region_editor])
|
| 922 |
input_img.change(lambda img: img, [input_img], [region_editor])
|
| 923 |
-
region_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 924 |
|
| 925 |
-
def run(image, file_path, task, custom_prompt, page_num):
|
| 926 |
if file_path:
|
| 927 |
-
cleaned, markdown, raw, img_out, crops = process_file(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 928 |
elif image is not None:
|
| 929 |
-
cleaned, markdown, raw, img_out, crops = process_image(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 930 |
else:
|
| 931 |
-
|
|
|
|
| 932 |
|
| 933 |
-
|
| 934 |
-
text_display = re.sub(r'\\\[(.+?)\\\]',
|
| 935 |
-
lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
|
| 936 |
-
cleaned, flags=re.DOTALL)
|
| 937 |
-
text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
|
| 938 |
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
mathjax_html = to_mathjax_html(markdown)
|
| 945 |
-
spatial_html = to_spatial_html(raw, markdown)
|
| 946 |
-
|
| 947 |
-
return (text_display, to_math_html(markdown), mathjax_html, mathjax_html, spatial_html, spatial_html, raw, img_out, crops,
|
| 948 |
-
gr.DownloadButton(value=dl_tmp.name, visible=True))
|
| 949 |
-
|
| 950 |
-
submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
|
| 951 |
-
[text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn])
|
| 952 |
submit_event.then(select_boxes, [task], [tabs])
|
| 953 |
|
| 954 |
if __name__ == "__main__":
|
|
|
|
| 578 |
markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n\n\n', 1)
|
| 579 |
return markdown
|
| 580 |
|
| 581 |
+
def _infer_with_prompt(image, prompt, crop_mode=None):
|
| 582 |
+
if crop_mode is None:
|
| 583 |
+
crop_mode = CROP_MODE
|
| 584 |
tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
|
| 585 |
image.save(tmp.name, 'JPEG', quality=95)
|
| 586 |
tmp.close()
|
|
|
|
| 597 |
output_path=out_dir,
|
| 598 |
base_size=BASE_SIZE,
|
| 599 |
image_size=IMAGE_SIZE,
|
| 600 |
+
crop_mode=crop_mode,
|
| 601 |
save_results=False
|
| 602 |
)
|
| 603 |
finally:
|
|
|
|
| 681 |
return refined_refs
|
| 682 |
|
| 683 |
@spaces.GPU(duration=90)
|
| 684 |
+
def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None):
|
| 685 |
model.cuda() # GPU is available here — works on ZeroGPU and locally
|
| 686 |
if image is None:
|
| 687 |
return "Error: Upload an image", "", "", None, []
|
|
|
|
| 701 |
else:
|
| 702 |
prompt = TASK_PROMPTS[task]["prompt"]
|
| 703 |
has_grounding = TASK_PROMPTS[task]["has_grounding"]
|
| 704 |
+
result = _infer_with_prompt(image, prompt, crop_mode=infer_crop_mode)
|
| 705 |
|
| 706 |
if not result:
|
| 707 |
return "No text detected", "", "", None, []
|
|
|
|
| 715 |
|
| 716 |
if has_grounding and '<|ref|>' in result:
|
| 717 |
refs = extract_grounding_references(result)
|
| 718 |
+
if task == "📋 Markdown" and enable_equation_zoom:
|
| 719 |
refs.extend(_refine_equation_refs(image, result))
|
| 720 |
if refs:
|
| 721 |
img_out, crops = draw_bounding_boxes(image, refs, True)
|
|
|
|
| 728 |
return cleaned, markdown, result_for_layout, img_out, crops
|
| 729 |
|
| 730 |
@spaces.GPU(duration=90)
|
| 731 |
+
def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
|
| 732 |
doc = fitz.open(path)
|
| 733 |
total_pages = len(doc)
|
| 734 |
if page_num < 1 or page_num > total_pages:
|
|
|
|
| 739 |
img = Image.open(BytesIO(pix.tobytes("png")))
|
| 740 |
doc.close()
|
| 741 |
|
| 742 |
+
return process_image(
|
| 743 |
+
img,
|
| 744 |
+
task,
|
| 745 |
+
custom_prompt,
|
| 746 |
+
enable_equation_zoom=enable_equation_zoom,
|
| 747 |
+
infer_crop_mode=infer_crop_mode,
|
| 748 |
+
)
|
| 749 |
|
| 750 |
+
def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
    """Route an uploaded file to the PDF or the image OCR pipeline.

    Args:
        path: Filesystem path of the upload; a falsy value yields the error tuple.
        task: Task key forwarded unchanged to the downstream processor.
        custom_prompt: Prompt text forwarded unchanged to the downstream processor.
        page_num: Page number, only used when the path ends in ".pdf".
        enable_equation_zoom: Forwarded as-is to process_pdf / process_image.
        infer_crop_mode: Forwarded as-is to process_pdf / process_image.

    Returns:
        The 5-tuple produced by process_pdf / process_image, or
        ("Error: Upload a file", "", "", None, []) when no path was given.
    """
    if not path:
        return "Error: Upload a file", "", "", None, []

    # Both pipelines accept the same optional switches; collect them once.
    forwarded = {
        "enable_equation_zoom": enable_equation_zoom,
        "infer_crop_mode": infer_crop_mode,
    }

    # The extension check is case-insensitive; anything that is not a PDF is
    # opened as an image.
    looks_like_pdf = path.lower().endswith('.pdf')
    if not looks_like_pdf:
        return process_image(Image.open(path), task, custom_prompt, **forwarded)
    return process_pdf(path, task, custom_prompt, page_num, **forwarded)
|
| 770 |
|
| 771 |
def _extract_editor_image(editor_value):
|
| 772 |
if editor_value is None:
|
|
|
|
| 782 |
return background
|
| 783 |
return None
|
| 784 |
|
| 785 |
+
def _dedupe_consecutive_lines(text: str) -> str:
|
| 786 |
+
if not text:
|
| 787 |
+
return text
|
| 788 |
+
out = []
|
| 789 |
+
prev = None
|
| 790 |
+
blank_count = 0
|
| 791 |
+
for line in text.splitlines():
|
| 792 |
+
if not line.strip():
|
| 793 |
+
blank_count += 1
|
| 794 |
+
if blank_count <= 2:
|
| 795 |
+
out.append("")
|
| 796 |
+
continue
|
| 797 |
+
blank_count = 0
|
| 798 |
+
norm = re.sub(r'\s+', ' ', line).strip()
|
| 799 |
+
if norm and norm == prev:
|
| 800 |
+
continue
|
| 801 |
+
out.append(line)
|
| 802 |
+
prev = norm
|
| 803 |
+
return "\n".join(out).strip()
|
| 804 |
+
|
| 805 |
+
def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
    """Assemble the 12-value tuple that the Gradio click handlers return.

    Args:
        cleaned: Cleaned OCR text; shown in the text view and written to the
            downloadable ".md" file.
        markdown: Markdown text rendered through to_math_html,
            to_mathjax_html and to_spatial_html.
        raw: Raw model output; passed to to_spatial_html and returned as-is.
        img_out: Annotated image (or None), returned as-is.
        gallery_items: Items for the gallery output, returned as-is.

    Returns:
        A tuple in the order of the click handlers' output lists:
        text_out, md_out, html_out, html_source_out, spatial_out,
        spatial_source_out, raw_out, img_out, gallery, download_btn,
        region_text_out, region_html_out. The last two repeat the text
        display and the rendered markdown HTML.
    """
    # Rewrite LaTeX delimiters for display: \[...\] becomes a $$ block and
    # \(...\) becomes inline $...$.
    text_display = re.sub(
        r'\\\[(.+?)\\\]',
        lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
        cleaned,
        flags=re.DOTALL
    )
    text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)

    # The context manager closes the handle even if write() raises.
    # delete=False keeps the file on disk so the download button can serve it
    # after the handle is closed; nothing here removes it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8') as dl_tmp:
        dl_tmp.write(cleaned)

    markdown_html = to_math_html(markdown)
    mathjax_html = to_mathjax_html(markdown)
    spatial_html = to_spatial_html(raw, markdown)

    return (
        text_display,
        markdown_html,
        mathjax_html,
        mathjax_html,
        spatial_html,
        spatial_html,
        raw,
        img_out,
        gallery_items,
        gr.DownloadButton(value=dl_tmp.name, visible=True),
        text_display,
        markdown_html,
    )
|
| 836 |
+
|
| 837 |
+
def run_region(editor_value, task, custom_prompt, enable_equation_zoom):
    """Run OCR on the region drawn/cropped in the image editor.

    Args:
        editor_value: Editor component value, resolved to an image by
            _extract_editor_image.
        task: Task key forwarded to process_image.
        custom_prompt: Prompt text forwarded to process_image.
        enable_equation_zoom: Forwarded to process_image.

    Returns:
        The 12-value tuple from _compose_ui_outputs, or a placeholder tuple
        carrying a hint message (with the download button hidden) when no
        region image could be extracted.
    """
    region = _extract_editor_image(editor_value)
    if region is None:
        msg = "Draw/crop a region first, then click OCR Region."
        return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")

    # Region OCR always calls the pipeline with infer_crop_mode=False.
    ocr_options = {
        "enable_equation_zoom": enable_equation_zoom,
        "infer_crop_mode": False,
    }
    cleaned, markdown, raw, img_out, crops = process_image(region, task, custom_prompt, **ocr_options)

    # Region workflows are single-area; collapse obvious duplicate lines.
    cleaned = _dedupe_consecutive_lines(cleaned)
    markdown = _dedupe_consecutive_lines(markdown)

    # The selected region leads the gallery, followed by any detected crops.
    extra_crops = crops or []
    gallery_items = [region] + extra_crops
    return _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items)
|
| 856 |
|
| 857 |
def toggle_prompt(task):
|
| 858 |
if task == "✏️ Custom":
|
|
|
|
| 915 |
input_img = gr.Image(label="Input Image", type="pil", height=300)
|
| 916 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
| 917 |
task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
|
| 918 |
+
equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
|
| 919 |
prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
|
| 920 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 921 |
with gr.Accordion("Region OCR (Draw/Crop)", open=False):
|
|
|
|
| 984 |
- **Describe**: General image description
|
| 985 |
- **Custom**: Your own prompt
|
| 986 |
- **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
|
| 987 |
+
- **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
|
| 988 |
|
| 989 |
### Free OCR vs Locate (important)
|
| 990 |
- **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
|
|
|
|
| 1008 |
file_in.change(load_image, [file_in, page_selector], [region_editor])
|
| 1009 |
page_selector.change(load_image, [file_in, page_selector], [region_editor])
|
| 1010 |
input_img.change(lambda img: img, [input_img], [region_editor])
|
| 1011 |
+
region_btn.click(
|
| 1012 |
+
run_region,
|
| 1013 |
+
[region_editor, task, prompt, equation_zoom],
|
| 1014 |
+
[text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
|
| 1015 |
+
)
|
| 1016 |
|
| 1017 |
+
def run(image, file_path, task, custom_prompt, page_num, enable_equation_zoom):
    """Handle the main "Extract" click for an uploaded file or an image.

    An uploaded file takes precedence over the image input.

    Args:
        image: Image from the image input, or None.
        file_path: Path of the uploaded file, or a falsy value.
        task: Task key forwarded to the processing function.
        custom_prompt: Prompt text forwarded to the processing function.
        page_num: Page selector value; may be a float, or None when the field
            has been cleared.
        enable_equation_zoom: Forwarded to the processing function.

    Returns:
        The 12-value tuple from _compose_ui_outputs, or a placeholder tuple
        carrying an error message (with the download button hidden) when
        neither a file nor an image was supplied.
    """
    if file_path:
        # A cleared number field arrives as None and int(None) raises
        # TypeError; fall back to the selector's default first page.
        page_index = int(page_num) if page_num is not None else 1
        cleaned, markdown, raw, img_out, crops = process_file(
            file_path,
            task,
            custom_prompt,
            page_index,
            enable_equation_zoom=enable_equation_zoom,
        )
    elif image is not None:
        cleaned, markdown, raw, img_out, crops = process_image(
            image,
            task,
            custom_prompt,
            enable_equation_zoom=enable_equation_zoom,
        )
    else:
        msg = "Error: Upload a file or image"
        return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")

    return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1038 |
|
| 1039 |
+
submit_event = btn.click(
|
| 1040 |
+
run,
|
| 1041 |
+
[input_img, file_in, task, prompt, page_selector, equation_zoom],
|
| 1042 |
+
[text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
|
| 1043 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1044 |
submit_event.then(select_boxes, [task], [tabs])
|
| 1045 |
|
| 1046 |
if __name__ == "__main__":
|