ricklon committed on
Commit
d6afca6
·
1 Parent(s): f76cb58

Add region OCR UI and clarify Free OCR vs Locate behavior

Browse files
Files changed (1) hide show
  1. app.py +52 -2
app.py CHANGED
@@ -18,6 +18,8 @@ import latex2mathml.converter
18
 
19
  from io import StringIO, BytesIO
20
 
 
 
21
  # Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
22
  #
23
  # Full precision BF16 (~8GB VRAM) — original, highest accuracy
@@ -745,6 +747,27 @@ def process_file(path, task, custom_prompt, page_num):
745
  else:
746
  return process_image(Image.open(path), task, custom_prompt)
747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
  def toggle_prompt(task):
749
  if task == "✏️ Custom":
750
  return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
@@ -798,6 +821,8 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
798
  This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
799
  """)
800
 
 
 
801
  with gr.Row():
802
  with gr.Column(scale=1):
803
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
@@ -806,6 +831,16 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
806
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
807
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
808
  btn = gr.Button("Extract", variant="primary", size="lg")
 
 
 
 
 
 
 
 
 
 
809
 
810
  with gr.Column(scale=2):
811
  with gr.Tabs() as tabs:
@@ -825,6 +860,9 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
825
  gallery = gr.Gallery(show_label=False, columns=3, height=400)
826
  with gr.Tab("Raw Text", id="tab_raw"):
827
  raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
 
 
 
828
  download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
829
 
830
  with gr.Accordion("Image Examples", open=True):
@@ -854,10 +892,17 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
854
 
855
  ### Tasks
856
  - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
857
- - **Free OCR**: Simple text extraction without layout
858
- - **Locate**: Find and highlight specific text/elements in image (grounding ✅)
859
  - **Describe**: General image description
860
  - **Custom**: Your own prompt
 
 
 
 
 
 
 
861
 
862
  ### Special Tokens
863
  - `<image>` - Placeholder where visual tokens are inserted
@@ -871,6 +916,11 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
871
  page_selector.change(load_image, [file_in, page_selector], [input_img])
872
  task.change(toggle_prompt, [task], [prompt])
873
  task.change(select_boxes, [task], [tabs])
 
 
 
 
 
874
 
875
  def run(image, file_path, task, custom_prompt, page_num):
876
  if file_path:
 
18
 
19
  from io import StringIO, BytesIO
20
 
21
+ HAS_IMAGE_EDITOR = hasattr(gr, "ImageEditor")
22
+
23
  # Model options — swap MODEL_NAME to reduce VRAM usage on GPUs with <= 8GB
24
  #
25
  # Full precision BF16 (~8GB VRAM) — original, highest accuracy
 
747
  else:
748
  return process_image(Image.open(path), task, custom_prompt)
749
 
750
+ def _extract_editor_image(editor_value):
751
+ if editor_value is None:
752
+ return None
753
+ if isinstance(editor_value, Image.Image):
754
+ return editor_value
755
+ if isinstance(editor_value, dict):
756
+ composite = editor_value.get("composite")
757
+ if isinstance(composite, Image.Image):
758
+ return composite
759
+ background = editor_value.get("background")
760
+ if isinstance(background, Image.Image):
761
+ return background
762
+ return None
763
+
764
def process_region_ocr(editor_value):
    """Run the Free OCR task on the image held by the region editor.

    Args:
        editor_value: Value delivered by the region editor component; it is
            reduced to a single image via ``_extract_editor_image``.

    Returns:
        A ``(text, html)`` pair. When no image can be extracted, ``text`` is
        a hint asking the user to draw/crop a region and ``html`` is empty.
        Otherwise ``text`` is the first value returned by ``process_image``
        and ``html`` is its second value (markdown) passed through
        ``to_math_html``.
    """
    region = _extract_editor_image(editor_value)
    if region is None:
        return "Draw/crop a region first, then click OCR Region.", ""

    # process_image yields five values; only the first two are used here.
    ocr_text, ocr_markdown, _, _, _ = process_image(region, "📝 Free OCR", "")
    rendered_html = to_math_html(ocr_markdown)
    return ocr_text, rendered_html
770
+
771
  def toggle_prompt(task):
772
  if task == "✏️ Custom":
773
  return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for bounding boxes")
 
821
  This fork adds **math rendering** in the Markdown Preview tab so that equations from scanned papers and textbooks display as proper math notation.
822
  """)
823
 
824
+ region_editor = None
825
+ region_btn = None
826
  with gr.Row():
827
  with gr.Column(scale=1):
828
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
 
831
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
832
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
833
  btn = gr.Button("Extract", variant="primary", size="lg")
834
+ with gr.Accordion("Region OCR (Draw/Crop)", open=False):
835
+ if HAS_IMAGE_EDITOR:
836
+ region_editor = gr.ImageEditor(
837
+ label="Draw a box and crop to the target area, then click OCR Region",
838
+ type="pil",
839
+ height=300,
840
+ )
841
+ region_btn = gr.Button("OCR Region", variant="secondary")
842
+ else:
843
+ gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
844
 
845
  with gr.Column(scale=2):
846
  with gr.Tabs() as tabs:
 
860
  gallery = gr.Gallery(show_label=False, columns=3, height=400)
861
  with gr.Tab("Raw Text", id="tab_raw"):
862
  raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
863
+ with gr.Tab("Region OCR", id="tab_region"):
864
+ region_text_out = gr.Textbox(lines=12, buttons=["copy"], label="Region OCR Text")
865
+ region_html_out = gr.HTML("")
866
  download_btn = gr.DownloadButton("Download Markdown", visible=False, variant="secondary")
867
 
868
  with gr.Accordion("Image Examples", open=True):
 
892
 
893
  ### Tasks
894
  - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
895
+ - **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
896
+ - **Locate**: Find and highlight where specific text appears (grounding ✅)
897
  - **Describe**: General image description
898
  - **Custom**: Your own prompt
899
+ - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
900
+
901
+ ### Free OCR vs Locate (important)
902
+ - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
903
+ - If you want OCR for one area only, crop that area first, then run **Free OCR** on the cropped image.
904
+ - If you want to keep the full page but highlight where text appears, use **Locate** and enter the text to search.
905
+ - For advanced region workflows, use **Custom** with `<|grounding|>` in the prompt.
906
 
907
  ### Special Tokens
908
  - `<image>` - Placeholder where visual tokens are inserted
 
916
  page_selector.change(load_image, [file_in, page_selector], [input_img])
917
  task.change(toggle_prompt, [task], [prompt])
918
  task.change(select_boxes, [task], [tabs])
919
+ if HAS_IMAGE_EDITOR and region_editor is not None and region_btn is not None:
920
+ file_in.change(load_image, [file_in, page_selector], [region_editor])
921
+ page_selector.change(load_image, [file_in, page_selector], [region_editor])
922
+ input_img.change(lambda img: img, [input_img], [region_editor])
923
+ region_btn.click(process_region_ocr, [region_editor], [region_text_out, region_html_out])
924
 
925
  def run(image, file_path, task, custom_prompt, page_num):
926
  if file_path: