ricklon committed on
Commit
152c5bd
·
1 Parent(s): c0f56fe

Improve faculty-facing region selection guidance in UI

Browse files
Files changed (1) hide show
  1. app.py +39 -6
app.py CHANGED
@@ -915,19 +915,29 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
915
  input_img = gr.Image(label="Input Image", type="pil", height=300)
916
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
917
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
 
918
  equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
 
 
 
 
 
 
 
 
919
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
920
  btn = gr.Button("Extract", variant="primary", size="lg")
921
- with gr.Accordion("Region OCR (Draw/Crop)", open=False):
922
  if HAS_IMAGE_EDITOR:
923
  region_editor = gr.ImageEditor(
924
- label="Draw a box and crop to the target area, then click OCR Region",
925
  type="pil",
926
  height=300,
927
  )
928
  region_btn = gr.Button("OCR Region", variant="secondary")
929
  else:
930
  gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
 
931
 
932
  with gr.Column(scale=2):
933
  with gr.Tabs() as tabs:
@@ -977,13 +987,22 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
977
  ### Configuration
978
  1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.
979
 
 
 
 
 
 
 
 
 
980
  ### Tasks
981
  - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
982
  - **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
983
  - **Locate**: Find and highlight where specific text appears (grounding ✅)
984
  - **Describe**: General image description
985
  - **Custom**: Your own prompt
986
- - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
 
987
  - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
988
 
989
  ### Free OCR vs Locate (important)
@@ -1014,8 +1033,22 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1014
  [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
1015
  )
1016
 
1017
- def run(image, file_path, task, custom_prompt, page_num, enable_equation_zoom):
1018
- if file_path:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1019
  cleaned, markdown, raw, img_out, crops = process_file(
1020
  file_path,
1021
  task,
@@ -1038,7 +1071,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
1038
 
1039
  submit_event = btn.click(
1040
  run,
1041
- [input_img, file_in, task, prompt, page_selector, equation_zoom],
1042
  [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
1043
  )
1044
  submit_event.then(select_boxes, [task], [tabs])
 
915
  input_img = gr.Image(label="Input Image", type="pil", height=300)
916
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
917
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
918
+ input_scope = gr.Radio(["Entire Page", "Selected Region"], value="Entire Page", label="Input Scope")
919
  equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
920
+ gr.Markdown(
921
+ """
922
+ **Quick use**
923
+ 1. `Entire Page`: click **Extract**.
924
+ 2. `Selected Region`: open **Region Selector**, draw a box around the target (no painting), crop, then click **Extract**.
925
+ 3. Check **Cropped Images** to confirm the selected region used for OCR.
926
+ """
927
+ )
928
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
929
  btn = gr.Button("Extract", variant="primary", size="lg")
930
+ with gr.Accordion("Region Selector (Draw/Crop)", open=False):
931
  if HAS_IMAGE_EDITOR:
932
  region_editor = gr.ImageEditor(
933
+ label="Draw a rectangle around what you want (do not paint/fill), crop, then run Extract with Input Scope=Selected Region.",
934
  type="pil",
935
  height=300,
936
  )
937
  region_btn = gr.Button("OCR Region", variant="secondary")
938
  else:
939
  gr.Markdown("Region drawing requires a newer Gradio version with `ImageEditor` support.")
940
+ region_editor = gr.State(None)
941
 
942
  with gr.Column(scale=2):
943
  with gr.Tabs() as tabs:
 
987
  ### Configuration
988
  1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.
989
 
990
+ ### Faculty Quick Workflow
991
+ 1. Choose a task (`Markdown`, `Free OCR`, or `Locate`).
992
+ 2. Choose **Input Scope**:
993
+ - `Entire Page` for the full page.
994
+ - `Selected Region` for a specific area.
995
+ 3. For `Selected Region`, open **Region Selector (Draw/Crop)**, draw a box around the target (no painting/fill), crop, then click **Extract**.
996
+ 4. Review **Cropped Images** to confirm the selected region used for OCR.
997
+
998
  ### Tasks
999
  - **Markdown**: Convert document to structured markdown with layout detection (grounding ✅)
1000
  - **Free OCR**: Read all visible text from the full page/image (no boxes, no targeting)
1001
  - **Locate**: Find and highlight where specific text appears (grounding ✅)
1002
  - **Describe**: General image description
1003
  - **Custom**: Your own prompt
1004
+ - **Region OCR (new)**: In the left panel, open **Region Selector (Draw/Crop)**, draw/crop a target area, then click **OCR Region** (or set Input Scope to Selected Region and click Extract)
1005
+ - **Input Scope**: `Entire Page` or `Selected Region` (Selected Region uses the Region Selector crop as main input)
1006
  - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
1007
 
1008
  ### Free OCR vs Locate (important)
 
1033
  [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
1034
  )
1035
 
1036
+ def run(image, file_path, task, custom_prompt, page_num, enable_equation_zoom, scope, region_value):
1037
+ selected_region = None
1038
+ if scope == "Selected Region":
1039
+ selected_region = _extract_editor_image(region_value)
1040
+ if selected_region is None:
1041
+ msg = "Select Input Scope=Selected Region, then draw/crop in Region Selector first."
1042
+ return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")
1043
+ cleaned, markdown, raw, img_out, crops = process_image(
1044
+ selected_region,
1045
+ task,
1046
+ custom_prompt,
1047
+ enable_equation_zoom=enable_equation_zoom,
1048
+ infer_crop_mode=False,
1049
+ )
1050
+ crops = [selected_region] + (crops or [])
1051
+ elif file_path:
1052
  cleaned, markdown, raw, img_out, crops = process_file(
1053
  file_path,
1054
  task,
 
1071
 
1072
  submit_event = btn.click(
1073
  run,
1074
+ [input_img, file_in, task, prompt, page_selector, equation_zoom, input_scope, region_editor],
1075
  [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
1076
  )
1077
  submit_event.then(select_boxes, [task], [tabs])