ricklon committed on
Commit
c0f56fe
·
1 Parent(s): 0da0e18

Make equation multipass optional and integrate region OCR across all outputs

Browse files
Files changed (1) hide show
  1. app.py +129 -37
app.py CHANGED
@@ -578,7 +578,9 @@ def embed_images(markdown, crops):
578
  markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
579
  return markdown
580
 
581
- def _infer_with_prompt(image, prompt):
 
 
582
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
583
  image.save(tmp.name, 'JPEG', quality=95)
584
  tmp.close()
@@ -595,7 +597,7 @@ def _infer_with_prompt(image, prompt):
595
  output_path=out_dir,
596
  base_size=BASE_SIZE,
597
  image_size=IMAGE_SIZE,
598
- crop_mode=CROP_MODE,
599
  save_results=False
600
  )
601
  finally:
@@ -679,7 +681,7 @@ def _refine_equation_refs(image, raw_text):
679
  return refined_refs
680
 
681
  @spaces.GPU(duration=90)
682
- def process_image(image, task, custom_prompt):
683
  model.cuda() # GPU is available here — works on ZeroGPU and locally
684
  if image is None:
685
  return "Error: Upload an image", "", "", None, []
@@ -699,7 +701,7 @@ def process_image(image, task, custom_prompt):
699
  else:
700
  prompt = TASK_PROMPTS[task]["prompt"]
701
  has_grounding = TASK_PROMPTS[task]["has_grounding"]
702
- result = _infer_with_prompt(image, prompt)
703
 
704
  if not result:
705
  return "No text detected", "", "", None, []
@@ -713,7 +715,7 @@ def process_image(image, task, custom_prompt):
713
 
714
  if has_grounding and '<|ref|>' in result:
715
  refs = extract_grounding_references(result)
716
- if task == "📋 Markdown":
717
  refs.extend(_refine_equation_refs(image, result))
718
  if refs:
719
  img_out, crops = draw_bounding_boxes(image, refs, True)
@@ -726,7 +728,7 @@ def process_image(image, task, custom_prompt):
726
  return cleaned, markdown, result_for_layout, img_out, crops
727
 
728
  @spaces.GPU(duration=90)
729
- def process_pdf(path, task, custom_prompt, page_num):
730
  doc = fitz.open(path)
731
  total_pages = len(doc)
732
  if page_num < 1 or page_num > total_pages:
@@ -737,15 +739,34 @@ def process_pdf(path, task, custom_prompt, page_num):
737
  img = Image.open(BytesIO(pix.tobytes("png")))
738
  doc.close()
739
 
740
- return process_image(img, task, custom_prompt)
 
 
 
 
 
 
741
 
742
- def process_file(path, task, custom_prompt, page_num):
743
  if not path:
744
  return "Error: Upload a file", "", "", None, []
745
  if path.lower().endswith('.pdf'):
746
- return process_pdf(path, task, custom_prompt, page_num)
 
 
 
 
 
 
 
747
  else:
748
- return process_image(Image.open(path), task, custom_prompt)
 
 
 
 
 
 
749
 
750
  def _extract_editor_image(editor_value):
751
  if editor_value is None:
@@ -761,12 +782,77 @@ def _extract_editor_image(editor_value):
761
  return background
762
  return None
763
 
764
- def process_region_ocr(editor_value):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
  image = _extract_editor_image(editor_value)
766
  if image is None:
767
- return "Draw/crop a region first, then click OCR Region.", ""
768
- text, markdown, _, _, _ = process_image(image, "📝 Free OCR", "")
769
- return text, to_math_html(markdown)
 
 
 
 
 
 
 
 
 
 
 
 
 
770
 
771
  def toggle_prompt(task):
772
  if task == "✏️ Custom":
@@ -829,6 +915,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
829
  input_img = gr.Image(label="Input Image", type="pil", height=300)
830
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
831
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
 
832
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
833
  btn = gr.Button("Extract", variant="primary", size="lg")
834
  with gr.Accordion("Region OCR (Draw/Crop)", open=False):
@@ -897,6 +984,7 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
897
  - **Describe**: General image description
898
  - **Custom**: Your own prompt
899
  - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
 
900
 
901
  ### Free OCR vs Locate (important)
902
  - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
@@ -920,35 +1008,39 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
920
  file_in.change(load_image, [file_in, page_selector], [region_editor])
921
  page_selector.change(load_image, [file_in, page_selector], [region_editor])
922
  input_img.change(lambda img: img, [input_img], [region_editor])
923
- region_btn.click(process_region_ocr, [region_editor], [region_text_out, region_html_out])
 
 
 
 
924
 
925
- def run(image, file_path, task, custom_prompt, page_num):
926
  if file_path:
927
- cleaned, markdown, raw, img_out, crops = process_file(file_path, task, custom_prompt, int(page_num))
 
 
 
 
 
 
928
  elif image is not None:
929
- cleaned, markdown, raw, img_out, crops = process_image(image, task, custom_prompt)
 
 
 
 
 
930
  else:
931
- return "Error: Upload a file or image", "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False)
 
932
 
933
- # Text tab: convert \[...\] → $$...$$ and \(...\) → $...$ for readability
934
- text_display = re.sub(r'\\\[(.+?)\\\]',
935
- lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
936
- cleaned, flags=re.DOTALL)
937
- text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
938
 
939
- # Download file: write cleaned markdown to a temp .md file
940
- dl_tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8')
941
- dl_tmp.write(cleaned)
942
- dl_tmp.close()
943
-
944
- mathjax_html = to_mathjax_html(markdown)
945
- spatial_html = to_spatial_html(raw, markdown)
946
-
947
- return (text_display, to_math_html(markdown), mathjax_html, mathjax_html, spatial_html, spatial_html, raw, img_out, crops,
948
- gr.DownloadButton(value=dl_tmp.name, visible=True))
949
-
950
- submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
951
- [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn])
952
  submit_event.then(select_boxes, [task], [tabs])
953
 
954
  if __name__ == "__main__":
 
578
  markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
579
  return markdown
580
 
581
+ def _infer_with_prompt(image, prompt, crop_mode=None):
582
+ if crop_mode is None:
583
+ crop_mode = CROP_MODE
584
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
585
  image.save(tmp.name, 'JPEG', quality=95)
586
  tmp.close()
 
597
  output_path=out_dir,
598
  base_size=BASE_SIZE,
599
  image_size=IMAGE_SIZE,
600
+ crop_mode=crop_mode,
601
  save_results=False
602
  )
603
  finally:
 
681
  return refined_refs
682
 
683
  @spaces.GPU(duration=90)
684
+ def process_image(image, task, custom_prompt, enable_equation_zoom=True, infer_crop_mode=None):
685
  model.cuda() # GPU is available here — works on ZeroGPU and locally
686
  if image is None:
687
  return "Error: Upload an image", "", "", None, []
 
701
  else:
702
  prompt = TASK_PROMPTS[task]["prompt"]
703
  has_grounding = TASK_PROMPTS[task]["has_grounding"]
704
+ result = _infer_with_prompt(image, prompt, crop_mode=infer_crop_mode)
705
 
706
  if not result:
707
  return "No text detected", "", "", None, []
 
715
 
716
  if has_grounding and '<|ref|>' in result:
717
  refs = extract_grounding_references(result)
718
+ if task == "📋 Markdown" and enable_equation_zoom:
719
  refs.extend(_refine_equation_refs(image, result))
720
  if refs:
721
  img_out, crops = draw_bounding_boxes(image, refs, True)
 
728
  return cleaned, markdown, result_for_layout, img_out, crops
729
 
730
  @spaces.GPU(duration=90)
731
+ def process_pdf(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
732
  doc = fitz.open(path)
733
  total_pages = len(doc)
734
  if page_num < 1 or page_num > total_pages:
 
739
  img = Image.open(BytesIO(pix.tobytes("png")))
740
  doc.close()
741
 
742
+ return process_image(
743
+ img,
744
+ task,
745
+ custom_prompt,
746
+ enable_equation_zoom=enable_equation_zoom,
747
+ infer_crop_mode=infer_crop_mode,
748
+ )
749
 
750
+ def process_file(path, task, custom_prompt, page_num, enable_equation_zoom=True, infer_crop_mode=None):
751
  if not path:
752
  return "Error: Upload a file", "", "", None, []
753
  if path.lower().endswith('.pdf'):
754
+ return process_pdf(
755
+ path,
756
+ task,
757
+ custom_prompt,
758
+ page_num,
759
+ enable_equation_zoom=enable_equation_zoom,
760
+ infer_crop_mode=infer_crop_mode,
761
+ )
762
  else:
763
+ return process_image(
764
+ Image.open(path),
765
+ task,
766
+ custom_prompt,
767
+ enable_equation_zoom=enable_equation_zoom,
768
+ infer_crop_mode=infer_crop_mode,
769
+ )
770
 
771
  def _extract_editor_image(editor_value):
772
  if editor_value is None:
 
782
  return background
783
  return None
784
 
785
+ def _dedupe_consecutive_lines(text: str) -> str:
786
+ if not text:
787
+ return text
788
+ out = []
789
+ prev = None
790
+ blank_count = 0
791
+ for line in text.splitlines():
792
+ if not line.strip():
793
+ blank_count += 1
794
+ if blank_count <= 2:
795
+ out.append("")
796
+ continue
797
+ blank_count = 0
798
+ norm = re.sub(r'\s+', ' ', line).strip()
799
+ if norm and norm == prev:
800
+ continue
801
+ out.append(line)
802
+ prev = norm
803
+ return "\n".join(out).strip()
804
+
805
+ def _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items):
806
+ text_display = re.sub(
807
+ r'\\\[(.+?)\\\]',
808
+ lambda m: f'\n$$\n{m.group(1).strip()}\n$$\n',
809
+ cleaned,
810
+ flags=re.DOTALL
811
+ )
812
+ text_display = re.sub(r'\\\((.+?)\\\)', lambda m: f'${m.group(1).strip()}$', text_display)
813
+
814
+ dl_tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8')
815
+ dl_tmp.write(cleaned)
816
+ dl_tmp.close()
817
+
818
+ markdown_html = to_math_html(markdown)
819
+ mathjax_html = to_mathjax_html(markdown)
820
+ spatial_html = to_spatial_html(raw, markdown)
821
+
822
+ return (
823
+ text_display,
824
+ markdown_html,
825
+ mathjax_html,
826
+ mathjax_html,
827
+ spatial_html,
828
+ spatial_html,
829
+ raw,
830
+ img_out,
831
+ gallery_items,
832
+ gr.DownloadButton(value=dl_tmp.name, visible=True),
833
+ text_display,
834
+ markdown_html,
835
+ )
836
+
837
+ def run_region(editor_value, task, custom_prompt, enable_equation_zoom):
838
  image = _extract_editor_image(editor_value)
839
  if image is None:
840
+ msg = "Draw/crop a region first, then click OCR Region."
841
+ return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")
842
+
843
+ cleaned, markdown, raw, img_out, crops = process_image(
844
+ image,
845
+ task,
846
+ custom_prompt,
847
+ enable_equation_zoom=enable_equation_zoom,
848
+ infer_crop_mode=False,
849
+ )
850
+
851
+ # Region workflows are single-area; collapse obvious duplicate lines.
852
+ cleaned = _dedupe_consecutive_lines(cleaned)
853
+ markdown = _dedupe_consecutive_lines(markdown)
854
+ gallery_items = [image] + (crops or [])
855
+ return _compose_ui_outputs(cleaned, markdown, raw, img_out, gallery_items)
856
 
857
  def toggle_prompt(task):
858
  if task == "✏️ Custom":
 
915
  input_img = gr.Image(label="Input Image", type="pil", height=300)
916
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
917
  task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
918
+ equation_zoom = gr.Checkbox(label="Equation Zoom (multipass)", value=False)
919
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
920
  btn = gr.Button("Extract", variant="primary", size="lg")
921
  with gr.Accordion("Region OCR (Draw/Crop)", open=False):
 
984
  - **Describe**: General image description
985
  - **Custom**: Your own prompt
986
  - **Region OCR (new)**: In the left panel, open **Region OCR (Draw/Crop)**, draw/crop a target area, then click **OCR Region**
987
+ - **Equation Zoom (multipass)**: Optional nested equation refinement for Markdown. Off by default for speed/stability.
988
 
989
  ### Free OCR vs Locate (important)
990
  - **Free OCR does not take a selected region**. It runs OCR on the whole image/page.
 
1008
  file_in.change(load_image, [file_in, page_selector], [region_editor])
1009
  page_selector.change(load_image, [file_in, page_selector], [region_editor])
1010
  input_img.change(lambda img: img, [input_img], [region_editor])
1011
+ region_btn.click(
1012
+ run_region,
1013
+ [region_editor, task, prompt, equation_zoom],
1014
+ [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
1015
+ )
1016
 
1017
+ def run(image, file_path, task, custom_prompt, page_num, enable_equation_zoom):
1018
  if file_path:
1019
+ cleaned, markdown, raw, img_out, crops = process_file(
1020
+ file_path,
1021
+ task,
1022
+ custom_prompt,
1023
+ int(page_num),
1024
+ enable_equation_zoom=enable_equation_zoom,
1025
+ )
1026
  elif image is not None:
1027
+ cleaned, markdown, raw, img_out, crops = process_image(
1028
+ image,
1029
+ task,
1030
+ custom_prompt,
1031
+ enable_equation_zoom=enable_equation_zoom,
1032
+ )
1033
  else:
1034
+ msg = "Error: Upload a file or image"
1035
+ return (msg, "", "", "", "", "", "", None, [], gr.DownloadButton(visible=False), msg, "")
1036
 
1037
+ return _compose_ui_outputs(cleaned, markdown, raw, img_out, crops)
 
 
 
 
1038
 
1039
+ submit_event = btn.click(
1040
+ run,
1041
+ [input_img, file_in, task, prompt, page_selector, equation_zoom],
1042
+ [text_out, md_out, html_out, html_source_out, spatial_out, spatial_source_out, raw_out, img_out, gallery, download_btn, region_text_out, region_html_out]
1043
+ )
 
 
 
 
 
 
 
 
1044
  submit_event.then(select_boxes, [task], [tabs])
1045
 
1046
  if __name__ == "__main__":