JamieLemon committed on
Commit
0f61ca8
·
1 Parent(s): 0585b7f

Updates application to work directly with PyMuPDF Layout.

Browse files
Files changed (3) hide show
  1. app.py +0 -1
  2. my_gui.py +15 -3
  3. my_io.py +8 -13
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import pymupdf
2
  import pymupdf.layout
3
  import pymupdf4llm
4
  from PIL import Image
 
 
1
  import pymupdf.layout
2
  import pymupdf4llm
3
  from PIL import Image
my_gui.py CHANGED
@@ -61,14 +61,14 @@ with gr.Blocks(css=custom_css) as my_gui:
61
  label="Maximum number of pages to convert",
62
  info="", precision=0, interactive=True,
63
  step=1, elem_id="mySlider")
64
-
65
  with gr.Row():
66
  checkboxes = gr.CheckboxGroup(["Separate pages", "Embed images"],
67
  label="Conversion options")
68
 
69
  with gr.Row():
70
  radios = gr.Radio(["lines_strict", "lines", "text"], value="lines_strict", label="Table detection strategy", info="Default is “lines” which uses all vector graphics on the page to detect grid lines.\nStrategy “lines_strict” ignores borderless rectangle vector graphics. Sometimes single text pieces have background colors which may lead to false columns or lines. This strategy ignores them and can thus increase detection precision.\nIf “text” is specified, text positions are used to generate “virtual” column and / or row boundaries.")
71
-
72
  with gr.Row():
73
  submit_btn = gr.Button("Convert", scale=1, elem_classes=["orange-gradient-btn"])
74
 
@@ -91,7 +91,7 @@ with gr.Blocks(css=custom_css) as my_gui:
91
  inputs=[file_input, page_range_slider],
92
  outputs=[gallery]
93
  )
94
-
95
  submit_btn.click(
96
  fn=convertToMD,
97
  inputs=[page_range_slider, checkboxes, radios],
@@ -103,3 +103,15 @@ with gr.Blocks(css=custom_css) as my_gui:
103
  ).success(
104
  fn=convertComplete, inputs=[], outputs=[]
105
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  label="Maximum number of pages to convert",
62
  info="", precision=0, interactive=True,
63
  step=1, elem_id="mySlider")
64
+ '''
65
  with gr.Row():
66
  checkboxes = gr.CheckboxGroup(["Separate pages", "Embed images"],
67
  label="Conversion options")
68
 
69
  with gr.Row():
70
  radios = gr.Radio(["lines_strict", "lines", "text"], value="lines_strict", label="Table detection strategy", info="Default is “lines” which uses all vector graphics on the page to detect grid lines.\nStrategy “lines_strict” ignores borderless rectangle vector graphics. Sometimes single text pieces have background colors which may lead to false columns or lines. This strategy ignores them and can thus increase detection precision.\nIf “text” is specified, text positions are used to generate “virtual” column and / or row boundaries.")
71
+ '''
72
  with gr.Row():
73
  submit_btn = gr.Button("Convert", scale=1, elem_classes=["orange-gradient-btn"])
74
 
 
91
  inputs=[file_input, page_range_slider],
92
  outputs=[gallery]
93
  )
94
+ '''
95
  submit_btn.click(
96
  fn=convertToMD,
97
  inputs=[page_range_slider, checkboxes, radios],
 
103
  ).success(
104
  fn=convertComplete, inputs=[], outputs=[]
105
  )
106
+ '''
107
+ submit_btn.click(
108
+ fn=convertToMD,
109
+ inputs=[page_range_slider],
110
+ outputs=[
111
+ md_result,
112
+ raw_text_result
113
+ ],
114
+ queue=False,
115
+ ).success(
116
+ fn=convertComplete, inputs=[], outputs=[]
117
+ )
my_io.py CHANGED
@@ -25,17 +25,18 @@ def ready(file, page_num:int):
25
 
26
  return images
27
 
28
- def convertToMD(page_num:int, checkboxes:str, radios:str):
29
 
30
  choice_table_strategy = radios
31
  choice_page_separators = False
32
  choice_embed_images = False
33
 
34
- for n in checkboxes:
35
- if n == "Separate pages":
36
- choice_page_separators = True
37
- if n == "Embed images":
38
- choice_embed_images = True
 
39
 
40
  if doc == None:
41
  raise gr.Error(message="Please upload a PDF")
@@ -46,13 +47,7 @@ def convertToMD(page_num:int, checkboxes:str, radios:str):
46
  print(f"page num={page_num}")
47
  page_range = range(0, page_num)
48
  md = pymupdf4llm.to_markdown(doc,
49
- pages = page_range,
50
- write_images = True,
51
- image_path = "images",
52
- dpi=100,
53
- page_separators = choice_page_separators,
54
- embed_images = choice_embed_images,
55
- table_strategy = choice_table_strategy)
56
  return md, md
57
 
58
  def convertComplete():
 
25
 
26
  return images
27
 
28
+ def convertToMD(page_num:int, checkboxes:str = None, radios:str = None):
29
 
30
  choice_table_strategy = radios
31
  choice_page_separators = False
32
  choice_embed_images = False
33
 
34
+ if checkboxes is not None:
35
+ for n in checkboxes:
36
+ if n == "Separate pages":
37
+ choice_page_separators = True
38
+ if n == "Embed images":
39
+ choice_embed_images = True
40
 
41
  if doc == None:
42
  raise gr.Error(message="Please upload a PDF")
 
47
  print(f"page num={page_num}")
48
  page_range = range(0, page_num)
49
  md = pymupdf4llm.to_markdown(doc,
50
+ pages = page_range)
 
 
 
 
 
 
51
  return md, md
52
 
53
  def convertComplete():