Spaces:
Running on CPU Upgrade
Commit ·
0f61ca8
1
Parent(s): 0585b7f
Updates application to work directly with PyMuPDF Layout.
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import pymupdf
|
| 2 |
import pymupdf.layout
|
| 3 |
import pymupdf4llm
|
| 4 |
from PIL import Image
|
|
|
|
|
|
|
| 1 |
import pymupdf.layout
|
| 2 |
import pymupdf4llm
|
| 3 |
from PIL import Image
|
my_gui.py
CHANGED
|
@@ -61,14 +61,14 @@ with gr.Blocks(css=custom_css) as my_gui:
|
|
| 61 |
label="Maximum number of pages to convert",
|
| 62 |
info="", precision=0, interactive=True,
|
| 63 |
step=1, elem_id="mySlider")
|
| 64 |
-
|
| 65 |
with gr.Row():
|
| 66 |
checkboxes = gr.CheckboxGroup(["Separate pages", "Embed images"],
|
| 67 |
label="Conversion options")
|
| 68 |
|
| 69 |
with gr.Row():
|
| 70 |
radios = gr.Radio(["lines_strict", "lines", "text"], value="lines_strict", label="Table detection strategy", info="Default is “lines” which uses all vector graphics on the page to detect grid lines.\nStrategy “lines_strict” ignores borderless rectangle vector graphics. Sometimes single text pieces have background colors which may lead to false columns or lines. This strategy ignores them and can thus increase detection precision.\nIf “text” is specified, text positions are used to generate “virtual” column and / or row boundaries.")
|
| 71 |
-
|
| 72 |
with gr.Row():
|
| 73 |
submit_btn = gr.Button("Convert", scale=1, elem_classes=["orange-gradient-btn"])
|
| 74 |
|
|
@@ -91,7 +91,7 @@ with gr.Blocks(css=custom_css) as my_gui:
|
|
| 91 |
inputs=[file_input, page_range_slider],
|
| 92 |
outputs=[gallery]
|
| 93 |
)
|
| 94 |
-
|
| 95 |
submit_btn.click(
|
| 96 |
fn=convertToMD,
|
| 97 |
inputs=[page_range_slider, checkboxes, radios],
|
|
@@ -103,3 +103,15 @@ with gr.Blocks(css=custom_css) as my_gui:
|
|
| 103 |
).success(
|
| 104 |
fn=convertComplete, inputs=[], outputs=[]
|
| 105 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
label="Maximum number of pages to convert",
|
| 62 |
info="", precision=0, interactive=True,
|
| 63 |
step=1, elem_id="mySlider")
|
| 64 |
+
'''
|
| 65 |
with gr.Row():
|
| 66 |
checkboxes = gr.CheckboxGroup(["Separate pages", "Embed images"],
|
| 67 |
label="Conversion options")
|
| 68 |
|
| 69 |
with gr.Row():
|
| 70 |
radios = gr.Radio(["lines_strict", "lines", "text"], value="lines_strict", label="Table detection strategy", info="Default is “lines” which uses all vector graphics on the page to detect grid lines.\nStrategy “lines_strict” ignores borderless rectangle vector graphics. Sometimes single text pieces have background colors which may lead to false columns or lines. This strategy ignores them and can thus increase detection precision.\nIf “text” is specified, text positions are used to generate “virtual” column and / or row boundaries.")
|
| 71 |
+
'''
|
| 72 |
with gr.Row():
|
| 73 |
submit_btn = gr.Button("Convert", scale=1, elem_classes=["orange-gradient-btn"])
|
| 74 |
|
|
|
|
| 91 |
inputs=[file_input, page_range_slider],
|
| 92 |
outputs=[gallery]
|
| 93 |
)
|
| 94 |
+
'''
|
| 95 |
submit_btn.click(
|
| 96 |
fn=convertToMD,
|
| 97 |
inputs=[page_range_slider, checkboxes, radios],
|
|
|
|
| 103 |
).success(
|
| 104 |
fn=convertComplete, inputs=[], outputs=[]
|
| 105 |
)
|
| 106 |
+
'''
|
| 107 |
+
submit_btn.click(
|
| 108 |
+
fn=convertToMD,
|
| 109 |
+
inputs=[page_range_slider],
|
| 110 |
+
outputs=[
|
| 111 |
+
md_result,
|
| 112 |
+
raw_text_result
|
| 113 |
+
],
|
| 114 |
+
queue=False,
|
| 115 |
+
).success(
|
| 116 |
+
fn=convertComplete, inputs=[], outputs=[]
|
| 117 |
+
)
|
my_io.py
CHANGED
|
@@ -25,17 +25,18 @@ def ready(file, page_num:int):
|
|
| 25 |
|
| 26 |
return images
|
| 27 |
|
| 28 |
-
def convertToMD(page_num:int, checkboxes:str, radios:str):
|
| 29 |
|
| 30 |
choice_table_strategy = radios
|
| 31 |
choice_page_separators = False
|
| 32 |
choice_embed_images = False
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
if doc == None:
|
| 41 |
raise gr.Error(message="Please upload a PDF")
|
|
@@ -46,13 +47,7 @@ def convertToMD(page_num:int, checkboxes:str, radios:str):
|
|
| 46 |
print(f"page num={page_num}")
|
| 47 |
page_range = range(0, page_num)
|
| 48 |
md = pymupdf4llm.to_markdown(doc,
|
| 49 |
-
pages = page_range
|
| 50 |
-
write_images = True,
|
| 51 |
-
image_path = "images",
|
| 52 |
-
dpi=100,
|
| 53 |
-
page_separators = choice_page_separators,
|
| 54 |
-
embed_images = choice_embed_images,
|
| 55 |
-
table_strategy = choice_table_strategy)
|
| 56 |
return md, md
|
| 57 |
|
| 58 |
def convertComplete():
|
|
|
|
| 25 |
|
| 26 |
return images
|
| 27 |
|
| 28 |
+
def convertToMD(page_num:int, checkboxes:str = None, radios:str = None):
|
| 29 |
|
| 30 |
choice_table_strategy = radios
|
| 31 |
choice_page_separators = False
|
| 32 |
choice_embed_images = False
|
| 33 |
|
| 34 |
+
if checkboxes is not None:
|
| 35 |
+
for n in checkboxes:
|
| 36 |
+
if n == "Separate pages":
|
| 37 |
+
choice_page_separators = True
|
| 38 |
+
if n == "Embed images":
|
| 39 |
+
choice_embed_images = True
|
| 40 |
|
| 41 |
if doc == None:
|
| 42 |
raise gr.Error(message="Please upload a PDF")
|
|
|
|
| 47 |
print(f"page num={page_num}")
|
| 48 |
page_range = range(0, page_num)
|
| 49 |
md = pymupdf4llm.to_markdown(doc,
|
| 50 |
+
pages = page_range)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
return md, md
|
| 52 |
|
| 53 |
def convertComplete():
|