github-actions[bot]
committed on
Commit
·
0cfb559
1
Parent(s):
fa4d37c
Sync with https://github.com/mozilla-ai/document-to-markdown
Browse files
app.py
CHANGED
|
@@ -1,50 +1,49 @@
|
|
|
|
|
| 1 |
from typing import Dict, Tuple
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
| 4 |
-
import torch.cuda
|
| 5 |
from docling.datamodel.base_models import InputFormat
|
| 6 |
-
from docling.datamodel.pipeline_options import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 8 |
from docling_core.types import DoclingDocument
|
| 9 |
from docling.utils import model_downloader
|
| 10 |
-
from docling.datamodel.pipeline_options import smolvlm_picture_description
|
| 11 |
|
| 12 |
# Download models upon HF space initialization
|
| 13 |
-
pipeline_options = PdfPipelineOptions()
|
| 14 |
-
if torch.cuda.is_available():
|
| 15 |
-
print("Enabling CUDA Accelerator")
|
| 16 |
-
pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
|
| 17 |
-
pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
| 18 |
if os.getenv("IS_HF_SPACE"):
|
| 19 |
print("Downloading models...")
|
| 20 |
model_downloader.download_models()
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def parse_document(
|
| 24 |
file_path: str,
|
|
|
|
| 25 |
do_code_enrichment: bool,
|
| 26 |
do_formula_enrichment: bool,
|
| 27 |
-
do_picture_classification: bool,
|
| 28 |
-
do_picture_description: bool,
|
| 29 |
) -> Tuple[DoclingDocument, str]:
|
| 30 |
yield None, f"Parsing document... ⏳"
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
pipeline_options.do_picture_classification = do_picture_classification
|
| 37 |
-
|
| 38 |
-
pipeline_options.do_picture_description = do_picture_description
|
| 39 |
-
pipeline_options.picture_description_options = smolvlm_picture_description
|
| 40 |
-
pipeline_options.picture_description_options.prompt = "Describe the image in three sentences. Be concise and accurate."
|
| 41 |
-
pipeline_options.images_scale = 2.0
|
| 42 |
-
pipeline_options.generate_picture_images = True
|
| 43 |
|
| 44 |
-
print(f"Pipeline options defined: \n\t{
|
| 45 |
converter = DocumentConverter(
|
| 46 |
format_options={
|
| 47 |
-
InputFormat.PDF: PdfFormatOption(pipeline_options=
|
| 48 |
}
|
| 49 |
)
|
| 50 |
|
|
@@ -53,20 +52,31 @@ def parse_document(
|
|
| 53 |
yield result.document, "Done ✅"
|
| 54 |
|
| 55 |
|
| 56 |
-
def to_html(docling_doc: DoclingDocument) -> str:
|
| 57 |
-
return docling_doc.export_to_html()
|
|
|
|
| 58 |
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
def to_markdown(docling_doc: DoclingDocument) -> str:
|
| 61 |
-
return docling_doc.export_to_markdown()
|
| 62 |
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
def to_json(docling_doc: DoclingDocument) -> Dict:
|
| 65 |
-
return docling_doc.export_to_dict()
|
| 66 |
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
def upload_file(file) -> str:
|
|
@@ -80,10 +90,6 @@ def setup_gradio_demo():
|
|
| 80 |
|
| 81 |
Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
|
| 82 |
|
| 83 |
-
Model used for picture classification: [EfficientNet-B0 Document Image Classifier](https://huggingface.co/ds4sd/DocumentFigureClassifier)
|
| 84 |
-
|
| 85 |
-
Model used for picture description: [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
|
| 86 |
-
|
| 87 |
To explore the full set of features of Docling visit: https://github.com/docling-project/docling
|
| 88 |
"""
|
| 89 |
)
|
|
@@ -110,22 +116,18 @@ def setup_gradio_demo():
|
|
| 110 |
)
|
| 111 |
|
| 112 |
with gr.Column():
|
| 113 |
-
gr.Markdown("### 2) Configure engine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
code_understanding = gr.Checkbox(
|
| 115 |
value=False, label="Enable Code understanding"
|
| 116 |
)
|
| 117 |
formula_enrichment = gr.Checkbox(
|
| 118 |
value=False, label="Enable Formula understanding"
|
| 119 |
)
|
| 120 |
-
picture_classification = gr.Checkbox(
|
| 121 |
-
value=False, label="Enable Picture classification"
|
| 122 |
-
)
|
| 123 |
-
picture_description = gr.Checkbox(
|
| 124 |
-
value=False, label="Enable Picture description"
|
| 125 |
-
)
|
| 126 |
-
gr.Markdown(
|
| 127 |
-
"_**Warning:** Enabling any of these features can potentially increase the processing time._"
|
| 128 |
-
)
|
| 129 |
|
| 130 |
parse_button = gr.Button("Parse document")
|
| 131 |
status = gr.Markdown()
|
|
@@ -136,40 +138,74 @@ def setup_gradio_demo():
|
|
| 136 |
markdown_button = gr.Button("Convert to markdown")
|
| 137 |
json_button = gr.Button("Convert to JSON")
|
| 138 |
text_button = gr.Button("Convert to text")
|
|
|
|
| 139 |
|
| 140 |
doc = gr.State()
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
parse_button.click(
|
| 144 |
fn=parse_document,
|
| 145 |
inputs=[
|
| 146 |
file_output,
|
|
|
|
| 147 |
code_understanding,
|
| 148 |
formula_enrichment,
|
| 149 |
-
picture_classification,
|
| 150 |
-
picture_description,
|
| 151 |
],
|
| 152 |
outputs=[doc, status],
|
| 153 |
)
|
| 154 |
html_button.click(
|
| 155 |
fn=to_html,
|
| 156 |
inputs=doc,
|
| 157 |
-
outputs=output,
|
| 158 |
)
|
| 159 |
markdown_button.click(
|
| 160 |
fn=to_markdown,
|
| 161 |
inputs=doc,
|
| 162 |
-
outputs=output,
|
| 163 |
)
|
| 164 |
json_button.click(
|
| 165 |
fn=to_json,
|
| 166 |
inputs=doc,
|
| 167 |
-
outputs=output,
|
| 168 |
)
|
| 169 |
text_button.click(
|
| 170 |
fn=to_text,
|
| 171 |
inputs=doc,
|
| 172 |
-
outputs=output,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
)
|
| 174 |
|
| 175 |
demo.launch()
|
|
|
|
| 1 |
+
import json
|
| 2 |
from typing import Dict, Tuple
|
| 3 |
import os
|
| 4 |
import gradio as gr
|
|
|
|
| 5 |
from docling.datamodel.base_models import InputFormat
|
| 6 |
+
from docling.datamodel.pipeline_options import (
|
| 7 |
+
PdfPipelineOptions,
|
| 8 |
+
EasyOcrOptions,
|
| 9 |
+
TesseractOcrOptions,
|
| 10 |
+
RapidOcrOptions,
|
| 11 |
+
OcrMacOptions,
|
| 12 |
+
)
|
| 13 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 14 |
from docling_core.types import DoclingDocument
|
| 15 |
from docling.utils import model_downloader
|
|
|
|
| 16 |
|
| 17 |
# Download models upon HF space initialization
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
if os.getenv("IS_HF_SPACE"):
|
| 19 |
print("Downloading models...")
|
| 20 |
model_downloader.download_models()
|
| 21 |
|
| 22 |
+
# OCR back-ends offered to the user. Each entry pairs the label listed in the
# "Select OCR engine" dropdown with the docling options class it stands for.
# The order here is the order the choices are listed in.
_OCR_ENGINE_CHOICES = (
    ("EasyOCR (Default)", EasyOcrOptions),
    ("Tesseract", TesseractOcrOptions),
    ("RapidOCR", RapidOcrOptions),
    ("OcrMac (Mac only)", OcrMacOptions),
)

# Label -> default-constructed options instance; parse_document looks the
# selected label up here to fill in the PDF pipeline's ocr_options.
engines_available = {
    label: options_cls() for label, options_cls in _OCR_ENGINE_CHOICES
}
|
| 28 |
+
|
| 29 |
|
| 30 |
def parse_document(
|
| 31 |
file_path: str,
|
| 32 |
+
engine: str,
|
| 33 |
do_code_enrichment: bool,
|
| 34 |
do_formula_enrichment: bool,
|
|
|
|
|
|
|
| 35 |
) -> Tuple[DoclingDocument, str]:
|
| 36 |
yield None, f"Parsing document... ⏳"
|
| 37 |
|
| 38 |
+
pdf_pipeline_options = PdfPipelineOptions()
|
| 39 |
+
pdf_pipeline_options.ocr_options = engines_available[engine]
|
| 40 |
+
pdf_pipeline_options.do_code_enrichment = do_code_enrichment
|
| 41 |
+
pdf_pipeline_options.do_formula_enrichment = do_formula_enrichment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
print(f"PDF Pipeline options defined: \n\t{pdf_pipeline_options}")
|
| 44 |
converter = DocumentConverter(
|
| 45 |
format_options={
|
| 46 |
+
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options)
|
| 47 |
}
|
| 48 |
)
|
| 49 |
|
|
|
|
| 52 |
yield result.document, "Done ✅"
|
| 53 |
|
| 54 |
|
| 55 |
+
def to_html(docling_doc: DoclingDocument) -> Tuple[str, str]:
    """Export the parsed document as HTML.

    Returns:
        A ``(content, extension)`` pair: the result of
        ``docling_doc.export_to_html()`` and the literal ``"html"``, which the
        UI stores as the file extension for a later download.
    """
    html_content = docling_doc.export_to_html()
    return html_content, "html"
|
| 57 |
+
|
| 58 |
|
| 59 |
+
def to_markdown(docling_doc: DoclingDocument) -> Tuple[str, str]:
    """Export the parsed document as Markdown.

    Returns:
        A ``(content, extension)`` pair: the result of
        ``docling_doc.export_to_markdown()`` and the literal ``"md"``, which
        the UI stores as the file extension for a later download.
    """
    markdown_content = docling_doc.export_to_markdown()
    return markdown_content, "md"
|
| 61 |
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
def to_json(docling_doc: DoclingDocument) -> Tuple[Dict, str]:
    """Export the parsed document as a JSON-style dictionary.

    Returns:
        A ``(content, extension)`` pair: the result of
        ``docling_doc.export_to_dict()`` and the literal ``"json"``, which the
        UI stores as the file extension for a later download.
    """
    document_dict = docling_doc.export_to_dict()
    return document_dict, "json"
|
| 65 |
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
def to_text(docling_doc: DoclingDocument) -> Tuple[str, str]:
    """Export the parsed document as plain text.

    Returns:
        A ``(content, extension)`` pair: the result of
        ``docling_doc.export_to_text()`` and the literal ``"txt"``, which the
        UI stores as the file extension for a later download.
    """
    text_content = docling_doc.export_to_text()
    return text_content, "txt"
|
| 69 |
|
| 70 |
+
|
| 71 |
+
def download_file(doc: str | Dict, file_extension: str):
|
| 72 |
+
final_filename = f"doc.{file_extension}"
|
| 73 |
+
if file_extension == "json":
|
| 74 |
+
with open(final_filename, "w") as json_file:
|
| 75 |
+
json.dump(doc, json_file, indent=4)
|
| 76 |
+
else:
|
| 77 |
+
with open(final_filename, "w") as file:
|
| 78 |
+
file.write(doc)
|
| 79 |
+
return [final_filename, "Downloaded ✅"]
|
| 80 |
|
| 81 |
|
| 82 |
def upload_file(file) -> str:
|
|
|
|
| 90 |
|
| 91 |
Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
To explore the full set of features of Docling visit: https://github.com/docling-project/docling
|
| 94 |
"""
|
| 95 |
)
|
|
|
|
| 116 |
)
|
| 117 |
|
| 118 |
with gr.Column():
|
| 119 |
+
gr.Markdown("### 2) Configure engine (Only applicable for PDF files)")
|
| 120 |
+
|
| 121 |
+
ocr_engine = gr.Dropdown(
|
| 122 |
+
choices=list(engines_available.keys()), label="Select OCR engine"
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
code_understanding = gr.Checkbox(
|
| 126 |
value=False, label="Enable Code understanding"
|
| 127 |
)
|
| 128 |
formula_enrichment = gr.Checkbox(
|
| 129 |
value=False, label="Enable Formula understanding"
|
| 130 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
parse_button = gr.Button("Parse document")
|
| 133 |
status = gr.Markdown()
|
|
|
|
| 138 |
markdown_button = gr.Button("Convert to markdown")
|
| 139 |
json_button = gr.Button("Convert to JSON")
|
| 140 |
text_button = gr.Button("Convert to text")
|
| 141 |
+
file_extension = gr.Text(visible=False)
|
| 142 |
|
| 143 |
doc = gr.State()
|
| 144 |
+
with gr.Column():
|
| 145 |
+
with gr.Group():
|
| 146 |
+
output = gr.Textbox(
|
| 147 |
+
label="Output",
|
| 148 |
+
lines=10,
|
| 149 |
+
interactive=False,
|
| 150 |
+
elem_id="output-textbox",
|
| 151 |
+
)
|
| 152 |
+
gr.HTML(
|
| 153 |
+
"""
|
| 154 |
+
<div style="display: flex; flex-direction: column; align-items: center;">
|
| 155 |
+
<button id="copy-button" onclick="const text = document.getElementById('output-textbox').querySelector('textarea').value; navigator.clipboard.writeText(text); const copiedMsg = document.getElementById('copied-msg'); copiedMsg.style.display = 'inline'; setTimeout(() => copiedMsg.style.display = 'none', 1500);" style="margin-top: 10px;">
|
| 156 |
+
📋 Copy output to clipboard
|
| 157 |
+
</button>
|
| 158 |
+
<span id="copied-msg" style="margin-left: 10px; color: green; display: none;">Copied!</span>
|
| 159 |
+
</div>
|
| 160 |
+
"""
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
download_button = gr.Button("Download to file")
|
| 164 |
+
# See https://github.com/gradio-app/gradio/issues/9230#issuecomment-2323771634 for why this hidden button is needed
|
| 165 |
+
download_button_hidden = gr.DownloadButton(
|
| 166 |
+
visible=False, elem_id="download_btn_hidden"
|
| 167 |
+
)
|
| 168 |
+
download_status = gr.Markdown()
|
| 169 |
|
| 170 |
parse_button.click(
|
| 171 |
fn=parse_document,
|
| 172 |
inputs=[
|
| 173 |
file_output,
|
| 174 |
+
ocr_engine,
|
| 175 |
code_understanding,
|
| 176 |
formula_enrichment,
|
|
|
|
|
|
|
| 177 |
],
|
| 178 |
outputs=[doc, status],
|
| 179 |
)
|
| 180 |
html_button.click(
|
| 181 |
fn=to_html,
|
| 182 |
inputs=doc,
|
| 183 |
+
outputs=[output, file_extension],
|
| 184 |
)
|
| 185 |
markdown_button.click(
|
| 186 |
fn=to_markdown,
|
| 187 |
inputs=doc,
|
| 188 |
+
outputs=[output, file_extension],
|
| 189 |
)
|
| 190 |
json_button.click(
|
| 191 |
fn=to_json,
|
| 192 |
inputs=doc,
|
| 193 |
+
outputs=[output, file_extension],
|
| 194 |
)
|
| 195 |
text_button.click(
|
| 196 |
fn=to_text,
|
| 197 |
inputs=doc,
|
| 198 |
+
outputs=[output, file_extension],
|
| 199 |
+
)
|
| 200 |
+
download_button.click(
|
| 201 |
+
fn=download_file,
|
| 202 |
+
inputs=[output, file_extension],
|
| 203 |
+
outputs=[download_button_hidden, download_status],
|
| 204 |
+
).then(
|
| 205 |
+
fn=None,
|
| 206 |
+
inputs=None,
|
| 207 |
+
outputs=None,
|
| 208 |
+
js="() => document.querySelector('#download_btn_hidden').click()",
|
| 209 |
)
|
| 210 |
|
| 211 |
demo.launch()
|