# docling / app.py
# Author: aimal-khan
# Last commit: "Update app.py" (a8560ed, verified)
import gradio as gr
from docling.document_converter import DocumentConverter
import json
import os
import tempfile
# 1) Initialize DocumentConverter
# A single module-level instance is created at import time and reused by
# every call to convert() below, rather than being rebuilt per request.
converter = DocumentConverter()
# 2) Conversion function

# Text-based output formats: dropdown label -> (file suffix, renderer).
# Each renderer takes the converted Docling document and returns the text
# that should be written to the download file.
_TEXT_EXPORTERS = {
    "Markdown": (".md", lambda document: document.export_to_markdown()),
    "HTML": (".html", lambda document: document.export_to_html()),
    # JSON is the dict export of the document, pretty-printed.
    "JSON": (
        ".json",
        lambda document: json.dumps(document.export_to_dict(), indent=4),
    ),
    "Text": (".txt", lambda document: document.export_to_text()),
}

# Doctags is handled separately because the document writes the file itself.
_DOCTAGS_FORMAT = "Doctags"


def _new_temp_path(suffix):
    """Create an empty, persistent temporary file and return its path."""
    # delete=False: the file must outlive this function so it can be served
    # as a download afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        return tmp.name


def convert(file, out_format):
    """Convert an uploaded document and return the path of the exported file.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` means nothing was
            uploaded.
        out_format: One of "Markdown", "HTML", "JSON", "Text" or "Doctags".

    Returns:
        On success, the path of a temporary file holding the export.
        Otherwise a human-readable message string: "Please upload a file
        first.", "Unsupported output format", or
        "Error converting document: <details>".
    """
    # NOTE(review): the click handler's only output is a gr.File, so the
    # message strings returned here are presumably treated as file paths by
    # Gradio instead of being shown to the user. Raising gr.Error would
    # surface them, but the return contract is left unchanged here.
    if file is None:
        return "Please upload a file first."

    temp_filename = None
    try:
        # Reject unknown formats before paying for the conversion.
        is_doctags = out_format == _DOCTAGS_FORMAT
        if not is_doctags and out_format not in _TEXT_EXPORTERS:
            return "Unsupported output format"

        # Accept both file-like objects (path in .name) and plain paths.
        source = getattr(file, "name", file)
        document = converter.convert(source).document

        if is_doctags:
            temp_filename = _new_temp_path(".doctags")
            document.save_as_doctags(temp_filename)
        else:
            suffix, render = _TEXT_EXPORTERS[out_format]
            # Render first so a failing export never leaves a temp file.
            text = render(document)
            temp_filename = _new_temp_path(suffix)
            # Explicit UTF-8: document text may contain non-ASCII characters
            # that the platform default encoding cannot represent.
            with open(temp_filename, "w", encoding="utf-8") as f:
                f.write(text)
        return temp_filename  # Return file for download
    except Exception as e:
        # Best-effort cleanup of a partially written output file.
        if temp_filename is not None:
            try:
                os.remove(temp_filename)
            except OSError:
                pass
        return f"Error converting document: {str(e)}"
# 3) Build Gradio interface
with gr.Blocks() as demo:
    # Top: explanatory text about Docling and a link to the official repository.
    # The Markdown body is kept at column 0 so no line is read as an indented
    # code block, and paragraphs are separated by blank lines so the text
    # after the bullet list is not absorbed into the last list item.
    # NOTE(review): the title emoji was reconstructed from a garbled byte
    # sequence; the memo emoji is assumed — confirm against the original.
    gr.Markdown(
        """
# 📝 Docling Demo

**Docling** is an open-source toolkit for converting various document formats into structured, machine-readable formats. It supports a wide variety of formats including PDF, DOCX, images, and more.

You can choose to convert your document into:

- Markdown
- HTML
- JSON (Lossless serialization of Docling Document)
- Text (Plain text, i.e., without Markdown markers)
- Doctags (A JSON-like structure preserving the document's original format)

Docling is designed to simplify document processing, making it easier to extract content and structure from documents.

👉 [Visit the official Docling GitHub repository here](https://github.com/DS4SD/docling)

Upload any supported file, choose your output format, and see the result instantly.
"""
    )

    # Main row: uploader + options + convert button
    with gr.Row():
        inp = gr.File(
            label="📂 Upload your document",
            file_count="single",
            file_types=[
                ".pdf", ".docx", ".pptx", ".xlsx",
                ".html", ".png", ".jpg", ".jpeg", ".tiff",
                ".wav", ".mp3",
            ],
        )
        fmt = gr.Dropdown(
            choices=["Markdown", "HTML", "JSON", "Text", "Doctags"],
            label="🔄 Select output format",
            value="Markdown",
        )
        btn = gr.Button("Convert")

    # Output panel (file output for all formats)
    out = gr.File(label="📄 Download Output File")

    # Wire it up: the button passes the uploaded file and the chosen format
    # to convert() and shows its return value in the download component.
    btn.click(fn=convert, inputs=[inp, fmt], outputs=out)
# 4) Launch
if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script,
    # not when it is imported as a module.
    demo.launch()