Spaces:
Sleeping
Sleeping
File size: 4,436 Bytes
3703ba9 d13b416 916651f b3c5160 3703ba9 d23abdf 3703ba9 d23abdf 3703ba9 95de91e d13b416 95de91e d13b416 95de91e d13b416 95de91e d23abdf 02d7a36 d23abdf 95de91e d23abdf 02d7a36 d23abdf 513056d d23abdf 47c7738 d23abdf 02d7a36 d23abdf 513056d d23abdf 47c7738 02d7a36 d23abdf 513056d d23abdf b3c5160 02d7a36 b3c5160 95de91e d13b416 95de91e 3703ba9 d23abdf 3703ba9 d13b416 3703ba9 d13b416 513056d d13b416 3703ba9 d13b416 95de91e 3703ba9 95de91e d13b416 95de91e d13b416 95de91e 513056d 95de91e 3703ba9 d13b416 d23abdf 47c7738 d13b416 95de91e 02d7a36 3703ba9 d23abdf 95de91e a8560ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import gradio as gr
from docling.document_converter import DocumentConverter
import json
import os
import tempfile
# 1) Initialize DocumentConverter
# A single module-level converter is created at import time; convert() below
# calls this shared instance instead of building a new one per request.
converter = DocumentConverter()
# 2) Conversion function
# Text-based output formats, mapped to (temp-file suffix, serializer).
# Each serializer takes the converted Docling document and returns a string.
_TEXT_EXPORTERS = {
    "Markdown": (".md", lambda document: document.export_to_markdown()),
    "HTML": (".html", lambda document: document.export_to_html()),
    # Dictionary export of the document, serialized as indented JSON.
    "JSON": (
        ".json",
        lambda document: json.dumps(document.export_to_dict(), indent=4),
    ),
    "Text": (".txt", lambda document: document.export_to_text()),
}


def _write_temp_text(text, suffix):
    """Write *text* to a new UTF-8 temporary file and return its path."""
    # delete=False keeps the file on disk after closing so it can be served
    # as a download; UTF-8 is explicit so non-ASCII document text never
    # depends on the platform's default encoding.
    with tempfile.NamedTemporaryFile(
        "w", suffix=suffix, delete=False, encoding="utf-8"
    ) as handle:
        handle.write(text)
        return handle.name


def _save_temp_doctags(document):
    """Save *document* as DocTags in a new temporary file and return its path."""
    # Only reserve the name here; the handle is closed before the document
    # writes to that path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".doctags") as handle:
        path = handle.name
    try:
        document.save_as_doctags(path)
    except Exception:
        # Do not leave an empty placeholder behind when the export fails.
        os.remove(path)
        raise
    return path


def convert(file, out_format):
    """Convert an uploaded document and return the path of the exported file.

    Args:
        file: The uploaded file: either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute holds the
            path. ``None`` means nothing was uploaded.
        out_format: One of "Markdown", "HTML", "JSON", "Text" or "Doctags".

    Returns:
        The path of a temporary file containing the export. The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If no file was uploaded, the format is not supported, or
            the conversion/export fails. The function feeds a file output,
            where a returned message string would be treated as a file path,
            so problems are raised instead of returned.
    """
    if file is None:
        raise gr.Error("Please upload a file first.")
    # Validate the format up front so an invalid choice does not pay for a
    # full document conversion first.
    if out_format != "Doctags" and out_format not in _TEXT_EXPORTERS:
        raise gr.Error("Unsupported output format")

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    if isinstance(file, (str, os.PathLike)):
        source_path = file
    else:
        source_path = file.name

    try:
        document = converter.convert(source_path).document
        if out_format == "Doctags":
            return _save_temp_doctags(document)
        suffix, export = _TEXT_EXPORTERS[out_format]
        return _write_temp_text(export(document), suffix)
    except Exception as exc:
        # Top-level UI boundary: report any conversion failure to the user
        # while keeping the original exception chained for the server log.
        raise gr.Error(f"Error converting document: {exc}") from exc
# 3) Build Gradio interface
with gr.Blocks() as demo:
    # Top: explanatory text about Docling and a link to the official repository.
    # Blank lines separate the heading, paragraphs and list so each renders as
    # its own Markdown block.
    gr.Markdown(
        """
        # Docling Demo

        **Docling** is an open-source toolkit for converting various document formats into structured, machine-readable formats. It supports a wide variety of formats including PDF, DOCX, images, and more.

        You can choose to convert your document into:

        - Markdown
        - HTML
        - JSON (Lossless serialization of Docling Document)
        - Text (Plain text, i.e., without Markdown markers)
        - Doctags (A tag-based markup that preserves the document's content and layout)

        Docling is designed to simplify document processing, making it easier to extract content and structure from documents.

        [Visit the official Docling GitHub repository here](https://github.com/DS4SD/docling)

        Upload any supported file, choose your output format, and see the result instantly.
        """
    )

    # Main row: uploader + options + convert button
    with gr.Row():
        inp = gr.File(
            label="Upload your document",
            file_count="single",
            file_types=[
                ".pdf", ".docx", ".pptx", ".xlsx",
                ".html", ".png", ".jpg", ".jpeg", ".tiff",
                ".wav", ".mp3",
            ],
        )
        # The choices must match the format names that convert() recognizes.
        fmt = gr.Dropdown(
            choices=["Markdown", "HTML", "JSON", "Text", "Doctags"],
            label="Select output format",
            value="Markdown",
        )
        btn = gr.Button("Convert")

    # Output panel (file output for all formats)
    out = gr.File(label="Download Output File")

    # Wire it up: clicking the button runs convert(inp, fmt) and shows the
    # resulting file in the output panel.
    btn.click(fn=convert, inputs=[inp, fmt], outputs=out)
# 4) Launch
# Start the Gradio server only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()