"""Gradio demo: convert an uploaded document with Docling and offer the result as a download."""

import json
import os
import tempfile

import gradio as gr
from docling.document_converter import DocumentConverter

# 1) Initialize DocumentConverter
# Created once at import time and reused by every call to convert().
converter = DocumentConverter()

# Formats that are rendered to a string and then written to disk:
# dropdown label -> (file suffix, function turning the Docling document into text).
_TEXT_EXPORTERS = {
    "Markdown": (".md", lambda document: document.export_to_markdown()),
    "HTML": (".html", lambda document: document.export_to_html()),
    # Lossless serialization of the Docling document.
    "JSON": (".json", lambda document: json.dumps(document.export_to_dict(), indent=4)),
    # Plain text, i.e. without Markdown markers.
    "Text": (".txt", lambda document: document.export_to_text()),
}

# Doctags is written by the document itself (save_as_doctags), not via a string.
_DOCTAGS_FORMAT = "Doctags"
_DOCTAGS_SUFFIX = ".doctags"

# Order shown in the dropdown.
OUTPUT_FORMATS = ["Markdown", "HTML", "JSON", "Text", _DOCTAGS_FORMAT]

INTRO_MARKDOWN = """
# 📝 Docling Demo

**Docling** is an open-source toolkit for converting various document formats into structured, machine-readable formats.
It supports a wide variety of formats including PDF, DOCX, images, and more. You can choose to convert your document into:

- Markdown
- HTML
- JSON (Lossless serialization of Docling Document)
- Text (Plain text, i.e., without Markdown markers)
- Doctags (A JSON-like structure preserving the document's original format)

Docling is designed to simplify document processing, making it easier to extract content and structure from documents.

👉 [Visit the official Docling GitHub repository here](https://github.com/DS4SD/docling)

Upload any supported file, choose your output format, and see the result instantly.
"""


def _write_temp_text(text, suffix):
    """Write *text* as UTF-8 to a new temporary file and return its path.

    The file is created with ``delete=False`` so it still exists when Gradio
    serves it for download; nothing in this module removes it afterwards.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=suffix, delete=False, encoding="utf-8"
    ) as handle:
        handle.write(text)
        return handle.name


def _reserve_temp_path(suffix):
    """Create an empty temporary file with *suffix* and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
        return handle.name


# 2) Conversion function
def convert(file, out_format):
    """Convert an uploaded document and return the path of the exported file.

    Args:
        file: The value supplied by the ``gr.File`` input. Either an object
            exposing the upload's path as ``.name`` or the path itself;
            ``None`` when nothing was uploaded.
        out_format: One of the labels in ``OUTPUT_FORMATS``.

    Returns:
        Path of a temporary file holding the converted document, suitable as
        the value of a ``gr.File`` output.

    Raises:
        gr.Error: If no file was uploaded, the format is not supported, or
            the conversion/export fails. Raising (rather than returning the
            message) keeps the text from being interpreted as a file path by
            the ``gr.File`` output component.
    """
    if file is None:
        raise gr.Error("Please upload a file first.")

    # Validate before the try block so this error is not re-wrapped below.
    if out_format not in _TEXT_EXPORTERS and out_format != _DOCTAGS_FORMAT:
        raise gr.Error("Unsupported output format")

    # Accept both a file-like value with ``.name`` and a plain path string.
    source_path = getattr(file, "name", file)

    try:
        result = converter.convert(source_path)
        document = result.document

        if out_format == _DOCTAGS_FORMAT:
            target_path = _reserve_temp_path(_DOCTAGS_SUFFIX)
            document.save_as_doctags(target_path)
            return target_path

        suffix, export = _TEXT_EXPORTERS[out_format]
        return _write_temp_text(export(document), suffix)
    except Exception as exc:
        # UI boundary: surface any conversion/export failure to the user.
        raise gr.Error(f"Error converting document: {exc}") from exc


# 3) Build Gradio interface
with gr.Blocks() as demo:
    # Top: explanatory text about Docling and a link to the official repository
    gr.Markdown(INTRO_MARKDOWN)

    # Main row: uploader + options + convert button
    with gr.Row():
        inp = gr.File(
            label="📂 Upload your document",
            file_count="single",
            file_types=[
                ".pdf", ".docx", ".pptx", ".xlsx", ".html",
                ".png", ".jpg", ".jpeg", ".tiff", ".wav", ".mp3",
            ],
        )
        fmt = gr.Dropdown(
            choices=OUTPUT_FORMATS,
            label="🔄 Select output format",
            value="Markdown",
        )
        btn = gr.Button("Convert")

    # Output panel (file output for all formats)
    out = gr.File(label="📄 Download Output File")

    # Wire it up
    btn.click(fn=convert, inputs=[inp, fmt], outputs=out)

# 4) Launch
if __name__ == "__main__":
    demo.launch()