Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from docling.document_converter import DocumentConverter | |
| import json | |
| import os | |
| import tempfile | |
| # 1) Initialize DocumentConverter once at module import; convert() below reuses this single instance for every request | |
| converter = DocumentConverter() | |
| # 2) Conversion function | |
def convert(file, out_format):
    """Convert an uploaded document with Docling and return the exported file's path.

    Args:
        file: Value delivered by the ``gr.File`` input: either a plain path
            string or an object exposing the path as ``.name``. ``None`` when
            nothing has been uploaded.
        out_format: One of "Markdown", "HTML", "JSON", "Text" or "Doctags".

    Returns:
        Path of a temporary file containing the export. The file is created
        with ``delete=False`` and is not removed by this function, so it is
        still present when Gradio serves it for download.

    Raises:
        gr.Error: If no file was uploaded, the format is not supported, or
            the conversion/export fails. The output component is a
            ``gr.File``, which interprets any returned string as a file path,
            so failures have to be raised rather than returned as text.
    """
    # Temporary-file suffix for every supported output format.
    suffixes = {
        "Markdown": ".md",
        "HTML": ".html",
        "JSON": ".json",
        "Text": ".txt",
        "Doctags": ".doctags",
    }

    if file is None:
        raise gr.Error("Please upload a file first.")
    # Reject unknown formats before paying for the document conversion.
    if out_format not in suffixes:
        raise gr.Error("Unsupported output format")

    # Accept both shapes the upload may arrive in: a path string, or a
    # temp-file wrapper whose ``.name`` holds the path.
    source_path = file if isinstance(file, str) else file.name

    output_path = None
    try:
        document = converter.convert(source_path).document

        # Reserve a uniquely named file, close it, then write to it by path.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=suffixes[out_format]
        ) as placeholder:
            output_path = placeholder.name

        if out_format == "Doctags":
            # Docling writes this format itself, straight to the given path.
            document.save_as_doctags(output_path)
        else:
            # Explicit UTF-8 so non-ASCII document text does not depend on
            # the platform's default locale encoding.
            with open(output_path, "w", encoding="utf-8") as out_file:
                if out_format == "Markdown":
                    out_file.write(document.export_to_markdown())
                elif out_format == "HTML":
                    out_file.write(document.export_to_html())
                elif out_format == "Text":
                    out_file.write(document.export_to_text())
                else:  # "JSON": lossless serialization of the Docling document
                    json.dump(document.export_to_dict(), out_file, indent=4)
    except Exception as exc:
        # Best-effort cleanup so a failed export does not leave a stray
        # temporary file behind; a cleanup failure must not mask the real error.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise gr.Error(f"Error converting document: {exc}") from exc

    return output_path
| # 3) Build Gradio interface | |
# Static UI configuration, kept apart from the layout code below.
# NOTE(review): the "π" glyphs in the texts below look like mis-decoded emoji;
# they are reproduced as found -- confirm against the intended originals.
_INTRO_TEXT = """
# π Docling Demo
**Docling** is an open-source toolkit for converting various document formats into structured, machine-readable formats. It supports a wide variety of formats including PDF, DOCX, images, and more.
You can choose to convert your document into:
- Markdown
- HTML
- JSON (Lossless serialization of Docling Document)
- Text (Plain text, i.e., without Markdown markers)
- Doctags (A JSON-like structure preserving the document's original format)
Docling is designed to simplify document processing, making it easier to extract content and structure from documents.
π [Visit the official Docling GitHub repository here](https://github.com/DS4SD/docling)
Upload any supported file, choose your output format, and see the result instantly.
"""

# File extensions the upload widget accepts.
_ACCEPTED_EXTENSIONS = [
    ".pdf", ".docx", ".pptx", ".xlsx",
    ".html", ".png", ".jpg", ".jpeg", ".tiff",
    ".wav", ".mp3",
]

# Output formats offered in the dropdown; the first entry is the default.
_OUTPUT_FORMATS = ["Markdown", "HTML", "JSON", "Text", "Doctags"]

with gr.Blocks() as demo:
    # Header: what Docling is, plus a link to the upstream repository.
    gr.Markdown(_INTRO_TEXT)

    # Controls side by side: upload widget, format picker, trigger button.
    with gr.Row():
        inp = gr.File(
            label="π Upload your document",
            file_count="single",
            file_types=_ACCEPTED_EXTENSIONS,
        )
        fmt = gr.Dropdown(
            choices=_OUTPUT_FORMATS,
            label="π Select output format",
            value=_OUTPUT_FORMATS[0],
        )
        btn = gr.Button("Convert")

    # Every format is delivered as a downloadable file.
    out = gr.File(label="π Download Output File")

    # Clicking the button runs convert(upload, format) and shows its result.
    btn.click(
        fn=convert,
        inputs=[inp, fmt],
        outputs=out,
    )
| # 4) Launch the Gradio app, but only when this file is executed as a script (not when it is imported) | |
| if __name__ == "__main__": | |
| demo.launch() | |