File size: 4,436 Bytes
3703ba9
 
d13b416
916651f
b3c5160
3703ba9
d23abdf
3703ba9
 
d23abdf
3703ba9
95de91e
 
d13b416
95de91e
d13b416
95de91e
d13b416
 
95de91e
d23abdf
 
 
 
 
02d7a36
d23abdf
95de91e
d23abdf
 
 
 
 
02d7a36
d23abdf
513056d
d23abdf
47c7738
d23abdf
 
 
 
02d7a36
d23abdf
513056d
d23abdf
 
 
 
47c7738
02d7a36
d23abdf
513056d
d23abdf
b3c5160
 
 
02d7a36
b3c5160
95de91e
 
d13b416
95de91e
 
3703ba9
d23abdf
3703ba9
d13b416
3703ba9
 
 
d13b416
 
 
 
 
513056d
 
 
d13b416
 
 
 
 
 
3703ba9
 
d13b416
95de91e
3703ba9
95de91e
d13b416
95de91e
 
d13b416
95de91e
 
 
513056d
95de91e
 
 
3703ba9
d13b416
d23abdf
47c7738
d13b416
95de91e
02d7a36
3703ba9
d23abdf
95de91e
a8560ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
from docling.document_converter import DocumentConverter
import json
import os
import tempfile

# 1) Initialize DocumentConverter
# Built once at import time with no arguments (library defaults); convert()
# reuses this single module-level instance instead of creating one per call.
converter = DocumentConverter()

# 2) Conversion function
# File suffix of the download produced for each supported output format.
_SUFFIX_BY_FORMAT = {
    "Markdown": ".md",
    "HTML": ".html",
    "JSON": ".json",
    "Text": ".txt",
    "Doctags": ".doctags",
}


def convert(file, out_format):
    """Convert an uploaded document and return the path of the exported file.

    Args:
        file: The upload received from the file input: either an object that
            exposes its path as ``.name`` or a plain path string. ``None``
            means nothing was uploaded.
        out_format: One of "Markdown", "HTML", "JSON", "Text" or "Doctags".

    Returns:
        Path of a temporary file containing the export. The file is created
        with ``delete=False`` so it still exists after this function returns.

    Raises:
        gr.Error: If no file was uploaded, the format is not supported, or
            the conversion/export fails. The output component is a file
            widget, so a returned message string would be interpreted as a
            file path; raising lets Gradio show the message to the user.
    """
    if file is None:
        raise gr.Error("Please upload a file first.")

    # Validate the format before running the (expensive) conversion.
    suffix = _SUFFIX_BY_FORMAT.get(out_format)
    if suffix is None:
        raise gr.Error("Unsupported output format")

    try:
        # Accept both file-like uploads (path in ``.name``) and plain paths.
        source_path = getattr(file, "name", file)
        document = converter.convert(source_path).document

        # Reserve a unique path and close the handle right away: re-opening a
        # NamedTemporaryFile that is still open fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_filename = temp_file.name

        if out_format == "Doctags":
            # Docling writes the doctags file itself.
            document.save_as_doctags(temp_filename)
            return temp_filename

        # Explicit UTF-8 so non-ASCII document text never depends on the
        # platform's locale encoding.
        with open(temp_filename, "w", encoding="utf-8") as f:
            if out_format == "Markdown":
                f.write(document.export_to_markdown())
            elif out_format == "HTML":
                f.write(document.export_to_html())
            elif out_format == "JSON":
                # Lossless serialization of the Docling document.
                json.dump(document.export_to_dict(), f, indent=4)
            else:  # "Text": plain text without Markdown markers
                f.write(document.export_to_text())
        return temp_filename

    except Exception as e:
        # Top-level UI boundary: surface any failure as a visible error.
        raise gr.Error(f"Error converting document: {e}") from e

# 3) Build Gradio interface
with gr.Blocks() as demo:
    # Top: explanatory text about Docling and a link to the official repository
    gr.Markdown(
        """
        # 📝 Docling Demo
        **Docling** is an open-source toolkit for converting various document formats into structured, machine-readable formats. It supports a wide variety of formats including PDF, DOCX, images, and more.

        You can choose to convert your document into:
        - Markdown
        - HTML
        - JSON (Lossless serialization of Docling Document)
        - Text (Plain text, i.e., without Markdown markers)
        - Doctags (A markup format capturing the document's content and layout)

        Docling is designed to simplify document processing, making it easier to extract content and structure from documents.

        👉 [Visit the official Docling GitHub repository here](https://github.com/DS4SD/docling)

        Upload any supported file, choose your output format, and see the result instantly.
        """
    )

    # Main row: uploader + options + convert button
    with gr.Row():
        # Single-file upload restricted to the listed extensions.
        inp = gr.File(
            label="📂 Upload your document",
            file_count="single",
            file_types=[
                ".pdf", ".docx", ".pptx", ".xlsx",
                ".html", ".png", ".jpg", ".jpeg", ".tiff",
                ".wav", ".mp3",
            ],
        )
        # Choices must match the format names handled by convert().
        fmt = gr.Dropdown(
            choices=["Markdown", "HTML", "JSON", "Text", "Doctags"],
            label="🔄 Select output format",
            value="Markdown",
        )
        btn = gr.Button("Convert")

    # Output panel (file output for all formats)
    out = gr.File(label="📄 Download Output File")

    # Wire it up: clicking the button runs convert(inp, fmt) and sends the
    # returned file path to the download widget.
    btn.click(fn=convert, inputs=[inp, fmt], outputs=out)

# 4) Launch
# Only start the app when this file is executed as a script, so importing the
# module (e.g. to reuse `demo`) does not launch anything.
if __name__ == "__main__":
    demo.launch()