File size: 4,513 Bytes
8e3d376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import json

import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor

# Module-level extractor instance reused by every call to process_extraction;
# allowed_formats restricts it to image and PDF inputs.
extractor = DocumentExtractor(
    allowed_formats=[
        InputFormat.IMAGE,
        InputFormat.PDF,
    ],
)


def _error_json(message):
    """Serialize *message* as the ``{"error": ...}`` JSON payload shown in the UI."""
    return json.dumps({"error": message}, indent=2)


def process_extraction(file_input, url_input, template_json):
    """
    Process document extraction with the provided template.

    An uploaded file takes precedence over the URL when both are supplied.
    This function never raises: every failure is reported as a JSON object
    with a single "error" key so it can be shown directly in the output box.

    Args:
        file_input: Uploaded file (PDF or image). Either an object exposing
            the local path as ``.name`` or a plain path string; ``None``
            when nothing was uploaded.
        url_input: URL to a document; used only when no file is given and
            the value is not blank.
        template_json: JSON string defining the extraction template.

    Returns:
        JSON string (2-space indent). On success ``{"pages": [...]}`` with
        one entry per result page holding ``page_no``, ``extracted_data``,
        ``raw_text`` and ``errors``; on failure ``{"error": "<message>"}``.
    """
    try:
        # Determine the source
        if file_input is not None:
            # Uploads may arrive as a wrapper exposing the path via ``.name``
            # or as a bare path string; accept both.
            source = getattr(file_input, "name", file_input)
        elif url_input and url_input.strip():
            source = url_input.strip()
        else:
            return _error_json("Please provide either a file or a URL")

        # Reject a missing or blank template up front so the user gets a
        # template-specific message instead of a generic extraction failure.
        if not template_json or (
            isinstance(template_json, str) and not template_json.strip()
        ):
            return _error_json("Invalid JSON template: template is empty")

        # Parse the template JSON (JSONDecodeError is a ValueError subclass;
        # TypeError covers non-string input).
        try:
            template = json.loads(template_json)
        except (TypeError, ValueError) as e:
            return _error_json(f"Invalid JSON template: {e}")

        # Perform extraction
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output: one plain dict per page
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors or [],
                }
                for page in result.pages
            ]
        }

        return json.dumps(output, indent=2)

    except Exception as e:
        # UI boundary: surface any failure as JSON rather than crashing the
        # request handler.
        return _error_json(f"Extraction failed: {e}")


# Starter template pre-filled in the template editor; maps each field name
# to the name of the type expected for its value.
default_template = json.dumps(
    dict(bill_no="string", total="float", date="string"),
    indent=2,
)

# Sample document used by every entry in the examples list below
_EXAMPLE_DOC_URL = (
    "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg"
)

# Create Gradio interface
with gr.Blocks(title="Docling Structured Extraction") as demo:
    # Page header and usage instructions
    gr.Markdown(
        """
    # 📄 Docling Structured Extraction Demo

    Extract structured data from documents (PDF/Images) using AI-powered extraction.

    **Note:** This feature is currently in beta.

    ### How to use:
    1. Upload a file OR provide a URL to a document
    2. Define your extraction template in JSON format
    3. Click "Extract" to get structured data
    """
    )

    with gr.Row():
        # First column: document source, template editor and trigger button
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        # Second column: JSON result returned by process_extraction
        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section: each row is [file, url, template], matching `inputs`
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                _EXAMPLE_DOC_URL,
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                _EXAMPLE_DOC_URL,
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function to the button
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )

# Launch the app only when this file is executed as a script; importing the
# module builds `demo` without starting it.
if __name__ == "__main__":
    demo.launch()