Spaces:

flozi00
/

structured-docling

Running on Zero

File size: 4,513 Bytes

8e3d376

import json

import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor

# Module-level extractor, created once at import time and limited to
# image and PDF inputs.
extractor = DocumentExtractor(
    allowed_formats=[
        InputFormat.IMAGE,
        InputFormat.PDF,
    ],
)


def _error_json(message):
    """Serialize an error message as the ``{"error": ...}`` JSON payload."""
    return json.dumps({"error": message}, indent=2)


def process_extraction(file_input, url_input, template_json):
    """
    Process document extraction with the provided template.

    The uploaded file takes precedence over the URL when both are given.
    This function never raises: every failure is reported as a JSON string
    of the form ``{"error": "<message>"}``.

    Args:
        file_input: Uploaded file (PDF or image). Either an object exposing
            its path through a ``name`` attribute or a plain path string;
            ``None`` when nothing was uploaded.
        url_input: URL to a document; used only when no file was uploaded.
        template_json: JSON string defining the extraction template

    Returns:
        JSON string with extracted data: a ``"pages"`` list whose entries
        hold ``page_no``, ``extracted_data``, ``raw_text`` and ``errors``,
        or an ``{"error": ...}`` object on failure.
    """
    try:
        # Determine the source
        if file_input is not None:
            # The upload may arrive as a wrapper object with a ``name``
            # attribute or as a bare path string; accept both instead of
            # failing with AttributeError on the string form.
            source = getattr(file_input, "name", file_input)
        elif url_input and url_input.strip():
            source = url_input.strip()
        else:
            return _error_json("Please provide either a file or a URL")

        # Parse the template JSON. TypeError covers a missing (None) or
        # non-string template so it is reported as a template problem
        # rather than as a generic extraction failure.
        try:
            template = json.loads(template_json)
        except (TypeError, json.JSONDecodeError) as e:
            return _error_json(f"Invalid JSON template: {e}")

        # Perform extraction
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output, one entry per extracted page
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    # Normalize a missing/empty error collection to a list
                    "errors": page.errors or [],
                }
                for page in result.pages
            ]
        }

        return json.dumps(output, indent=2)

    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing
        # the request handler.
        return _error_json(f"Extraction failed: {e}")


# Example template used as the initial content of the JSON template editor
default_template = json.dumps(
    {
        "bill_no": "string",
        "total": "float",
        "date": "string",
    },
    indent=2,
)

# Build the Gradio UI: inputs and template on the left, JSON result on the right
with gr.Blocks(title="Docling Structured Extraction") as demo:
    gr.Markdown(
        """
    # 📄 Docling Structured Extraction Demo
    
    Extract structured data from documents (PDF/Images) using AI-powered extraction.
    
    **Note:** This feature is currently in beta.
    
    ### How to use:
    1. Upload a file OR provide a URL to a document
    2. Define your extraction template in JSON format
    3. Click "Extract" to get structured data
    """
    )

    with gr.Row():
        # Left column: document source, template editor and the trigger button
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
                label="Upload File (PDF or Image)",
            )
            url_input = gr.Textbox(
                lines=1,
                placeholder="https://example.com/document.pdf",
                label="Or Enter Document URL",
            )

            gr.Markdown("### Extraction Template")
            template_input = gr.Code(
                value=default_template,
                language="json",
                lines=15,
                label="JSON Template",
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        # Right column: extraction result rendered as JSON
        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(
                language="json",
                lines=25,
                label="Result (JSON)",
            )

    # Examples section: both samples use the same image with different templates
    sample_url = (
        "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg"
    )
    basic_fields = {"bill_no": "string", "total": "float"}
    detailed_fields = {
        **basic_fields,
        "sender_name": "string",
        "receiver_name": "string",
    }

    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            # No uploaded file in the examples; the URL column supplies the source
            [None, sample_url, json.dumps(basic_fields, indent=2)],
            [None, sample_url, json.dumps(detailed_fields, indent=2)],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Wire the button to the extraction handler
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )

# Launch the app only when this file is executed as a script, not when it
# is imported as a module
if __name__ == "__main__":
    demo.launch()