Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,513 Bytes
8e3d376 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import json
import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor
# Module-level extractor instance used by the extraction handler; it is
# configured to allow only image and PDF inputs.
extractor = DocumentExtractor(
    allowed_formats=[
        InputFormat.IMAGE,
        InputFormat.PDF,
    ],
)
def process_extraction(file_input, url_input, template_json):
    """Run template-driven extraction on an uploaded file or a URL.

    An uploaded file takes precedence over the URL when both are given.
    Problems are never raised to the caller: every failure is reported as a
    JSON object with a single ``"error"`` key so the result can always be
    displayed.

    Args:
        file_input: Uploaded file (PDF or image), or ``None`` when nothing was
            uploaded. Either a plain path string or an object exposing the
            path as ``.name``.
        url_input: URL to a document; only used when no file is supplied.
            ``None``, empty and whitespace-only values count as "not given".
        template_json: JSON string defining the extraction template.

    Returns:
        JSON string (2-space indent). On success it holds a ``"pages"`` list
        with ``page_no``, ``extracted_data``, ``raw_text`` and ``errors`` for
        each page of the result; on failure it holds ``{"error": "<message>"}``.
    """
    try:
        # Determine the source: the uploaded file wins over the URL.
        if file_input is not None:
            if isinstance(file_input, str):
                # The upload may arrive as a bare path string; normalise
                # str subclasses to a plain str.
                source = str(file_input)
            else:
                # Wrapper object that carries the path in `.name`.
                source = file_input.name
        elif url_input and url_input.strip():
            source = url_input.strip()
        else:
            return json.dumps(
                {"error": "Please provide either a file or a URL"}, indent=2
            )

        # Parse the template JSON. TypeError covers a missing (None) or
        # non-text template, which is a template problem rather than an
        # extraction failure.
        try:
            template = json.loads(template_json)
        except (json.JSONDecodeError, TypeError) as e:
            return json.dumps({"error": f"Invalid JSON template: {e}"}, indent=2)

        # Perform extraction
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output: one entry per page, with a falsy `errors`
        # value normalised to an empty list.
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors or [],
                }
                for page in result.pages
            ]
        }
        return json.dumps(output, indent=2)
    except Exception as e:
        # UI boundary: surface any failure (download, extraction,
        # serialisation) as an error payload instead of crashing the handler.
        return json.dumps({"error": f"Extraction failed: {e}"}, indent=2)
# Example template pre-filled in the template editor: field name -> type name.
default_template = json.dumps(
    dict(bill_no="string", total="float", date="string"),
    indent=2,
)
# Sample document shared by both entries of the Examples table.
_EXAMPLE_URL = (
    "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg"
)

# Create Gradio interface: inputs (file / URL / template) in the left column,
# the JSON result in the right column, and clickable examples below.
with gr.Blocks(title="Docling Structured Extraction") as demo:
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo

        Extract structured data from documents (PDF/Images) using AI-powered extraction.

        **Note:** This feature is currently in beta.

        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format
        3. Click "Extract" to get structured data
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section. Each row is [file, url, template]; the file slot is
    # None because both examples load the document from the URL.
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                _EXAMPLE_URL,
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                _EXAMPLE_URL,
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function: the button passes the three inputs to
    # process_extraction and shows its JSON string in the result panel.
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|