# structured-docling / app_hf_spaces.py
# flozi00's picture
# ttf
# 1ce038d
import json
import gradio as gr
import spaces # Hugging Face Spaces Zero GPU support
from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor
# Extractor factory: nothing is built at import time; the GPU-decorated
# handler calls this so construction happens inside its GPU allocation.
def get_extractor():
    """Build a DocumentExtractor restricted to image and PDF inputs."""
    supported_formats = [InputFormat.IMAGE, InputFormat.PDF]
    return DocumentExtractor(allowed_formats=supported_formats)
@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
def process_extraction(file_input, url_input, template_json):
    """
    Extract structured data from a document according to a JSON template.

    Runs under the Hugging Face Spaces Zero GPU decorator. Every outcome,
    including failures, is reported as a JSON string so it can be shown
    directly in the result widget.

    Args:
        file_input: Uploaded file (PDF or image), or None. May be a plain
            path string or an object exposing its path as ``.name``.
        url_input: URL to a document; only used when no file was uploaded.
        template_json: JSON string defining the extraction template.

    Returns:
        JSON string: ``{"pages": [...]}`` with one entry per page on
        success, or ``{"error": "..."}`` describing what went wrong.
    """
    try:
        # Validate the cheap inputs first so that bad requests fail fast,
        # before the extractor is constructed.
        url = (url_input or "").strip()
        if file_input is not None:
            # Accept both a bare path string and a file wrapper with `.name`.
            source = getattr(file_input, "name", file_input)
        elif url:
            source = url
        else:
            return json.dumps(
                {"error": "Please provide either a file or a URL"}, indent=2
            )

        # Parse the template JSON. TypeError covers a missing (None) template,
        # which json.loads rejects before any decoding is attempted.
        try:
            template = json.loads(template_json)
        except (TypeError, json.JSONDecodeError) as e:
            return json.dumps({"error": f"Invalid JSON template: {str(e)}"}, indent=2)

        # Initialize extractor in GPU context and perform the extraction.
        extractor = get_extractor()
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output: one record per page.
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors or [],
                }
                for page in result.pages
            ]
        }
        # ensure_ascii=False keeps non-ASCII document text (e.g. umlauts)
        # readable in the result instead of \uXXXX escapes.
        return json.dumps(output, indent=2, ensure_ascii=False)
    except Exception as e:
        # UI boundary: report any failure as JSON rather than crashing the app.
        return json.dumps({"error": f"Extraction failed: {str(e)}"}, indent=2)
# Starter template pre-filled in the editor: field name -> expected type.
default_template = json.dumps(
    dict(bill_no="string", total="float", date="string"),
    indent=2,
)
# Create Gradio interface: inputs and template editor on the left, JSON result
# on the right, example presets below, and the Extract button wired to
# process_extraction.
with gr.Blocks(title="Docling Structured Extraction") as demo:
    # The Markdown text is indented to match the code; gr.Markdown strips the
    # common leading indentation before rendering.
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo
        Extract structured data from documents (PDF/Images) using AI-powered extraction.
        **Note:** This feature is currently in beta.
        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format
        3. Click "Extract" to get structured data

        🚀 **Powered by Hugging Face Spaces Zero GPU**
        """
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )
            gr.Markdown("### Extraction Template")
            gr.Markdown(
                """
                Define the structure of data you want to extract. Use JSON format with field names and types:
                - `"string"` for text fields
                - `"float"` for numbers with decimals
                - `"int"` for whole numbers
                """
            )
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )
            extract_btn = gr.Button("Extract", variant="primary", size="lg")
        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section: both presets use the same sample image and differ only
    # in how many fields the template asks for.
    example_url = (
        "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg"
    )
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                example_url,
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                example_url,
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                        "postal_code": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )
# Start the Gradio server only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    demo.launch()