"""Gradio demo for Docling structured extraction on Hugging Face Spaces (Zero GPU)."""

import json
import logging

import gradio as gr
import spaces  # Hugging Face Spaces Zero GPU support
from docling.datamodel.base_models import InputFormat
from docling.document_extractor import DocumentExtractor

logger = logging.getLogger(__name__)


def get_extractor():
    """Build a ``DocumentExtractor`` restricted to image and PDF inputs.

    Intended to be called from inside the GPU-decorated function so that the
    extractor is created while the GPU allocation is active.
    """
    return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])


def _error_json(message):
    """Return *message* wrapped in the ``{"error": ...}`` JSON shape shown in the UI."""
    return json.dumps({"error": message}, indent=2)


def _resolve_source(file_input, url_input):
    """Return the document source (file path or URL), or ``None`` if neither is given.

    An uploaded file takes precedence over the URL.
    """
    if file_input is not None:
        # The upload may arrive as a path string or as an object exposing the
        # path through ``.name``; accept both instead of assuming ``.name``.
        if isinstance(file_input, str):
            return file_input
        return file_input.name
    if url_input and url_input.strip():
        return url_input.strip()
    return None


@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
def process_extraction(file_input, url_input, template_json):
    """Extract structured data from a document using the given template.

    Uses the Hugging Face Spaces Zero GPU feature.

    Args:
        file_input: Uploaded file (PDF or image), or ``None``.
        url_input: URL to a document; used only when no file is uploaded.
        template_json: JSON string defining the extraction template.

    Returns:
        A JSON string. On success it has the shape ``{"pages": [...]}`` with
        one entry per page (``page_no``, ``extracted_data``, ``raw_text``,
        ``errors``). On failure it has the shape ``{"error": "<message>"}``.
        This function does not raise; every failure is reported in the
        returned JSON.
    """
    # Validate the cheap inputs first so an invalid request never pays for
    # extractor initialisation.
    source = _resolve_source(file_input, url_input)
    if source is None:
        return _error_json("Please provide either a file or a URL")

    try:
        template = json.loads(template_json)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a missing (None) template, which is also a template
        # problem rather than an extraction failure.
        return _error_json(f"Invalid JSON template: {e}")

    try:
        # Initialize extractor in GPU context
        extractor = get_extractor()
        result = extractor.extract(
            source=source,
            template=template,
        )

        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors if page.errors else [],
                }
                for page in result.pages
            ]
        }
        return json.dumps(output, indent=2)
    except Exception as e:
        # UI boundary: report the failure to the user, but keep the traceback
        # in the logs so it is not lost.
        logger.exception("Extraction failed")
        return _error_json(f"Extraction failed: {e}")


# Default template example
default_template = json.dumps(
    {"bill_no": "string", "total": "float", "date": "string"}, indent=2
)

# Create Gradio interface
with gr.Blocks(title="Docling Structured Extraction") as demo:
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo

        Extract structured data from documents (PDF/Images) using AI-powered extraction.

        **Note:** This feature is currently in beta.

        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format
        3. Click "Extract" to get structured data

        🚀 **Powered by Hugging Face Spaces Zero GPU**
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            gr.Markdown(
                """
                Define the structure of data you want to extract. Use JSON format with field names and types:
                - `"string"` for text fields
                - `"float"` for numbers with decimals
                - `"int"` for whole numbers
                """
            )
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                        "postal_code": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()