Spaces:
Running
on
Zero
Running
on
Zero
| import json | |
| import gradio as gr | |
| import spaces # Hugging Face Spaces Zero GPU support | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.document_extractor import DocumentExtractor | |
# Extractor factory; meant to be invoked from inside the GPU-decorated handler
def get_extractor():
    """Build a fresh DocumentExtractor restricted to image and PDF inputs.

    A new extractor is constructed on every call; nothing is cached here.
    """
    accepted_formats = [InputFormat.IMAGE, InputFormat.PDF]
    return DocumentExtractor(allowed_formats=accepted_formats)
# Allocate GPU for up to 60 seconds
@spaces.GPU(duration=60)
def process_extraction(file_input, url_input, template_json):
    """
    Run structured extraction on a document and return the result as JSON text.

    The uploaded file takes precedence over the URL when both are supplied.
    Input validation happens before the extractor is created, so malformed
    requests return immediately without building the extractor.

    Args:
        file_input: Uploaded file (PDF or image). May be a path string or an
            object exposing the path as ``.name``; ``None`` when nothing was
            uploaded.
        url_input: URL to a document; ignored when ``file_input`` is given.
        template_json: JSON string defining the extraction template.

    Returns:
        A JSON string. On success it has a ``"pages"`` list with one entry per
        page (``page_no``, ``extracted_data``, ``raw_text``, ``errors``). On
        failure it is an object with a single ``"error"`` message. This
        function never raises; every failure is reported in the payload.
    """
    # Determine the source
    if file_input is not None:
        # Accept both a plain path string and a tempfile-like object with `.name`.
        source = getattr(file_input, "name", file_input)
    elif url_input and url_input.strip():
        source = url_input.strip()
    else:
        return json.dumps(
            {"error": "Please provide either a file or a URL"}, indent=2
        )

    # Parse the template JSON (TypeError covers a missing/None template value)
    try:
        template = json.loads(template_json)
    except (json.JSONDecodeError, TypeError) as e:
        return json.dumps({"error": f"Invalid JSON template: {str(e)}"}, indent=2)

    # UI boundary: any failure while extracting or serializing is reported to
    # the user as JSON instead of propagating into the Gradio event handler.
    try:
        # Initialize extractor in GPU context
        extractor = get_extractor()

        # Perform extraction
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors if page.errors else [],
                }
                for page in result.pages
            ]
        }
        return json.dumps(output, indent=2)
    except Exception as e:
        return json.dumps({"error": f"Extraction failed: {str(e)}"}, indent=2)
# Starter template pre-filled in the editor: field name -> expected type name
default_template = json.dumps(
    dict(bill_no="string", total="float", date="string"),
    indent=2,
)
# Create Gradio interface: inputs and template editor on the left, JSON result on the right
with gr.Blocks(title="Docling Structured Extraction") as demo:
    # Blank lines separate the Markdown paragraphs so the closing "Powered by"
    # line is not rendered as a continuation of the last list item.
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo

        Extract structured data from documents (PDF/Images) using AI-powered extraction.

        **Note:** This feature is currently in beta.

        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format
        3. Click "Extract" to get structured data

        🚀 **Powered by Hugging Face Spaces Zero GPU**
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            gr.Markdown(
                """
                Define the structure of data you want to extract. Use JSON format with field names and types:

                - `"string"` for text fields
                - `"float"` for numbers with decimals
                - `"int"` for whole numbers
                """
            )
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )
            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section: each row fills (file, URL, template) in that order
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                        "postal_code": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()