Spaces:

flozi00
/

structured-docling

Running on Zero

App Files Files Community

flozi00 committed on Dec 24, 2025

Commit

eaafbab

1 Parent(s): e8d8985

Refactor app structure: update main app file reference and consolidate extraction logic into app.py

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +59 -6
app_hf_spaces.py +0 -166

README.md CHANGED Viewed

@@ -5,7 +5,7 @@ colorFrom: blue
 colorTo: indigo
 sdk: gradio
 sdk_version: 5.49.1
-app_file: app_hf_spaces.py
 pinned: false
 license: gpl-3.0
 ---

 colorTo: indigo
 sdk: gradio
 sdk_version: 5.49.1
+app_file: app.py
 pinned: false
 license: gpl-3.0
 ---

app.py CHANGED Viewed

@@ -1,16 +1,47 @@
 import json
 import gradio as gr
 from docling.datamodel.base_models import InputFormat
 from docling.document_extractor import DocumentExtractor
-# Initialize the extractor
-extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
 def process_extraction(file_input, url_input, template_json):
     """
     Process document extraction with the provided template.
     Args:
         file_input: Uploaded file (PDF or image)
@@ -32,6 +63,18 @@ def process_extraction(file_input, url_input, template_json):
                 {"error": "Please provide either a file or a URL"}, indent=2
             )
         # Parse the template JSON
         try:
             template = json.loads(template_json)
@@ -79,8 +122,10 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
     ### How to use:
     1. Upload a file OR provide a URL to a document
-    2. Define your extraction template in JSON format
-    3. Click "Extract" to get structured data
     """
     )
@@ -88,8 +133,7 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
         with gr.Column():
             gr.Markdown("### Input Source")
             file_input = gr.File(
-                label="Upload File (PDF or Image)",
-                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
             )
             url_input = gr.Textbox(
                 label="Or Enter Document URL",
@@ -98,6 +142,14 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
             )
             gr.Markdown("### Extraction Template")
             template_input = gr.Code(
                 label="JSON Template", value=default_template, language="json", lines=15
             )
@@ -126,6 +178,7 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
                         "total": "float",
                         "sender_name": "string",
                         "receiver_name": "string",
                     },
                     indent=2,
                 ),

 import json
 import gradio as gr
+import spaces  # Hugging Face Spaces Zero GPU support
 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    granite_picture_description,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.document_extractor import DocumentExtractor
+# Initialize the extractor (will be moved to GPU when decorated function is called)
+def get_extractor():
+    """Initialize extractor - called within GPU context"""
+    return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
+def get_converter_with_vision():
+    """Initialize converter with vision - called within GPU context"""
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_picture_description = True
+    pipeline_options.picture_description_options = granite_picture_description
+    pipeline_options.picture_description_options.prompt = (
+        "Describe the image in as much detail as possible."
+    )
+    pipeline_options.images_scale = 2.0
+    pipeline_options.generate_picture_images = True
+    return DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
 def process_extraction(file_input, url_input, template_json):
     """
     Process document extraction with the provided template.
+    Uses Hugging Face Spaces Zero GPU feature.
     Args:
         file_input: Uploaded file (PDF or image)
                 {"error": "Please provide either a file or a URL"}, indent=2
             )
+        # If no template is provided, use the converter with vision
+        if not template_json or not template_json.strip():
+            converter = get_converter_with_vision()
+            try:
+                result = converter.convert(source)
+                return json.dumps(result.document.export_to_dict(), indent=2)
+            except Exception as e:
+                return json.dumps({"error": f"Conversion failed: {str(e)}"}, indent=2)
+        # Initialize extractor in GPU context
+        extractor = get_extractor()
         # Parse the template JSON
         try:
             template = json.loads(template_json)
     ### How to use:
     1. Upload a file OR provide a URL to a document
+    2. Define your extraction template in JSON format (or leave empty for full document conversion with picture descriptions)
+    3. Click "Extract" to get structured data or full document JSON
+    🚀 **Powered by Hugging Face Spaces Zero GPU**
     """
     )
         with gr.Column():
             gr.Markdown("### Input Source")
             file_input = gr.File(
+                label="Upload File (PDF or Image)"
             )
             url_input = gr.Textbox(
                 label="Or Enter Document URL",
             )
             gr.Markdown("### Extraction Template")
+            gr.Markdown(
+                """
+            Define the structure of data you want to extract. Use JSON format with field names and types:
+            - `"string"` for text fields
+            - `"float"` for numbers with decimals
+            - `"int"` for whole numbers
+            """
+            )
             template_input = gr.Code(
                 label="JSON Template", value=default_template, language="json", lines=15
             )
                         "total": "float",
                         "sender_name": "string",
                         "receiver_name": "string",
+                        "postal_code": "string",
                     },
                     indent=2,
                 ),

app_hf_spaces.py DELETED Viewed

@@ -1,166 +0,0 @@
-import json
-import gradio as gr
-import spaces  # Hugging Face Spaces Zero GPU support
-from docling.datamodel.base_models import InputFormat
-from docling.document_extractor import DocumentExtractor
-# Initialize the extractor (will be moved to GPU when decorated function is called)
-def get_extractor():
-    """Initialize extractor - called within GPU context"""
-    return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
-@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
-def process_extraction(file_input, url_input, template_json):
-    """
-    Process document extraction with the provided template.
-    Uses Hugging Face Spaces Zero GPU feature.
-    Args:
-        file_input: Uploaded file (PDF or image)
-        url_input: URL to a document
-        template_json: JSON string defining the extraction template
-    Returns:
-        JSON string with extracted data
-    """
-    try:
-        # Initialize extractor in GPU context
-        extractor = get_extractor()
-        # Determine the source
-        source = None
-        if file_input is not None:
-            source = file_input.name
-        elif url_input and url_input.strip():
-            source = url_input.strip()
-        else:
-            return json.dumps(
-                {"error": "Please provide either a file or a URL"}, indent=2
-            )
-        # Parse the template JSON
-        try:
-            template = json.loads(template_json)
-        except json.JSONDecodeError as e:
-            return json.dumps({"error": f"Invalid JSON template: {str(e)}"}, indent=2)
-        # Perform extraction
-        result = extractor.extract(
-            source=source,
-            template=template,
-        )
-        # Format the output
-        output = {"pages": []}
-        for page in result.pages:
-            page_data = {
-                "page_no": page.page_no,
-                "extracted_data": page.extracted_data,
-                "raw_text": page.raw_text,
-                "errors": page.errors if page.errors else [],
-            }
-            output["pages"].append(page_data)
-        return json.dumps(output, indent=2)
-    except Exception as e:
-        return json.dumps({"error": f"Extraction failed: {str(e)}"}, indent=2)
-# Default template example
-default_template = json.dumps(
-    {"bill_no": "string", "total": "float", "date": "string"}, indent=2
-)
-# Create Gradio interface
-with gr.Blocks(title="Docling Structured Extraction") as demo:
-    gr.Markdown(
-        """
-    # 📄 Docling Structured Extraction Demo
-    Extract structured data from documents (PDF/Images) using AI-powered extraction.
-    **Note:** This feature is currently in beta.
-    ### How to use:
-    1. Upload a file OR provide a URL to a document
-    2. Define your extraction template in JSON format
-    3. Click "Extract" to get structured data
-    🚀 **Powered by Hugging Face Spaces Zero GPU**
-    """
-    )
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### Input Source")
-            file_input = gr.File(
-                label="Upload File (PDF or Image)"
-            )
-            url_input = gr.Textbox(
-                label="Or Enter Document URL",
-                placeholder="https://example.com/document.pdf",
-                lines=1,
-            )
-            gr.Markdown("### Extraction Template")
-            gr.Markdown(
-                """
-            Define the structure of data you want to extract. Use JSON format with field names and types:
-            - `"string"` for text fields
-            - `"float"` for numbers with decimals
-            - `"int"` for whole numbers
-            """
-            )
-            template_input = gr.Code(
-                label="JSON Template", value=default_template, language="json", lines=15
-            )
-            extract_btn = gr.Button("Extract", variant="primary", size="lg")
-        with gr.Column():
-            gr.Markdown("### Extracted Data")
-            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)
-    # Examples section
-    gr.Markdown("### Examples")
-    gr.Examples(
-        examples=[
-            [
-                None,
-                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
-                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
-            ],
-            [
-                None,
-                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
-                json.dumps(
-                    {
-                        "bill_no": "string",
-                        "total": "float",
-                        "sender_name": "string",
-                        "receiver_name": "string",
-                        "postal_code": "string",
-                    },
-                    indent=2,
-                ),
-            ],
-        ],
-        inputs=[file_input, url_input, template_input],
-        label="Try these examples",
-    )
-    # Connect the extraction function
-    extract_btn.click(
-        fn=process_extraction,
-        inputs=[file_input, url_input, template_input],
-        outputs=output_json,
-    )
-# Launch the app
-if __name__ == "__main__":
-    demo.launch()