flozi00 committed on
Commit
8e3d376
·
1 Parent(s): 62cc451
Files changed (5) hide show
  1. .gitignore +46 -0
  2. README.md +97 -5
  3. app.py +148 -0
  4. app_hf_spaces.py +169 -0
  5. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ ENV/
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+ *.swp
32
+ *.swo
33
+ *~
34
+
35
+ # Gradio
36
+ gradio_cached_examples/
37
+ flagged/
38
+
39
+ # OS
40
+ .DS_Store
41
+ Thumbs.db
42
+
43
+ # Temporary files
44
+ *.tmp
45
+ temp/
46
+ tmp/
README.md CHANGED
@@ -1,13 +1,105 @@
1
  ---
2
  title: Structured Docling
3
- emoji: 💻
4
- colorFrom: red
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app.py
9
  pinned: false
10
  license: gpl-3.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Structured Docling
3
+ emoji: 📄
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ app_file: app_hf_spaces.py
9
  pinned: false
10
  license: gpl-3.0
11
  ---
12
 
13
+ # Docling Structured Extraction Demo
14
+
15
+ A Gradio-based demo application for extracting structured data from documents using Docling's beta structured extraction feature.
16
+
17
+ ## Features
18
+
19
+ - 📄 Support for PDF and image files (PNG, JPG, JPEG, TIFF, BMP)
20
+ - 🌐 URL input for remote documents
21
+ - 🎯 Customizable JSON templates for extraction
22
+ - 🚀 Optimized for Hugging Face Spaces with Zero GPU support
23
+ - 📊 Clean JSON output with extracted data
24
+
25
+ ## Files
26
+
27
+ - `app.py` - Standard Gradio application
28
+ - `app_hf_spaces.py` - Version optimized for Hugging Face Spaces with Zero GPU decorator
29
+ - `requirements.txt` - Python dependencies
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install -r requirements.txt
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ ### Local Development
40
+
41
+ Run the standard version:
42
+ ```bash
43
+ python app.py
44
+ ```
45
+
46
+ ### Hugging Face Spaces
47
+
48
+ The `app_hf_spaces.py` file is specifically designed for deployment on Hugging Face Spaces with Zero GPU support.
49
+
50
+ To deploy:
51
+ 1. Create a new Space on Hugging Face
52
+ 2. Upload `app_hf_spaces.py` (rename to `app.py`)
53
+ 3. Upload `requirements.txt`
54
+ 4. Enable Zero GPU in Space settings
55
+
56
+ ## How to Use the Demo
57
+
58
+ 1. **Input Source**: Either upload a document file or provide a URL to a document
59
+ 2. **Define Template**: Create a JSON template specifying the fields you want to extract
60
+ - Use `"string"` for text fields
61
+ - Use `"float"` for decimal numbers
62
+ - Use `"int"` for whole numbers
63
+ 3. **Extract**: Click the "Extract" button to process the document
64
+ 4. **View Results**: The extracted data will appear in JSON format in the output box
65
+
66
+ ## Template Examples
67
+
68
+ ### Simple Invoice Extraction
69
+ ```json
70
+ {
71
+ "bill_no": "string",
72
+ "total": "float",
73
+ "date": "string"
74
+ }
75
+ ```
76
+
77
+ ### Detailed Invoice Extraction
78
+ ```json
79
+ {
80
+ "bill_no": "string",
81
+ "total": "float",
82
+ "sender_name": "string",
83
+ "receiver_name": "string",
84
+ "postal_code": "string",
85
+ "city": "string"
86
+ }
87
+ ```
88
+
89
+ ## Notes
90
+
91
+ - The structured extraction API is currently in **beta** and may change
92
+ - Only PDF and image formats are supported
93
+ - The extraction uses Vision Language Models (VLM) for understanding document content
94
+ - Processing time depends on document complexity and size
95
+
96
+ ## Requirements
97
+
98
+ - Python 3.9+
99
+ - gradio >= 4.0.0
100
+ - docling[vlm] >= 2.0.0
101
+ - spaces >= 0.19.0 (for Hugging Face Spaces deployment)
102
+
103
+ ## License
104
+
105
+ This demo is provided as-is for demonstration purposes.
app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import gradio as gr
4
+ from docling.datamodel.base_models import InputFormat
5
+ from docling.document_extractor import DocumentExtractor
6
+
7
+ # Initialize the extractor
8
+ extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
9
+
10
+
11
def process_extraction(file_input, url_input, template_json):
    """
    Process document extraction with the provided template.

    Args:
        file_input: Uploaded file (PDF or image). Either an object exposing the
            local path as ``.name`` or a plain path string; ``None`` when no
            file was uploaded.
        url_input: URL to a document. Only used when ``file_input`` is ``None``.
        template_json: JSON string defining the extraction template.

    Returns:
        JSON string with one entry per page under ``"pages"``, or a JSON object
        with a single ``"error"`` key when the input is invalid or the
        extraction fails. This function never raises.
    """
    try:
        # Determine the source; an uploaded file takes precedence over the URL.
        if file_input is not None:
            # Accept both file-like values (path in ``.name``) and plain paths.
            source = getattr(file_input, "name", file_input)
        elif url_input and url_input.strip():
            source = url_input.strip()
        else:
            return json.dumps(
                {"error": "Please provide either a file or a URL"}, indent=2
            )

        # Parse the template JSON. ``json.loads`` raises TypeError (not
        # JSONDecodeError) for a missing/None template, so both are reported
        # as a template problem instead of a generic extraction failure.
        try:
            template = json.loads(template_json)
        except (TypeError, json.JSONDecodeError) as e:
            return json.dumps({"error": f"Invalid JSON template: {e}"}, indent=2)

        # Perform extraction with the module-level extractor.
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output: one dictionary per extracted page.
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors or [],
                }
                for page in result.pages
            ]
        }
        return json.dumps(output, indent=2)

    except Exception as e:
        # UI boundary: report any failure as JSON instead of raising.
        return json.dumps({"error": f"Extraction failed: {e}"}, indent=2)
63
+
64
+
65
# Default template shown in the editor when the page loads
default_template = json.dumps(
    {"bill_no": "string", "total": "float", "date": "string"}, indent=2
)

# Create Gradio interface
with gr.Blocks(title="Docling Structured Extraction") as demo:
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo

        Extract structured data from documents (PDF/Images) using AI-powered extraction.

        **Note:** This feature is currently in beta.

        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format
        3. Click "Extract" to get structured data
        """
    )

    with gr.Row():
        # Left column: document source, template editor and trigger button
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        # Right column: JSON result returned by process_extraction
        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section: each row is [file, url, template], matching `inputs`
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )
144
+
145
# Launch the app exactly once, and only when this file is run as a script
# (the original called demo.launch() twice).
if __name__ == "__main__":
    demo.launch()
app_hf_spaces.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import gradio as gr
4
+ import spaces # Hugging Face Spaces Zero GPU support
5
+ from docling.datamodel.base_models import InputFormat
6
+ from docling.document_extractor import DocumentExtractor
7
+
8
+
9
# Extractor factory: the extractor is built on demand instead of at import time.
def get_extractor():
    """Build a new DocumentExtractor that accepts image and PDF inputs.

    A fresh instance is created on every call; ``process_extraction`` calls
    this from inside its ``@spaces.GPU``-decorated body.
    """
    supported_formats = [InputFormat.IMAGE, InputFormat.PDF]
    return DocumentExtractor(allowed_formats=supported_formats)
13
+
14
+
15
@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
def process_extraction(file_input, url_input, template_json):
    """
    Process document extraction with the provided template.
    Uses Hugging Face Spaces Zero GPU feature.

    Args:
        file_input: Uploaded file (PDF or image). Either an object exposing the
            local path as ``.name`` or a plain path string; ``None`` when no
            file was uploaded.
        url_input: URL to a document. Only used when ``file_input`` is ``None``.
        template_json: JSON string defining the extraction template.

    Returns:
        JSON string with one entry per page under ``"pages"``, or a JSON object
        with a single ``"error"`` key when the input is invalid or the
        extraction fails. This function never raises.
    """
    try:
        # Determine the source; an uploaded file takes precedence over the URL.
        if file_input is not None:
            # Accept both file-like values (path in ``.name``) and plain paths.
            source = getattr(file_input, "name", file_input)
        elif url_input and url_input.strip():
            source = url_input.strip()
        else:
            return json.dumps(
                {"error": "Please provide either a file or a URL"}, indent=2
            )

        # Parse the template JSON. ``json.loads`` raises TypeError (not
        # JSONDecodeError) for a missing/None template, so both are reported
        # as a template problem instead of a generic extraction failure.
        try:
            template = json.loads(template_json)
        except (TypeError, json.JSONDecodeError) as e:
            return json.dumps({"error": f"Invalid JSON template: {e}"}, indent=2)

        # Build the extractor only after the inputs have been validated, so a
        # rejected request does not pay for constructing it.
        extractor = get_extractor()

        # Perform extraction
        result = extractor.extract(
            source=source,
            template=template,
        )

        # Format the output: one dictionary per extracted page.
        output = {
            "pages": [
                {
                    "page_no": page.page_no,
                    "extracted_data": page.extracted_data,
                    "raw_text": page.raw_text,
                    "errors": page.errors or [],
                }
                for page in result.pages
            ]
        }
        return json.dumps(output, indent=2)

    except Exception as e:
        # UI boundary: report any failure as JSON instead of raising.
        return json.dumps({"error": f"Extraction failed: {e}"}, indent=2)
72
+
73
+
74
# Default template shown in the editor when the page loads
default_template = json.dumps(
    {"bill_no": "string", "total": "float", "date": "string"}, indent=2
)

# Create Gradio interface
with gr.Blocks(title="Docling Structured Extraction") as demo:
    gr.Markdown(
        """
        # 📄 Docling Structured Extraction Demo

        Extract structured data from documents (PDF/Images) using AI-powered extraction.

        **Note:** This feature is currently in beta.

        ### How to use:
        1. Upload a file OR provide a URL to a document
        2. Define your extraction template in JSON format
        3. Click "Extract" to get structured data

        🚀 **Powered by Hugging Face Spaces Zero GPU**
        """
    )

    with gr.Row():
        # Left column: document source, template editor and trigger button
        with gr.Column():
            gr.Markdown("### Input Source")
            file_input = gr.File(
                label="Upload File (PDF or Image)",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
            )

            gr.Markdown("### Extraction Template")
            gr.Markdown(
                """
                Define the structure of data you want to extract. Use JSON format with field names and types:
                - `"string"` for text fields
                - `"float"` for numbers with decimals
                - `"int"` for whole numbers
                """
            )
            template_input = gr.Code(
                label="JSON Template", value=default_template, language="json", lines=15
            )

            extract_btn = gr.Button("Extract", variant="primary", size="lg")

        # Right column: JSON result returned by process_extraction
        with gr.Column():
            gr.Markdown("### Extracted Data")
            output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

    # Examples section: each row is [file, url, template], matching `inputs`
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps({"bill_no": "string", "total": "float"}, indent=2),
            ],
            [
                None,
                "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
                json.dumps(
                    {
                        "bill_no": "string",
                        "total": "float",
                        "sender_name": "string",
                        "receiver_name": "string",
                        "postal_code": "string",
                    },
                    indent=2,
                ),
            ],
        ],
        inputs=[file_input, url_input, template_input],
        label="Try these examples",
    )

    # Connect the extraction function
    extract_btn.click(
        fn=process_extraction,
        inputs=[file_input, url_input, template_input],
        outputs=output_json,
    )
164
+
165
# Launch the app exactly once, and only when this file is run as a script
# (the original repeated this whole guard twice).
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=4.0.0
2
+ docling[vlm]>=2.0.0
3
+ spaces>=0.19.0