flozi00 committed on
Commit
eaafbab
·
1 Parent(s): e8d8985

Refactor app structure: update main app file reference and consolidate extraction logic into app.py

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +59 -6
  3. app_hf_spaces.py +0 -166
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app_hf_spaces.py
9
  pinned: false
10
  license: gpl-3.0
11
  ---
 
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
+ app_file: app.py
9
  pinned: false
10
  license: gpl-3.0
11
  ---
app.py CHANGED
@@ -1,16 +1,47 @@
1
  import json
2
 
3
  import gradio as gr
 
4
  from docling.datamodel.base_models import InputFormat
 
 
 
 
 
5
  from docling.document_extractor import DocumentExtractor
6
 
7
- # Initialize the extractor
8
- extractor = DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
 
11
  def process_extraction(file_input, url_input, template_json):
12
  """
13
  Process document extraction with the provided template.
 
14
 
15
  Args:
16
  file_input: Uploaded file (PDF or image)
@@ -32,6 +63,18 @@ def process_extraction(file_input, url_input, template_json):
32
  {"error": "Please provide either a file or a URL"}, indent=2
33
  )
34
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  # Parse the template JSON
36
  try:
37
  template = json.loads(template_json)
@@ -79,8 +122,10 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
79
 
80
  ### How to use:
81
  1. Upload a file OR provide a URL to a document
82
- 2. Define your extraction template in JSON format
83
- 3. Click "Extract" to get structured data
 
 
84
  """
85
  )
86
 
@@ -88,8 +133,7 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
88
  with gr.Column():
89
  gr.Markdown("### Input Source")
90
  file_input = gr.File(
91
- label="Upload File (PDF or Image)",
92
- file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
93
  )
94
  url_input = gr.Textbox(
95
  label="Or Enter Document URL",
@@ -98,6 +142,14 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
98
  )
99
 
100
  gr.Markdown("### Extraction Template")
 
 
 
 
 
 
 
 
101
  template_input = gr.Code(
102
  label="JSON Template", value=default_template, language="json", lines=15
103
  )
@@ -126,6 +178,7 @@ with gr.Blocks(title="Docling Structured Extraction") as demo:
126
  "total": "float",
127
  "sender_name": "string",
128
  "receiver_name": "string",
 
129
  },
130
  indent=2,
131
  ),
 
1
  import json
2
 
3
  import gradio as gr
4
+ import spaces # Hugging Face Spaces Zero GPU support
5
  from docling.datamodel.base_models import InputFormat
6
+ from docling.datamodel.pipeline_options import (
7
+ PdfPipelineOptions,
8
+ granite_picture_description,
9
+ )
10
+ from docling.document_converter import DocumentConverter, PdfFormatOption
11
  from docling.document_extractor import DocumentExtractor
12
 
13
+
14
+ # Initialize the extractor (will be moved to GPU when decorated function is called)
15
+ def get_extractor():
16
+ """Initialize extractor - called within GPU context"""
17
+ return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
18
+
19
+
20
+ def get_converter_with_vision():
21
+ """Initialize converter with vision - called within GPU context"""
22
+ pipeline_options = PdfPipelineOptions()
23
+ pipeline_options.do_picture_description = True
24
+ pipeline_options.picture_description_options = granite_picture_description
25
+ pipeline_options.picture_description_options.prompt = (
26
+ "Describe the image in as much detail as possible."
27
+ )
28
+ pipeline_options.images_scale = 2.0
29
+ pipeline_options.generate_picture_images = True
30
+
31
+ return DocumentConverter(
32
+ format_options={
33
+ InputFormat.PDF: PdfFormatOption(
34
+ pipeline_options=pipeline_options,
35
+ )
36
+ }
37
+ )
38
 
39
 
40
+ @spaces.GPU(duration=60) # Allocate GPU for up to 60 seconds
41
  def process_extraction(file_input, url_input, template_json):
42
  """
43
  Process document extraction with the provided template.
44
+ Uses Hugging Face Spaces Zero GPU feature.
45
 
46
  Args:
47
  file_input: Uploaded file (PDF or image)
 
63
  {"error": "Please provide either a file or a URL"}, indent=2
64
  )
65
 
66
+ # If no template is provided, use the converter with vision
67
+ if not template_json or not template_json.strip():
68
+ converter = get_converter_with_vision()
69
+ try:
70
+ result = converter.convert(source)
71
+ return json.dumps(result.document.export_to_dict(), indent=2)
72
+ except Exception as e:
73
+ return json.dumps({"error": f"Conversion failed: {str(e)}"}, indent=2)
74
+
75
+ # Initialize extractor in GPU context
76
+ extractor = get_extractor()
77
+
78
  # Parse the template JSON
79
  try:
80
  template = json.loads(template_json)
 
122
 
123
  ### How to use:
124
  1. Upload a file OR provide a URL to a document
125
+ 2. Define your extraction template in JSON format (or leave empty for full document conversion with picture descriptions)
126
+ 3. Click "Extract" to get structured data or full document JSON
127
+
128
+ 🚀 **Powered by Hugging Face Spaces Zero GPU**
129
  """
130
  )
131
 
 
133
  with gr.Column():
134
  gr.Markdown("### Input Source")
135
  file_input = gr.File(
136
+ label="Upload File (PDF or Image)"
 
137
  )
138
  url_input = gr.Textbox(
139
  label="Or Enter Document URL",
 
142
  )
143
 
144
  gr.Markdown("### Extraction Template")
145
+ gr.Markdown(
146
+ """
147
+ Define the structure of data you want to extract. Use JSON format with field names and types:
148
+ - `"string"` for text fields
149
+ - `"float"` for numbers with decimals
150
+ - `"int"` for whole numbers
151
+ """
152
+ )
153
  template_input = gr.Code(
154
  label="JSON Template", value=default_template, language="json", lines=15
155
  )
 
178
  "total": "float",
179
  "sender_name": "string",
180
  "receiver_name": "string",
181
+ "postal_code": "string",
182
  },
183
  indent=2,
184
  ),
app_hf_spaces.py DELETED
@@ -1,166 +0,0 @@
1
- import json
2
-
3
- import gradio as gr
4
- import spaces # Hugging Face Spaces Zero GPU support
5
- from docling.datamodel.base_models import InputFormat
6
- from docling.document_extractor import DocumentExtractor
7
-
8
-
9
- # Initialize the extractor (will be moved to GPU when decorated function is called)
10
- def get_extractor():
11
- """Initialize extractor - called within GPU context"""
12
- return DocumentExtractor(allowed_formats=[InputFormat.IMAGE, InputFormat.PDF])
13
-
14
-
15
- @spaces.GPU(duration=60) # Allocate GPU for up to 60 seconds
16
- def process_extraction(file_input, url_input, template_json):
17
- """
18
- Process document extraction with the provided template.
19
- Uses Hugging Face Spaces Zero GPU feature.
20
-
21
- Args:
22
- file_input: Uploaded file (PDF or image)
23
- url_input: URL to a document
24
- template_json: JSON string defining the extraction template
25
-
26
- Returns:
27
- JSON string with extracted data
28
- """
29
- try:
30
- # Initialize extractor in GPU context
31
- extractor = get_extractor()
32
-
33
- # Determine the source
34
- source = None
35
- if file_input is not None:
36
- source = file_input.name
37
- elif url_input and url_input.strip():
38
- source = url_input.strip()
39
- else:
40
- return json.dumps(
41
- {"error": "Please provide either a file or a URL"}, indent=2
42
- )
43
-
44
- # Parse the template JSON
45
- try:
46
- template = json.loads(template_json)
47
- except json.JSONDecodeError as e:
48
- return json.dumps({"error": f"Invalid JSON template: {str(e)}"}, indent=2)
49
-
50
- # Perform extraction
51
- result = extractor.extract(
52
- source=source,
53
- template=template,
54
- )
55
-
56
- # Format the output
57
- output = {"pages": []}
58
-
59
- for page in result.pages:
60
- page_data = {
61
- "page_no": page.page_no,
62
- "extracted_data": page.extracted_data,
63
- "raw_text": page.raw_text,
64
- "errors": page.errors if page.errors else [],
65
- }
66
- output["pages"].append(page_data)
67
-
68
- return json.dumps(output, indent=2)
69
-
70
- except Exception as e:
71
- return json.dumps({"error": f"Extraction failed: {str(e)}"}, indent=2)
72
-
73
-
74
- # Default template example
75
- default_template = json.dumps(
76
- {"bill_no": "string", "total": "float", "date": "string"}, indent=2
77
- )
78
-
79
- # Create Gradio interface
80
- with gr.Blocks(title="Docling Structured Extraction") as demo:
81
- gr.Markdown(
82
- """
83
- # 📄 Docling Structured Extraction Demo
84
-
85
- Extract structured data from documents (PDF/Images) using AI-powered extraction.
86
-
87
- **Note:** This feature is currently in beta.
88
-
89
- ### How to use:
90
- 1. Upload a file OR provide a URL to a document
91
- 2. Define your extraction template in JSON format
92
- 3. Click "Extract" to get structured data
93
-
94
- 🚀 **Powered by Hugging Face Spaces Zero GPU**
95
- """
96
- )
97
-
98
- with gr.Row():
99
- with gr.Column():
100
- gr.Markdown("### Input Source")
101
- file_input = gr.File(
102
- label="Upload File (PDF or Image)"
103
- )
104
- url_input = gr.Textbox(
105
- label="Or Enter Document URL",
106
- placeholder="https://example.com/document.pdf",
107
- lines=1,
108
- )
109
-
110
- gr.Markdown("### Extraction Template")
111
- gr.Markdown(
112
- """
113
- Define the structure of data you want to extract. Use JSON format with field names and types:
114
- - `"string"` for text fields
115
- - `"float"` for numbers with decimals
116
- - `"int"` for whole numbers
117
- """
118
- )
119
- template_input = gr.Code(
120
- label="JSON Template", value=default_template, language="json", lines=15
121
- )
122
-
123
- extract_btn = gr.Button("Extract", variant="primary", size="lg")
124
-
125
- with gr.Column():
126
- gr.Markdown("### Extracted Data")
127
- output_json = gr.Code(label="Result (JSON)", language="json", lines=25)
128
-
129
- # Examples section
130
- gr.Markdown("### Examples")
131
- gr.Examples(
132
- examples=[
133
- [
134
- None,
135
- "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
136
- json.dumps({"bill_no": "string", "total": "float"}, indent=2),
137
- ],
138
- [
139
- None,
140
- "https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
141
- json.dumps(
142
- {
143
- "bill_no": "string",
144
- "total": "float",
145
- "sender_name": "string",
146
- "receiver_name": "string",
147
- "postal_code": "string",
148
- },
149
- indent=2,
150
- ),
151
- ],
152
- ],
153
- inputs=[file_input, url_input, template_input],
154
- label="Try these examples",
155
- )
156
-
157
- # Connect the extraction function
158
- extract_btn.click(
159
- fn=process_extraction,
160
- inputs=[file_input, url_input, template_input],
161
- outputs=output_json,
162
- )
163
-
164
- # Launch the app
165
- if __name__ == "__main__":
166
- demo.launch()