Spaces:

flozi00
/

structured-docling

Running on Zero

App Files Files Community

structured-docling / app_hf_spaces.py

flozi00

ttf

1ce038d about 1 month ago

raw

history blame contribute delete

5.32 kB

	import json

	import gradio as gr
	import spaces # Hugging Face Spaces Zero GPU support
	from docling.datamodel.base_models import InputFormat
	from docling.document_extractor import DocumentExtractor


	# Initialize the extractor (will be moved to GPU when decorated function is called)
	def get_extractor():
	    """Build a ``DocumentExtractor`` that accepts image and PDF inputs only.

	    A new extractor is constructed on every call; ``process_extraction``
	    invokes this from inside its ``spaces.GPU``-decorated body.
	    """
	    supported_formats = [InputFormat.IMAGE, InputFormat.PDF]
	    return DocumentExtractor(allowed_formats=supported_formats)


	@spaces.GPU(duration=60)  # Allocate GPU for up to 60 seconds
	def process_extraction(file_input, url_input, template_json):
	    """Extract structured data from a document according to a JSON template.

	    The source is the uploaded file when one is given, otherwise the URL.
	    Inputs are validated and the template is parsed *before* the extractor is
	    constructed, so malformed requests return immediately instead of paying
	    for extractor initialisation.

	    Args:
	        file_input: Uploaded file (PDF or image), or ``None``. May be an
	            object exposing its path as ``.name`` or a plain path string.
	        url_input: URL to a document; used only when ``file_input`` is
	            ``None``. Surrounding whitespace is stripped.
	        template_json: JSON string defining the extraction template.

	    Returns:
	        A JSON string (2-space indent). On success it has the shape
	        ``{"pages": [{"page_no", "extracted_data", "raw_text", "errors"}]}``;
	        on any failure it is ``{"error": "<message>"}``. Exceptions are
	        reported through that error payload rather than raised.
	    """
	    try:
	        # Determine the source (an uploaded file takes precedence over a URL).
	        if file_input is not None:
	            # Fall back to the value itself when it has no ``.name`` (i.e. it
	            # is already a path string).
	            source = getattr(file_input, "name", file_input)
	        elif url_input and url_input.strip():
	            source = url_input.strip()
	        else:
	            return json.dumps(
	                {"error": "Please provide either a file or a URL"}, indent=2
	            )

	        # Parse the template. TypeError covers a missing (None) template;
	        # ValueError covers json.JSONDecodeError for malformed text.
	        try:
	            template = json.loads(template_json)
	        except (TypeError, ValueError) as e:
	            return json.dumps({"error": f"Invalid JSON template: {e}"}, indent=2)

	        # Only now build the extractor, inside the GPU context.
	        extractor = get_extractor()
	        result = extractor.extract(source=source, template=template)

	        # Flatten each page result into plain JSON-friendly fields.
	        output = {
	            "pages": [
	                {
	                    "page_no": page.page_no,
	                    "extracted_data": page.extracted_data,
	                    "raw_text": page.raw_text,
	                    "errors": page.errors or [],
	                }
	                for page in result.pages
	            ]
	        }
	        return json.dumps(output, indent=2)

	    except Exception as e:
	        # UI boundary: surface every failure as JSON instead of crashing the app.
	        return json.dumps({"error": f"Extraction failed: {e}"}, indent=2)


	# Starter template shown in the editor: field name -> expected type name.
	default_template = json.dumps(
	    dict(bill_no="string", total="float", date="string"),
	    indent=2,
	)

	# Create Gradio interface
	# `demo` is the Blocks app that the __main__ guard at the bottom launches.
	# NOTE(review): indentation was lost in this copy of the file. The nesting is
	# presumably Blocks > Row > (input Column, output Column), with the Examples
	# section and the click wiring directly under Blocks — confirm against the
	# upstream file before restoring it.
	with gr.Blocks(title="Docling Structured Extraction") as demo:
	# Page header and usage instructions.
	gr.Markdown(
	"""
	# 📄 Docling Structured Extraction Demo

	Extract structured data from documents (PDF/Images) using AI-powered extraction.

	Note: This feature is currently in beta.

	### How to use:
	1. Upload a file OR provide a URL to a document
	2. Define your extraction template in JSON format
	3. Click "Extract" to get structured data

	🚀 Powered by Hugging Face Spaces Zero GPU
	"""
	)

	with gr.Row():
	# Input side: document source, extraction template and the trigger button.
	with gr.Column():
	gr.Markdown("### Input Source")
	# Upload restricted by extension to PDF and common image formats.
	file_input = gr.File(
	label="Upload File (PDF or Image)",
	file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
	)
	# Alternative source; process_extraction only uses it when no file is uploaded.
	url_input = gr.Textbox(
	label="Or Enter Document URL",
	placeholder="https://example.com/document.pdf",
	lines=1,
	)

	gr.Markdown("### Extraction Template")
	gr.Markdown(
	"""
	Define the structure of data you want to extract. Use JSON format with field names and types:
	- `"string"` for text fields
	- `"float"` for numbers with decimals
	- `"int"` for whole numbers
	"""
	)
	# JSON editor pre-filled with the module-level default_template.
	template_input = gr.Code(
	label="JSON Template", value=default_template, language="json", lines=15
	)

	extract_btn = gr.Button("Extract", variant="primary", size="lg")

	# Output side: displays the JSON string returned by process_extraction.
	with gr.Column():
	gr.Markdown("### Extracted Data")
	output_json = gr.Code(label="Result (JSON)", language="json", lines=25)

	# Examples section
	# Each example row is [file, url, template], matching the order of `inputs`.
	# Both rows leave the file empty and point at the same sample image URL;
	# they differ only in how many fields the template requests.
	# No `fn`/`outputs` are passed to gr.Examples here.
	gr.Markdown("### Examples")
	gr.Examples(
	examples=[
	[
	None,
	"https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
	json.dumps({"bill_no": "string", "total": "float"}, indent=2),
	],
	[
	None,
	"https://upload.wikimedia.org/wikipedia/commons/9/9f/Swiss_QR-Bill_example.jpg",
	json.dumps(
	{
	"bill_no": "string",
	"total": "float",
	"sender_name": "string",
	"receiver_name": "string",
	"postal_code": "string",
	},
	indent=2,
	),
	],
	],
	inputs=[file_input, url_input, template_input],
	label="Try these examples",
	)

	# Connect the extraction function
	# Clicking "Extract" calls process_extraction(file, url, template) and writes
	# its returned JSON string into the result viewer.
	extract_btn.click(
	fn=process_extraction,
	inputs=[file_input, url_input, template_input],
	outputs=output_json,
	)

	# Start the Gradio server only when this file is run as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()