Spaces:

yasserrmd
/

DoclingConverter

Running on Zero

App Files Files Community

DoclingConverter / app.py

yasserrmd

Update app.py

39ea117 verified 4 months ago

raw

history blame

1.76 kB

	import gradio as gr
	from docling.document_converter import DocumentConverter
	from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
	import spaces

	# The GPU decorator is not strictly required for Docling OCR, but it is kept in case GPU execution is wanted.
	@spaces.GPU
	def convert_document(file, output_format):
	    """Convert an uploaded PDF with Docling (Tesseract OCR enabled).

	    Args:
	        file: The value delivered by the Gradio ``File`` input: an object
	            exposing the path as ``.name``, a plain path string, or ``None``
	            when nothing was uploaded.
	        output_format: ``"Markdown"`` or ``"JSON"``.

	    Returns:
	        A ``(converted_text, metadata)`` tuple. ``converted_text`` is the
	        exported document, or a warning message when the request cannot be
	        served. ``metadata`` is a JSON-friendly dict; it is empty when no
	        conversion was run.
	    """
	    # Reject unusable requests before starting the slow OCR pipeline.
	    if file is None:
	        return "⚠️ No file uploaded", {}
	    if output_format not in ("Markdown", "JSON"):
	        return "⚠️ Unsupported format", {}

	    import json

	    from docling.datamodel.base_models import InputFormat
	    from docling.document_converter import PdfFormatOption

	    pdf_opts = PdfPipelineOptions(
	        do_ocr=True,
	        ocr_options=TesseractCliOcrOptions(lang=["eng"]),
	    )

	    # Pipeline options are registered per input format through
	    # ``format_options``; they are not passed to DocumentConverter directly.
	    converter = DocumentConverter(
	        format_options={
	            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
	        }
	    )

	    # Accept both a file wrapper with ``.name`` and a bare path string.
	    source_path = getattr(file, "name", file)
	    document = converter.convert(source_path).document

	    if output_format == "Markdown":
	        converted_text = document.export_to_markdown()
	    else:
	        # export_to_dict() yields plain Python data, so json.dumps produces
	        # real JSON rather than a string encoded a second time.
	        converted_text = json.dumps(document.export_to_dict(), indent=2)

	    # Metadata always JSON-friendly: a list of attribute names.
	    metadata = {"Available Attributes": dir(document)}

	    return converted_text, metadata


	# Build the Gradio interface: heading, one row of inputs, then the results
	# and the button that triggers the conversion.
	with gr.Blocks() as app:
	    gr.Markdown("# 📄 Document Converter with Docling OCR")
	    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")

	    # The two inputs share a single row.
	    with gr.Row():
	        file_input = gr.File(file_types=[".pdf"], label="Upload PDF")
	        format_input = gr.Radio(choices=["Markdown", "JSON"], label="Choose Output Format")

	    # Result widgets: exported text plus a JSON view for the metadata dict.
	    output_text = gr.Textbox(lines=20, label="Converted Document")
	    output_metadata = gr.JSON(label="Metadata")

	    convert_button = gr.Button("Convert")
	    # Wire the button to convert_document: (file, format) -> (text, metadata).
	    convert_button.click(
	        convert_document,
	        [file_input, format_input],
	        [output_text, output_metadata],
	    )

	# Start the server once the layout has been fully declared.
	app.launch(debug=True)