# yasserrmd's picture
# Update app.py
# 39ea117 verified
# raw
# history blame
# 1.76 kB
import gradio as gr
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
import spaces
# GPU decorator not really required for Docling OCR, but kept if you want
@spaces.GPU
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling (Tesseract CLI OCR) for display.

    Args:
        file: Value delivered by the ``gr.File`` input. Either an object that
            exposes the on-disk path as ``.name`` or a plain path string;
            ``None`` when the user has not uploaded anything.
        output_format: ``"Markdown"`` or ``"JSON"``. Any other value
            (including ``None``) produces a warning string instead of text.

    Returns:
        A ``(converted_text, metadata)`` tuple: the text for the output
        Textbox and a dict for the JSON component listing the attribute
        names available on the converted document.
    """
    # Function-scope imports keep this block self-contained; docling is
    # already a dependency of this file.
    import json

    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import PdfFormatOption

    # Clicking "Convert" with no upload passes None; fail softly instead of
    # raising AttributeError on ``file.name``.
    if file is None:
        return "⚠️ Please upload a PDF file first", {}

    # Accept both file-like wrappers (path in ``.name``) and bare path strings.
    pdf_path = getattr(file, "name", file)

    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),
    )
    # Pipeline options are supplied per input format; DocumentConverter does
    # not take a top-level ``pipeline_options`` keyword.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        }
    )
    result = converter.convert(pdf_path)
    document = result.document

    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # export_to_dict() yields plain JSON-compatible data; dump it to a
        # string because the Textbox displays text, not objects.
        converted_text = json.dumps(
            document.export_to_dict(), indent=2, ensure_ascii=False
        )
    else:
        converted_text = "⚠️ Unsupported format"

    # dir() returns a list of strings, so this is always JSON-friendly.
    metadata = {"Available Attributes": dir(document)}
    return converted_text, metadata
# Build the UI: upload + format selection on one row, results below, and a
# button that wires the inputs to convert_document.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")
    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Preselect a format so clicking "Convert" without touching the radio
        # does not run a full conversion only to report an unsupported format.
        format_input = gr.Radio(
            ["Markdown", "JSON"],
            value="Markdown",
            label="Choose Output Format",
        )
    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")
    convert_button = gr.Button("Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, format_input],
        outputs=[output_text, output_metadata],
    )
app.launch(debug=True)