Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,762 Bytes
e2d728a 2803b23 8e024f6 e2d728a 39ea117 8e024f6 e2d728a 2803b23 39ea117 2803b23 39ea117 2803b23 e2d728a 4678d36 39ea117 e2d728a 39ea117 e2d728a 39ea117 e2d728a 39ea117 e2d728a 39ea117 a07d796 39ea117 e2d728a 39ea117 a07d796 39ea117 a07d796 e2d728a 39ea117 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import json

import gradio as gr
import spaces
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.document_converter import DocumentConverter
from docling.document_converter import PdfFormatOption
# GPU decorator not really required for Docling OCR, but kept if you want
@spaces.GPU
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling (OCR enabled) for display in the UI.

    Args:
        file: Value delivered by the upload widget: an object exposing the
            file path as ``.name``, a plain path string, or ``None`` when
            nothing has been uploaded.
        output_format: ``"Markdown"`` or ``"JSON"``. Any other value yields a
            warning string in place of the document text.

    Returns:
        A ``(converted_text, metadata)`` tuple. ``converted_text`` is the
        exported document (or a warning message) as a string; ``metadata`` is
        a dict listing the attribute names available on the converted
        document.
    """
    # Nothing uploaded: report it instead of failing on ``file.name``.
    if file is None:
        return "⚠️ Please upload a PDF first", {}

    # Accept both file-like objects carrying ``.name`` and bare path strings.
    source_path = getattr(file, "name", file)

    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),
    )
    # Pipeline options are registered per input format; the converter does
    # not take a top-level ``pipeline_options`` argument.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        },
    )
    document = converter.convert(source_path).document

    # Choose output format safely
    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # The Textbox needs a string, so serialise the exported dict.
        # ``ensure_ascii=False`` keeps non-ASCII text readable.
        converted_text = json.dumps(
            document.export_to_dict(),
            indent=2,
            ensure_ascii=False,
        )
    else:
        converted_text = "⚠️ Unsupported format"

    # ``dir()`` returns a list of strings, so the metadata is JSON-friendly.
    metadata = {"Available Attributes": dir(document)}
    return converted_text, metadata
# Gradio UI: upload + format selection in one row, results below, and a
# button that runs ``convert_document`` on the current inputs.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")
    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Preselect a format so clicking "Convert" without touching the radio
        # does not pass ``None`` to the handler.
        format_input = gr.Radio(
            ["Markdown", "JSON"],
            label="Choose Output Format",
            value="Markdown",
        )
    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")
    convert_button = gr.Button("Convert")
    # The handler returns (text, metadata), matching the two outputs in order.
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, format_input],
        outputs=[output_text, output_metadata],
    )

app.launch(debug=True)
|