import json

import gradio as gr
import spaces
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# The GPU decorator is not strictly required for Docling OCR; it is kept as an
# optional extra.
@spaces.GPU
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling (OCR enabled) and export it as text.

    Args:
        file: The upload handed over by ``gr.File``: either a path string or
            an object exposing the path as ``.name``. ``None`` when nothing
            has been uploaded.
        output_format: ``"Markdown"`` or ``"JSON"``; any other value yields a
            warning string instead of converted text.

    Returns:
        A ``(converted_text, metadata)`` tuple: the exported document as a
        string for the Textbox, and a JSON-friendly dict listing the attribute
        names available on the converted document (empty when no file was
        supplied).
    """
    if file is None:
        # Pressing "Convert" before uploading would otherwise crash on
        # ``file.name``; surface the problem in the UI instead.
        return "⚠️ Please upload a PDF first", {}

    # Accept both a plain path string and a file-like wrapper with ``.name``.
    source_path = getattr(file, "name", file)

    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),
    )

    # Docling 2.x takes pipeline options per input format via ``format_options``
    # rather than through a top-level ``pipeline_options`` keyword.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        }
    )
    result = converter.convert(source_path)
    document = result.document

    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # Export to a plain dict first, then serialise once so the Textbox
        # receives readable, indented JSON.
        converted_text = json.dumps(document.export_to_dict(), indent=2)
    else:
        converted_text = "⚠️ Unsupported format"

    # ``dir()`` returns a list of strings, so this is always JSON-friendly.
    metadata = {"Available Attributes": dir(document)}

    return converted_text, metadata


# Build the UI: upload and format choice share one row, the results sit below,
# and the button feeds both inputs into convert_document.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")

    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Preselect a format so pressing "Convert" never submits None as the
        # output format.
        format_input = gr.Radio(
            ["Markdown", "JSON"],
            value="Markdown",
            label="Choose Output Format",
        )

    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")

    convert_button = gr.Button("Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, format_input],
        outputs=[output_text, output_metadata],
    )

app.launch(debug=True)