Spaces:

Didier
/

Docling_VLM_OCR

Running

File size: 9,929 Bytes

60f1781

"""
File: docling_app.py

This module provides a document processing interface using Docling and VLM OCR.

:author: Didier Guillevic
:email: didier.guillevic@gmail.com
:date: 2026-02-27
:license: Apache License 2.0
"""
import logging
import gradio as gr
import json
from pathlib import Path
from typing import Optional, Any
import os

# API key passed to the VLM OCR options. It is read once while this module is
# being imported; if MISTRAL_API_KEY is not set, the lookup raises KeyError.
mistral_api_key = os.environ["MISTRAL_API_KEY"]

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream

# Import our local custom provider
from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel
from PIL import Image

# Setup logging: configure the root logger at INFO level, then create the
# module-level logger used by the functions in this file.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

def generate_preview(file_path: str):
    """Build the list of images used to preview an uploaded file.

    Args:
        file_path: Path of the uploaded file; may be empty or ``None``.

    Returns:
        A one-element list with the opened PIL image for ``.png``, ``.jpg``,
        ``.jpeg``, ``.bmp`` and ``.tiff`` files; a list with one entry per page
        for ``.pdf`` files; ``None`` when no path is given, when the extension
        is not handled, or when rendering the PDF raises.
    """
    if not file_path:
        return None

    source = Path(file_path)
    extension = source.suffix.lower()

    # Raster images are previewed as-is.
    if extension in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
        return [Image.open(source)]

    if extension != ".pdf":
        return None

    # The Docling names are imported only when a PDF actually has to be rendered.
    from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
    from docling.datamodel.base_models import DocumentStream

    try:
        with open(source, "rb") as handle:
            doc_stream = DocumentStream(name=source.name, stream=handle)
            # Path("") is a placeholder first argument; presumably the backend
            # reads the PDF bytes from the stream instead.
            # NOTE(review): confirm this constructor call and get_page_image(index)
            # against the installed Docling version.
            pdf_backend = PyPdfiumDocumentBackend(Path(""), doc_stream)
            return [
                pdf_backend.get_page_image(index)
                for index in range(pdf_backend.page_count())
            ]
    except Exception as e:
        # Any rendering failure is logged and reported as "no preview".
        _log.error(f"Error generating preview: {e}")
        return None

def _idle_controls():
    """Return the updates that re-enable the submit button and hide the stop button."""
    return (
        # Same caption the button is created with, so it does not change after a run.
        gr.update(value="3. Process Document", variant="primary", interactive=True),
        gr.update(visible=False),
    )


def _strip_code_fence(text: str) -> str:
    """Return *text* stripped of whitespace and of one enclosing Markdown code fence."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # The opening fence may carry a language tag (e.g. ```json): drop the whole line.
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def _parse_json_payload(text: str) -> Any:
    """Parse the JSON value embedded in the model output *text*.

    The candidate string is, in order of preference: the content of the first
    fenced code block, the span from the first ``{`` to the last ``}``, or the
    whole text. Markdown-escaped underscores are unescaped and ``//`` comments
    are removed before parsing.

    Raises:
        ValueError: (``json.JSONDecodeError``) if the candidate is not valid JSON.
    """
    import re

    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
    if fenced:
        candidate = fenced.group(1).strip()
    else:
        braced = re.search(r"(\{[\s\S]*\})", text)
        candidate = braced.group(1).strip() if braced else text.strip()

    # Markdown export escapes underscores ("employee\_name"); "\_" is not valid JSON.
    candidate = candidate.replace("\\_", "_")
    # Drop single-line comments. The look-behind keeps "http://" style URLs intact;
    # note that a "//" inside a string value that is not preceded by ":" is removed too.
    candidate = re.sub(r"(?<!:)\/\/.*", "", candidate)
    return json.loads(candidate)


def process_document(file_path: str, extract_json: bool):
    """Run the VLM OCR pipeline on a file and yield the values for the UI outputs.

    This is a generator that yields exactly once. Every yielded tuple has five
    items, matching the outputs the submit event is wired to:
    ``(markdown_text, json_value, submit_button_update, stop_button_update,
    download_path)``.

    Args:
        file_path: Path of the uploaded PDF or image; empty/``None`` when nothing
            was uploaded.
        extract_json: When true, the model is prompted for a structured JSON
            object and the result is saved with a ``.json`` extension; otherwise
            a Markdown transcription is requested and saved as ``.md``.

    Yields:
        On success: the cleaned text, the parsed JSON (or an ``{"error", "raw"}``
        dict if it could not be parsed, or ``None`` in Markdown mode), the button
        updates, and the path of the file written next to the input.
        On failure or missing input: an error message, an ``{"error": ...}`` dict
        in JSON mode (``None`` otherwise), the button updates, and ``None``.
    """
    if not file_path:
        message = "No file uploaded."
        submit_update, stop_update = _idle_controls()
        # One value per output component; in JSON mode the Markdown pane is
        # hidden, so the message is mirrored into the JSON pane as well.
        yield (
            message,
            {"error": message} if extract_json else None,
            submit_update,
            stop_update,
            None,  # no file to offer for download
        )
        return

    _log.info(f"Processing file: {file_path}, Extract JSON: {extract_json}")
    reset_cancel()

    # Configure pipeline options
    prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."
    if extract_json:
        prompt = (
            "Extract the information from this document into a structured JSON format. "
            "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', "
            "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). "
            "Return ONLY the JSON object."
        )

    ocr_options = VlmOcrOptions(
        model="mistral-medium-latest",
        openai_base_url="https://api.mistral.ai/v1",
        openai_api_key=mistral_api_key,
        prompt=prompt,
        timeout=300.0
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_ocr = True

    # PDFs and images both go through the same custom pipeline.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options
            ),
        }
    )

    try:
        # Process the document
        result = converter.convert(file_path)
        output_text = result.document.export_to_markdown()
        cleaned_text = _strip_code_fence(output_text)

        # By default the downloadable file holds the cleaned model output.
        json_output = None
        file_text = cleaned_text
        if extract_json:
            try:
                json_output = _parse_json_payload(output_text)
            except ValueError as je:
                _log.warning(f"Could not parse result as JSON: {je}")
                # Fallback to a dictionary showing the failure
                json_output = {"error": "Invalid JSON format", "raw": output_text}
            else:
                # Save the parsed value so the .json download is valid JSON
                # (the raw text may still contain "\_" escapes or comments).
                file_text = json.dumps(json_output, indent=2, ensure_ascii=False)

        # The result is written next to the input, with the extension swapped.
        input_path = Path(file_path)
        ext = ".json" if extract_json else ".md"
        output_path = input_path.parent / (input_path.stem + ext)
        # Explicit UTF-8: OCR output routinely contains non-ASCII characters and
        # the platform default encoding may not be able to represent them.
        output_path.write_text(file_text, encoding="utf-8")

        _log.info(f"Result saved to {output_path}")

        submit_update, stop_update = _idle_controls()
        yield (
            cleaned_text,
            json_output,
            submit_update,
            stop_update,
            str(output_path)
        )
    except Exception as e:
        # Top-level boundary for the UI: keep the traceback in the log and
        # surface the message in whichever result pane is visible.
        _log.exception(f"Error processing document: {e}")
        submit_update, stop_update = _idle_controls()
        yield (
            f"Error: {str(e)}",
            {"error": str(e)} if extract_json else None,
            submit_update,
            stop_update,
            None,
        )

def start_processing():
    """Return the UI updates applied just before a processing run begins.

    Returns:
        A 3-tuple: an update that relabels and disables the submit button, an
        update that shows the stop button, and ``None`` to clear the previous
        download file.
    """
    busy_submit = gr.update(value="Processing...", variant="secondary", interactive=False)
    show_stop = gr.update(visible=True)
    cleared_download = None  # Clear previous download file
    return busy_submit, show_stop, cleared_download

def handle_stop():
    """Request cancellation of the running OCR job and restore the idle buttons.

    Returns:
        A 2-tuple: an update that re-enables the submit button and an update
        that hides the stop button.
    """
    request_cancel()
    return (
        # Restore the caption the button is created with ("3. Process Document")
        # so the numbered step label does not change after a stop.
        gr.update(value="3. Process Document", variant="primary", interactive=True),
        gr.update(visible=False),
    )

def clear_interface():
    """Return the blank values that reset the interface.

    Returns:
        A 5-tuple, in order: input file, preview gallery, output file,
        Markdown result and JSON result.
    """
    blank_input_file = None
    blank_preview_gallery = []
    blank_output_file = None
    blank_output_markdown = ""
    blank_output_json = None
    return (
        blank_input_file,
        blank_preview_gallery,
        blank_output_file,
        blank_output_markdown,
        blank_output_json,
    )

# Create Gradio interface
# Components are declared in display order; the event wiring follows the layout.
with gr.Blocks(title="Docling VLM OCR", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 📄 Docling VLM OCR")
    gr.Markdown("Upload an image or a PDF file to extract text or structured data.")
    
    # Row 1: upload widget next to the page preview (scale 1:2).
    with gr.Row():
        input_file = gr.File(
            label="1. Upload File",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            scale=1,
        )
        # Specifying height and preview=True for better interaction
        preview_gallery = gr.Gallery(
            label="Input Preview", 
            columns=1, 
            height=250, 
            object_fit="contain",
            preview=True,
            allow_preview=True,
            scale=2,
        )
    
    # Selects between Markdown transcription (unchecked) and structured JSON extraction.
    extract_json_chk = gr.Checkbox(label="2. Extract as Structured JSON", value=False)
    
    # Row 2: action buttons. The stop button starts hidden.
    with gr.Row():
        submit_btn = gr.Button("3. Process Document", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop", visible=False)
        clear_btn = gr.Button("Clear", variant="secondary")
    
    # Read-only file component that offers the saved result for download.
    output_file = gr.File(label="4. Download Result", interactive=False)
    
    # Two result panes; initial visibility is derived from the checkbox's default
    # value, so exactly one of them is visible at start.
    with gr.Column():
        output_markdown = gr.Markdown(label="OCR Result (Markdown)", visible=not extract_json_chk.value)
        output_json = gr.JSON(label="OCR Result (JSON)", visible=extract_json_chk.value)
    
    # Toggle visibility of output components
    def toggle_outputs(is_json):
        """Return visibility updates for (Markdown pane, JSON pane): show one, hide the other."""
        return (
            gr.update(visible=not is_json),
            gr.update(visible=is_json)
        )
    
    extract_json_chk.change(
        fn=toggle_outputs,
        inputs=[extract_json_chk],
        outputs=[output_markdown, output_json]
    )
    
    # Auto-generate preview on upload
    input_file.change(
        fn=generate_preview,
        inputs=[input_file],
        outputs=[preview_gallery]
    )
    
    # We use a trick to update the button state before starting the long-running task
    # Step 1 (start_processing) updates the buttons and clears the download;
    # step 2 (process_document) is chained with .then() and fills all five outputs.
    # The chained event is kept in submit_event so the stop button can cancel it.
    submit_event = submit_btn.click(
        fn=start_processing,
        outputs=[submit_btn, stop_btn, output_file]
    ).then(
        fn=process_document,
        inputs=[input_file, extract_json_chk],
        outputs=[output_markdown, output_json, submit_btn, stop_btn, output_file]
    )
    
    # Implementation of stop button - sets the internal flag and cancels the Gradio event
    stop_btn.click(
        fn=handle_stop,
        inputs=None,
        outputs=[submit_btn, stop_btn],
        cancels=[submit_event]
    )

    # Clear button logic
    # The outputs order must match the tuple returned by clear_interface.
    clear_btn.click(
        fn=clear_interface,
        inputs=None,
        outputs=[input_file, preview_gallery, output_file, output_markdown, output_json]
    )

# When run as a script, enable Gradio's queue on the app and then launch it.
if __name__ == "__main__":
    demo.queue().launch()