""" File: docling_app.py This module provides a document processing interface using Docling and VLM OCR. :author: Didier Guillevic :email: didier.guillevic@gmail.com :date: 2026-02-27 :license: Apache License 2.0 """ import logging import gradio as gr import json from pathlib import Path from typing import Optional, Any import os mistral_api_key = os.environ["MISTRAL_API_KEY"] from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream # Import our local custom provider from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel from PIL import Image # Setup logging logging.basicConfig(level=logging.INFO) _log = logging.getLogger(__name__) def generate_preview(file_path: str): if not file_path: return None path = Path(file_path) # Check if image if path.suffix.lower() in [".png", ".jpg", ".jpeg", ".bmp", ".tiff"]: return [Image.open(path)] # If PDF, extract pages using Docling's backend (which is already a dependency) if path.suffix.lower() == ".pdf": from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import DocumentStream try: with open(path, "rb") as f: stream = DocumentStream(name=path.name, stream=f) backend = PyPdfiumDocumentBackend(Path(""), stream) # Path doesn't matter for pypdfium pages = [] for i in range(backend.page_count()): page_image = backend.get_page_image(i) pages.append(page_image) return pages except Exception as e: _log.error(f"Error generating preview: {e}") return None return None def process_document(file_path: str, extract_json: bool): if not file_path: # Returning path as None for the file component yield "No file uploaded.", gr.update(value="Process Document", variant="primary", interactive=True), gr.update(visible=False), None return _log.info(f"Processing file: {file_path}, Extract JSON: {extract_json}") reset_cancel() # Configure pipeline options prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate." if extract_json: prompt = ( "Extract the information from this document into a structured JSON format. " "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', " "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). " "Return ONLY the JSON object." ) ocr_options = VlmOcrOptions( model="mistral-medium-latest", openai_base_url="https://api.mistral.ai/v1", openai_api_key=mistral_api_key, prompt=prompt, timeout=300.0 ) pipeline_options = PdfPipelineOptions() pipeline_options.ocr_options = ocr_options pipeline_options.do_ocr = True # Initialize DocumentConverter with our custom pipeline converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( pipeline_cls=LocalVlmPdfPipeline, pipeline_options=pipeline_options ), InputFormat.IMAGE: PdfFormatOption( pipeline_cls=LocalVlmPdfPipeline, pipeline_options=pipeline_options ), } ) try: # Process the document result = converter.convert(file_path) output_text = result.document.export_to_markdown() # Strip triple backticks if present cleaned_text = output_text.strip() if cleaned_text.startswith("```"): lines = cleaned_text.splitlines() if lines[0].startswith("```"): # If it's JSON, the first line might be ```json lines = lines[1:] if lines and lines[-1].strip() == "```": lines = lines[:-1] cleaned_text = "\n".join(lines).strip() # Determine output filename input_path = Path(file_path) ext = ".json" if extract_json else ".md" output_filename = input_path.stem + ext output_path = input_path.parent / output_filename with open(output_path, "w") as f: f.write(cleaned_text) _log.info(f"Result saved to {output_path}") # Prepare JSON output if requested json_output = None if extract_json: import re try: # 1. Try to find content within triple backticks json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", output_text) if json_match: json_str = json_match.group(1).strip() else: # 2. Try to find the first '{' and last '}' json_str_match = re.search(r"(\{[\s\S]*\})", output_text) if json_str_match: json_str = json_str_match.group(1).strip() else: json_str = output_text.strip() # 3. Clean up the JSON string # Remove Markdown escaped underscores json_str = json_str.replace("\\_", "_") # Remove single line comments (but be careful not to remove http:// urls) # This regex looks for // that is not preceded by : json_str = re.sub(r"(?