# Source: Docling_VLM_OCR / app.py (commit 60f1781, "Create app.py")
"""
File: app.py
This module provides a document processing interface using Docling and VLM OCR.
:author: Didier Guillevic
:email: didier.guillevic@gmail.com
:date: 2026-02-27
:license: Apache License 2.0
"""
import logging
import gradio as gr
import json
from pathlib import Path
from typing import Optional, Any
import os
# Read at import time: if MISTRAL_API_KEY is not set this raises KeyError
# before any of the UI below is built.
mistral_api_key = os.environ["MISTRAL_API_KEY"]
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream
# Import our local custom provider
from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel
from PIL import Image
# Setup logging
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the handlers in this file.
_log = logging.getLogger(__name__)
def generate_preview(file_path: str):
    """Build the images shown in the input preview gallery.

    Args:
        file_path: Path of the uploaded file. May be empty or ``None`` when
            the upload component has been cleared.

    Returns:
        A one-element list with the image for image uploads, a list with one
        image per page for PDFs, or ``None`` when there is no file, the file
        type is not handled, or the preview could not be generated.
    """
    if not file_path:
        return None

    path = Path(file_path)
    suffix = path.suffix.lower()

    if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
        try:
            # Image.open() is lazy and keeps the file open; copying the pixels
            # inside the context manager releases the handle before returning.
            with Image.open(path) as img:
                return [img.copy()]
        except Exception as exc:
            # A preview is best-effort: log and show nothing rather than fail
            # the upload event (same policy as the PDF branch below).
            _log.exception("Error generating preview: %s", exc)
            return None

    if suffix == ".pdf":
        # Render pages with Docling's pypdfium2 backend (already a dependency).
        # NOTE(review): relies on docling's InputDocument / PyPdfiumDocumentBackend
        # API (page_count, load_page, get_page_image, unload) -- confirm against
        # the installed docling version.
        from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
        from docling.datamodel.base_models import InputFormat
        from docling.datamodel.document import InputDocument

        try:
            in_doc = InputDocument(
                path_or_stream=path,
                format=InputFormat.PDF,
                backend=PyPdfiumDocumentBackend,
            )
            backend = in_doc._backend
            try:
                pages = []
                for page_no in range(backend.page_count()):
                    page = backend.load_page(page_no)
                    try:
                        pages.append(page.get_page_image())
                    finally:
                        page.unload()
                return pages
            finally:
                backend.unload()
        except Exception as exc:
            _log.exception("Error generating preview: %s", exc)
            return None

    return None
def _strip_code_fence(text: str) -> str:
    """Return *text* trimmed and without a wrapping Markdown code fence, if any."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the opening fence line (it may carry a language tag such as ```json).
    body = cleaned.splitlines()[1:]
    if body and body[-1].strip() == "```":
        body = body[:-1]
    return "\n".join(body).strip()


def _parse_json_payload(text: str) -> Any:
    """Locate the JSON payload inside the OCR output and parse it.

    The payload is taken from the first fenced block when there is one,
    otherwise from the span between the first ``{`` and the last ``}``,
    otherwise from the whole text.

    Raises:
        ValueError: if the payload cannot be parsed, even after clean-up.
    """
    import re

    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text)
    if fenced:
        payload = fenced.group(1).strip()
    else:
        braced = re.search(r"(\{[\s\S]*\})", text)
        payload = (braced.group(1) if braced else text).strip()

    # The repairs are lossy (they can damage string values that contain "\_"
    # or "//"), so try the least invasive candidate first.
    unescaped = payload.replace("\\_", "_")  # Markdown-escaped underscores
    # Strip single-line comments; "//" preceded by ":" (as in http://) is kept.
    uncommented = re.sub(r"(?<!:)\/\/.*", "", unescaped)

    last_error = None
    for candidate in (payload, unescaped, uncommented):
        try:
            return json.loads(candidate)
        except ValueError as err:
            last_error = err
    raise last_error


def process_document(file_path: str, extract_json: bool):
    """Run the VLM OCR pipeline on an uploaded file and report the result.

    This is a generator so it can be used as a Gradio event handler.

    Args:
        file_path: Path of the uploaded PDF or image; empty/``None`` when no
            file is selected.
        extract_json: When true, prompt the model for structured JSON and
            save the result with a ``.json`` extension instead of ``.md``.

    Yields:
        A 5-tuple, one value per output wired to this handler: the text
        result, the JSON result (or ``None``), an update restoring the
        submit button, an update hiding the stop button, and the path of
        the saved result file (or ``None``).
    """
    # Same label the submit button is created with, so the numbering of the
    # steps survives a run.
    idle_button = gr.update(value="3. Process Document", variant="primary", interactive=True)
    hide_stop = gr.update(visible=False)

    def failure(message: str):
        # The Markdown pane is hidden in JSON mode, so mirror the message
        # into the JSON pane to keep it visible to the user.
        json_value = {"error": message} if extract_json else None
        return message, json_value, idle_button, hide_stop, None

    if not file_path:
        yield failure("No file uploaded.")
        return

    _log.info("Processing file: %s, Extract JSON: %s", file_path, extract_json)
    reset_cancel()

    # Configure pipeline options
    prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."
    if extract_json:
        prompt = (
            "Extract the information from this document into a structured JSON format. "
            "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', "
            "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). "
            "Return ONLY the JSON object."
        )

    ocr_options = VlmOcrOptions(
        model="mistral-medium-latest",
        openai_base_url="https://api.mistral.ai/v1",
        openai_api_key=mistral_api_key,
        prompt=prompt,
        timeout=300.0,
    )
    pipeline_options = PdfPipelineOptions()
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_ocr = True

    # Initialize DocumentConverter with our custom pipeline for PDFs and images.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options,
            ),
        }
    )

    try:
        result = converter.convert(file_path)
        output_text = result.document.export_to_markdown()
        cleaned_text = _strip_code_fence(output_text)

        # Parse before saving so that a successful parse is what gets written
        # to the .json file (the raw text may still be Markdown-escaped).
        json_output = None
        file_content = cleaned_text
        if extract_json:
            try:
                json_output = _parse_json_payload(output_text)
            except ValueError as je:
                _log.warning("Could not parse result as JSON: %s", je)
                # Fallback to a dictionary showing the failure
                json_output = {"error": "Invalid JSON format", "raw": output_text}
            else:
                file_content = json.dumps(json_output, indent=2, ensure_ascii=False)

        # Save next to the input file, swapping the extension.
        input_path = Path(file_path)
        ext = ".json" if extract_json else ".md"
        output_path = input_path.parent / f"{input_path.stem}{ext}"
        # Explicit UTF-8: OCR output is not guaranteed to fit the platform default.
        output_path.write_text(file_content, encoding="utf-8")
        _log.info("Result saved to %s", output_path)
    except Exception as e:
        _log.exception("Error processing document: %s", e)
        yield failure(f"Error: {str(e)}")
        return

    yield cleaned_text, json_output, idle_button, hide_stop, str(output_path)
def start_processing():
    """Put the UI into its busy state before the long-running job starts.

    Returns:
        A 3-tuple: an update that disables and relabels the submit button,
        an update that shows the stop button, and ``None`` to clear the
        previous download file.
    """
    busy_button = gr.update(value="Processing...", variant="secondary", interactive=False)
    show_stop = gr.update(visible=True)
    previous_download = None  # Clear previous download file
    return busy_button, show_stop, previous_download
def handle_stop():
    """Request cancellation of the running job and restore the idle buttons.

    Returns:
        A 2-tuple: an update re-enabling the submit button and an update
        hiding the stop button.
    """
    request_cancel()
    return (
        # Same label the submit button is created with, so the step
        # numbering is not lost after a stop.
        gr.update(value="3. Process Document", variant="primary", interactive=True),
        gr.update(visible=False),
    )
def clear_interface():
    """Return the reset value for each component cleared by the Clear button.

    Returns:
        A 5-tuple, in order: input file, preview gallery, output file,
        Markdown output, JSON output.
    """
    reset_values = {
        "input_file": None,
        "preview_gallery": [],
        "output_file": None,
        "output_markdown": "",
        "output_json": None,
    }
    # Dicts keep insertion order, so the tuple follows the order listed above.
    return tuple(reset_values.values())
# Create Gradio interface
# NOTE(review): the nesting of the Row/Column sections below is reconstructed
# from the statement order -- confirm it matches the intended layout.
with gr.Blocks(title="Docling VLM OCR", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 📄 Docling VLM OCR")
    gr.Markdown("Upload an image or a PDF file to extract text or structured data.")
    # Upload widget and preview share one row (scale 1 and 2 respectively).
    with gr.Row():
        input_file = gr.File(
            label="1. Upload File",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            scale=1,
        )
        # Specifying height and preview=True for better interaction
        preview_gallery = gr.Gallery(
            label="Input Preview",
            columns=1,
            height=250,
            object_fit="contain",
            preview=True,
            allow_preview=True,
            scale=2,
        )
    extract_json_chk = gr.Checkbox(label="2. Extract as Structured JSON", value=False)
    with gr.Row():
        submit_btn = gr.Button("3. Process Document", variant="primary")
        # Hidden until a job is started by start_processing.
        stop_btn = gr.Button("Stop", variant="stop", visible=False)
        clear_btn = gr.Button("Clear", variant="secondary")
    output_file = gr.File(label="4. Download Result", interactive=False)
    # Only one of the two result panes is visible, depending on the checkbox.
    with gr.Column():
        output_markdown = gr.Markdown(label="OCR Result (Markdown)", visible=not extract_json_chk.value)
        output_json = gr.JSON(label="OCR Result (JSON)", visible=extract_json_chk.value)

    # Toggle visibility of output components
    def toggle_outputs(is_json: bool):
        """Return visibility updates for the Markdown pane and the JSON pane.

        The Markdown pane is shown when ``is_json`` is false, the JSON pane
        when it is true.
        """
        return (
            gr.update(visible=not is_json),
            gr.update(visible=is_json)
        )

    extract_json_chk.change(
        fn=toggle_outputs,
        inputs=[extract_json_chk],
        outputs=[output_markdown, output_json]
    )

    # Auto-generate preview on upload
    input_file.change(
        fn=generate_preview,
        inputs=[input_file],
        outputs=[preview_gallery]
    )

    # We use a trick to update the button state before starting the long-running task:
    # start_processing runs first, then process_document is chained with .then().
    submit_event = submit_btn.click(
        fn=start_processing,
        outputs=[submit_btn, stop_btn, output_file]
    ).then(
        fn=process_document,
        inputs=[input_file, extract_json_chk],
        # process_document must yield one value per component listed here (five).
        outputs=[output_markdown, output_json, submit_btn, stop_btn, output_file]
    )

    # Implementation of stop button - sets the internal flag and cancels the Gradio event
    stop_btn.click(
        fn=handle_stop,
        inputs=None,
        outputs=[submit_btn, stop_btn],
        cancels=[submit_event]
    )

    # Clear button logic: the outputs order must match the tuple returned by clear_interface.
    clear_btn.click(
        fn=clear_interface,
        inputs=None,
        outputs=[input_file, preview_gallery, output_file, output_markdown, output_json]
    )

# Launch the app (with queuing enabled) only when run as a script.
if __name__ == "__main__":
    demo.queue().launch()