Spaces:

Didier
/

Docling_VLM_OCR

Running

App Files Files Community

Docling_VLM_OCR / app.py

Didier

Create app.py

60f1781 verified about 1 month ago

raw

history blame contribute delete

9.93 kB

	"""
	File: app.py

	This module provides a document processing interface using Docling and VLM OCR.

	:author: Didier Guillevic
	:email: didier.guillevic@gmail.com
	:date: 2026-02-27
	:license: Apache License 2.0
	"""
	import logging
	import gradio as gr
	import json
	from pathlib import Path
	from typing import Optional, Any
	import os

	# Read the Mistral API key from the environment at import time; indexing
	# os.environ raises KeyError if MISTRAL_API_KEY is not set.
	mistral_api_key = os.environ["MISTRAL_API_KEY"]

	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.pipeline_options import PdfPipelineOptions
	from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream

	# Import our local custom provider
	from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel
	from PIL import Image

	# Set up logging: configure the root logger at INFO level and create the
	# module-level logger used by the handlers in this file.
	logging.basicConfig(level=logging.INFO)
	_log = logging.getLogger(__name__)

	def generate_preview(file_path: str) -> Optional[list]:
	    """Build the list of preview images shown for an uploaded file.

	    Args:
	        file_path: Path of the uploaded file. May be empty or None, e.g.
	            when the upload component has been cleared.

	    Returns:
	        A one-element list with the opened PIL image for raster uploads, a
	        list with one entry per page for PDFs, or None when there is no
	        file, the extension is not handled, or the preview could not be
	        produced (the failure is logged with its traceback).
	    """
	    if not file_path:
	        return None

	    path = Path(file_path)
	    suffix = path.suffix.lower()

	    if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
	        # A broken preview should not surface as an unhandled error, so
	        # image failures are logged and swallowed like PDF failures below.
	        try:
	            return [Image.open(path)]
	        except Exception as e:
	            _log.exception("Error generating preview: %s", e)
	            return None

	    if suffix == ".pdf":
	        # Function-scope imports: only the PDF branch needs them.
	        from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
	        from docling.datamodel.base_models import DocumentStream

	        try:
	            # The file stays open while the pages are read from the stream.
	            with open(path, "rb") as f:
	                # NOTE(review): confirm that the installed docling version
	                # accepts a DocumentStream here and exposes
	                # get_page_image(i) on the backend; any failure lands in
	                # the except branch and silently disables the PDF preview.
	                stream = DocumentStream(name=path.name, stream=f)
	                # Placeholder path; per the original author it is not used
	                # by pypdfium when a stream is supplied.
	                backend = PyPdfiumDocumentBackend(Path(""), stream)
	                return [
	                    backend.get_page_image(i)
	                    for i in range(backend.page_count())
	                ]
	        except Exception as e:
	            _log.exception("Error generating preview: %s", e)
	            return None

	    return None

	def _strip_code_fence(text: str) -> str:
	    """Return *text* stripped, without a wrapping Markdown code fence."""
	    cleaned = text.strip()
	    if not cleaned.startswith("```"):
	        return cleaned
	    # Drop the opening fence line (it may carry a language tag such as
	    # ```json) and the closing fence line when present.
	    lines = cleaned.splitlines()[1:]
	    if lines and lines[-1].strip() == "```":
	        lines = lines[:-1]
	    return "\n".join(lines).strip()


	def _parse_json_result(output_text: str) -> Any:
	    """Extract and parse the JSON payload contained in *output_text*.

	    Candidate spans are tried in order: the content of the first fenced
	    code block, the span from the first '{' to the last '}', and finally
	    the whole text. Each candidate is parsed as-is first; clean-ups are
	    applied only if that fails, so valid JSON is never altered.

	    Raises:
	        json.JSONDecodeError: If none of the candidates parses as JSON.
	    """
	    import re

	    sources = []
	    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", output_text)
	    if fenced:
	        sources.append(fenced.group(1))
	    braces = re.search(r"(\{[\s\S]*\})", output_text)
	    if braces:
	        sources.append(braces.group(1))
	    sources.append(output_text)

	    last_error = None
	    for source in sources:
	        text = source.strip()
	        # Undo Markdown-escaped underscores (backslash + underscore).
	        unescaped = text.replace("\\_", "_")
	        # Remove single-line comments; the look-behind keeps the "//" of
	        # "http://" style URLs.
	        uncommented = re.sub(r"(?<!:)\/\/.*", "", unescaped)
	        # dict.fromkeys de-duplicates while keeping the order of attempts.
	        for attempt in dict.fromkeys((text, unescaped, uncommented)):
	            try:
	                return json.loads(attempt)
	            except json.JSONDecodeError as err:
	                last_error = err
	    raise last_error


	def process_document(file_path: str, extract_json: bool):
	    """Run the VLM OCR pipeline on an uploaded file and yield the UI update.

	    This is a generator used as a Gradio event handler. Every yielded tuple
	    has five items, matching the five output components it is wired to:
	    Markdown text, JSON value, submit-button update, stop-button update and
	    the path of the downloadable result file (or None).

	    Args:
	        file_path: Path of the uploaded PDF or image; empty/None when
	            nothing was uploaded.
	        extract_json: When true, prompt the model for structured JSON and
	            parse the answer; otherwise request a plain transcription.

	    Yields:
	        One five-item tuple as described above. Conversion errors are
	        caught, logged and reported through the Markdown slot.
	    """
	    # Idle state of the two buttons, shared by every exit path. The label
	    # matches the one the submit button is created with.
	    submit_ready = gr.update(value="3. Process Document", variant="primary", interactive=True)
	    stop_hidden = gr.update(visible=False)

	    if not file_path:
	        # Five values, like every other yield: the JSON slot was missing.
	        yield "No file uploaded.", None, submit_ready, stop_hidden, None
	        return

	    _log.info(f"Processing file: {file_path}, Extract JSON: {extract_json}")
	    # NOTE(review): presumably clears a cancel request left over from an
	    # earlier run; confirm against vlm_ocr.
	    reset_cancel()

	    # Configure pipeline options
	    prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."
	    if extract_json:
	        prompt = (
	            "Extract the information from this document into a structured JSON format. "
	            "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', "
	            "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). "
	            "Return ONLY the JSON object."
	        )

	    ocr_options = VlmOcrOptions(
	        model="mistral-medium-latest",
	        openai_base_url="https://api.mistral.ai/v1",
	        openai_api_key=mistral_api_key,
	        prompt=prompt,
	        timeout=300.0,
	    )

	    pipeline_options = PdfPipelineOptions()
	    pipeline_options.ocr_options = ocr_options
	    pipeline_options.do_ocr = True

	    # PDFs and images both go through the custom VLM pipeline.
	    converter = DocumentConverter(
	        format_options={
	            InputFormat.PDF: PdfFormatOption(
	                pipeline_cls=LocalVlmPdfPipeline,
	                pipeline_options=pipeline_options,
	            ),
	            InputFormat.IMAGE: PdfFormatOption(
	                pipeline_cls=LocalVlmPdfPipeline,
	                pipeline_options=pipeline_options,
	            ),
	        }
	    )

	    try:
	        result = converter.convert(file_path)
	        output_text = result.document.export_to_markdown()
	        cleaned_text = _strip_code_fence(output_text)

	        json_output = None
	        file_content = cleaned_text
	        if extract_json:
	            try:
	                json_output = _parse_json_result(output_text)
	            except Exception as je:
	                _log.warning(f"Could not parse result as JSON: {je}")
	                # Fallback to a dictionary showing the failure
	                json_output = {"error": "Invalid JSON format", "raw": output_text}
	            else:
	                # Save the parsed JSON so the .json download is valid JSON
	                # rather than the Markdown-escaped text.
	                file_content = json.dumps(json_output, indent=2, ensure_ascii=False)

	        # The result is written next to the uploaded file.
	        input_path = Path(file_path)
	        ext = ".json" if extract_json else ".md"
	        output_path = input_path.parent / (input_path.stem + ext)

	        # Explicit UTF-8: OCR output is not limited to the platform's
	        # default encoding.
	        with open(output_path, "w", encoding="utf-8") as f:
	            f.write(file_content)

	        _log.info(f"Result saved to {output_path}")

	        yield cleaned_text, json_output, submit_ready, stop_hidden, str(output_path)
	    except Exception as e:
	        # Top-level boundary of the handler: log with traceback and report
	        # the error in the UI instead of raising.
	        _log.exception("Error processing document: %s", e)
	        yield f"Error: {str(e)}", None, submit_ready, stop_hidden, None

	def start_processing():
	    """Put the controls into their busy state before the long task starts.

	    Returns:
	        A three-item tuple: an update that relabels and disables the submit
	        button, an update that shows the stop button, and None to clear the
	        previous download file.
	    """
	    busy_submit = gr.update(value="Processing...", variant="secondary", interactive=False)
	    stop_shown = gr.update(visible=True)
	    no_download = None  # Clear previous download file
	    return busy_submit, stop_shown, no_download

	def handle_stop():
	    """Call request_cancel() and put the buttons back into their idle state.

	    Returns:
	        A pair of Gradio updates: the submit button re-enabled with its
	        idle label, and the stop button hidden.
	    """
	    request_cancel()
	    # Use the same "3. Process Document" label the button is created with,
	    # so the numbered step does not disappear after a cancelled run.
	    submit_ready = gr.update(value="3. Process Document", variant="primary", interactive=True)
	    stop_hidden = gr.update(visible=False)
	    return submit_ready, stop_hidden

	def clear_interface():
	    """Return the reset values for the components wired to the Clear button.

	    Returns:
	        A five-item tuple, in order: input_file, preview_gallery,
	        output_file, output_markdown, output_json.
	    """
	    input_file_value = None
	    preview_gallery_value = []
	    output_file_value = None
	    output_markdown_value = ""
	    output_json_value = None
	    return (
	        input_file_value,
	        preview_gallery_value,
	        output_file_value,
	        output_markdown_value,
	        output_json_value,
	    )

	# Build the Gradio UI: layout first, then the event wiring.
	with gr.Blocks(title="Docling VLM OCR", theme=gr.themes.Default()) as demo:
	    gr.Markdown("# 📄 Docling VLM OCR")
	    gr.Markdown("Upload an image or a PDF file to extract text or structured data.")

	    # Step 1: upload widget with a preview of the uploaded pages beside it.
	    with gr.Row():
	        input_file = gr.File(
	            label="1. Upload File",
	            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
	            scale=1,
	        )
	        # Fixed height and preview mode for better interaction.
	        preview_gallery = gr.Gallery(
	            label="Input Preview",
	            columns=1,
	            height=250,
	            object_fit="contain",
	            preview=True,
	            allow_preview=True,
	            scale=2,
	        )

	    # Step 2: choose between plain transcription and structured JSON.
	    extract_json_chk = gr.Checkbox(label="2. Extract as Structured JSON", value=False)

	    # Step 3: action buttons; the stop button is hidden until a run starts.
	    with gr.Row():
	        submit_btn = gr.Button("3. Process Document", variant="primary")
	        stop_btn = gr.Button("Stop", variant="stop", visible=False)
	        clear_btn = gr.Button("Clear", variant="secondary")

	    # Step 4: download of the saved result.
	    output_file = gr.File(label="4. Download Result", interactive=False)

	    # Exactly one of the two result panes is visible, following the checkbox.
	    with gr.Column():
	        output_markdown = gr.Markdown(label="OCR Result (Markdown)", visible=not extract_json_chk.value)
	        output_json = gr.JSON(label="OCR Result (JSON)", visible=extract_json_chk.value)

	    def toggle_outputs(is_json):
	        """Show the JSON pane when *is_json* is true, else the Markdown pane."""
	        markdown_update = gr.update(visible=not is_json)
	        json_update = gr.update(visible=is_json)
	        return markdown_update, json_update

	    extract_json_chk.change(
	        fn=toggle_outputs,
	        inputs=[extract_json_chk],
	        outputs=[output_markdown, output_json],
	    )

	    # Regenerate the preview whenever the uploaded file changes.
	    input_file.change(
	        fn=generate_preview,
	        inputs=[input_file],
	        outputs=[preview_gallery],
	    )

	    # Two chained steps: first switch the buttons to their busy state, then
	    # run the long task, so the UI updates before processing starts.
	    _submit_clicked = submit_btn.click(
	        fn=start_processing,
	        outputs=[submit_btn, stop_btn, output_file],
	    )
	    submit_event = _submit_clicked.then(
	        fn=process_document,
	        inputs=[input_file, extract_json_chk],
	        outputs=[output_markdown, output_json, submit_btn, stop_btn, output_file],
	    )

	    # Stop button: runs handle_stop and cancels the pending Gradio event.
	    stop_btn.click(
	        fn=handle_stop,
	        inputs=None,
	        outputs=[submit_btn, stop_btn],
	        cancels=[submit_event],
	    )

	    # Clear button: reset the input, the preview and every result component.
	    clear_btn.click(
	        fn=clear_interface,
	        inputs=None,
	        outputs=[input_file, preview_gallery, output_file, output_markdown, output_json],
	    )

	# Launch with the queue enabled only when executed as a script.
	if __name__ == "__main__":
	    demo.queue().launch()