"""
File: docling_app.py

This module provides a document processing interface using Docling and VLM OCR.

:author: Didier Guillevic
:email: didier.guillevic@gmail.com
:date: 2026-02-27
:license: Apache License 2.0
"""
# --- Standard library imports ---
import json
import logging
import os
from pathlib import Path
from typing import Optional, Any

# --- Third-party imports ---
import gradio as gr
from PIL import Image

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream

# --- Local custom provider ---
from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel

# Fail fast at import time when the key is absent (raises KeyError), matching
# the original behavior; the app cannot do anything useful without it.
mistral_api_key = os.environ["MISTRAL_API_KEY"]

# Setup logging
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)
def generate_preview(file_path: str):
    """Build a list of PIL images to show as a preview of the uploaded file.

    :param file_path: Path to the uploaded file (image or PDF). Falsy values
        are treated as "nothing uploaded".
    :return: A list of images (one per PDF page, or the image itself), or
        ``None`` when the file is missing, unsupported, or preview fails.
    """
    if not file_path:
        return None
    path = Path(file_path)
    suffix = path.suffix.lower()

    # Plain images: the gallery just shows the file itself.
    if suffix in [".png", ".jpg", ".jpeg", ".bmp", ".tiff"]:
        return [Image.open(path)]

    # If PDF, extract pages using Docling's backend (which is already a dependency)
    if suffix == ".pdf":
        from io import BytesIO
        from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
        from docling.datamodel.base_models import DocumentStream
        try:
            # BUG FIX: DocumentStream expects an in-memory BytesIO, not an
            # open file handle (which would also be closed before use).
            data = BytesIO(path.read_bytes())
            stream = DocumentStream(name=path.name, stream=data)
            # NOTE(review): argument order matches the original code; confirm
            # against the installed docling version's backend constructor.
            backend = PyPdfiumDocumentBackend(Path(""), stream)
            pages = []
            for page_no in range(backend.page_count()):
                # BUG FIX: page images come from the page backend, not the
                # document backend — load the page first.
                page = backend.load_page(page_no)
                pages.append(page.get_page_image())
            return pages
        except Exception as e:
            _log.error(f"Error generating preview: {e}")
            return None
    return None
def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) if present."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        lines = cleaned.splitlines()
        if lines[0].startswith("```"):
            # If it's JSON, the first line might be ```json
            lines = lines[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        cleaned = "\n".join(lines).strip()
    return cleaned


def _parse_json_result(output_text: str):
    """Best-effort extraction of a JSON object from a model response.

    :param output_text: Raw text returned by the model.
    :return: The parsed object, or ``{"error": ..., "raw": ...}`` when the
        response cannot be parsed as JSON.
    """
    import re
    try:
        # 1. Try to find content within triple backticks
        json_match = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", output_text)
        if json_match:
            json_str = json_match.group(1).strip()
        else:
            # 2. Try to find the first '{' and last '}'
            brace_match = re.search(r"(\{[\s\S]*\})", output_text)
            json_str = brace_match.group(1).strip() if brace_match else output_text.strip()
        # 3. Clean up the JSON string:
        #    - Markdown-escaped underscores,
        json_str = json_str.replace("\\_", "_")
        #    - single-line // comments (the (?<!:) guard keeps http:// URLs).
        json_str = re.sub(r"(?<!:)\/\/.*", "", json_str)
        return json.loads(json_str)
    except Exception as je:
        _log.warning(f"Could not parse result as JSON: {je}")
        # Fallback to a dictionary showing the failure
        return {"error": "Invalid JSON format", "raw": output_text}


def process_document(file_path: str, extract_json: bool):
    """Run the Docling + VLM OCR pipeline over *file_path* and yield UI updates.

    Yields a 5-tuple matching the Gradio outputs:
    (markdown_text, json_output, submit_btn update, stop_btn update, download_path).

    :param file_path: Path of the uploaded file; falsy means nothing uploaded.
    :param extract_json: When True, prompt the model for structured JSON
        instead of a plain transcription.
    """
    if not file_path:
        # BUG FIX: the original yielded only 4 values here while the event
        # handler wires 5 outputs; include the (empty) JSON slot.
        yield (
            "No file uploaded.",
            None,
            gr.update(value="Process Document", variant="primary", interactive=True),
            gr.update(visible=False),
            None,
        )
        return

    _log.info(f"Processing file: {file_path}, Extract JSON: {extract_json}")
    reset_cancel()

    # Prompt selection: plain transcription vs. structured JSON extraction.
    prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."
    if extract_json:
        prompt = (
            "Extract the information from this document into a structured JSON format. "
            "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', "
            "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). "
            "Return ONLY the JSON object."
        )

    # OCR goes through the Mistral OpenAI-compatible endpoint.
    ocr_options = VlmOcrOptions(
        model="mistral-medium-latest",
        openai_base_url="https://api.mistral.ai/v1",
        openai_api_key=mistral_api_key,
        prompt=prompt,
        timeout=300.0,
    )
    pipeline_options = PdfPipelineOptions()
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_ocr = True

    # Both PDFs and images are routed through the custom VLM pipeline.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options,
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options,
            ),
        }
    )

    try:
        # Process the document
        result = converter.convert(file_path)
        output_text = result.document.export_to_markdown()
        cleaned_text = _strip_code_fence(output_text)

        # Save the result next to the input, with an extension matching the mode.
        input_path = Path(file_path)
        ext = ".json" if extract_json else ".md"
        output_path = input_path.parent / (input_path.stem + ext)
        # BUG FIX: write with an explicit encoding; the platform default may
        # not be able to encode arbitrary OCR output.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)
        _log.info(f"Result saved to {output_path}")

        # Prepare JSON output if requested.
        json_output = _parse_json_result(output_text) if extract_json else None

        yield (
            cleaned_text,
            json_output,
            gr.update(value="Process Document", variant="primary", interactive=True),
            gr.update(visible=False),
            str(output_path),
        )
    except Exception as e:
        _log.error(f"Error processing document: {e}")
        yield f"Error: {str(e)}", None, gr.update(value="Process Document", variant="primary", interactive=True), gr.update(visible=False), None
def start_processing():
    """Switch the UI into its busy state before the long-running OCR job starts."""
    busy_button = gr.update(value="Processing...", variant="secondary", interactive=False)
    visible_stop = gr.update(visible=True)
    cleared_download = None  # drop any previously produced download file
    return busy_button, visible_stop, cleared_download
def handle_stop():
    """Request cancellation of the running job and restore the idle UI state."""
    request_cancel()
    idle_button = gr.update(value="Process Document", variant="primary", interactive=True)
    hidden_stop = gr.update(visible=False)
    return idle_button, hidden_stop
def clear_interface():
    """Reset every widget to its empty state.

    The tuple order matches the clear button's outputs:
    input_file, preview_gallery, output_file, output_markdown, output_json.
    """
    return None, [], None, "", None
# Create Gradio interface
with gr.Blocks(title="Docling VLM OCR", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 📄 Docling VLM OCR")
    gr.Markdown("Upload an image or a PDF file to extract text or structured data.")
    with gr.Row():
        input_file = gr.File(
            label="1. Upload File",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            scale=1,
        )
        # Specifying height and preview=True for better interaction
        preview_gallery = gr.Gallery(
            label="Input Preview",
            columns=1,
            height=250,
            object_fit="contain",
            preview=True,
            allow_preview=True,
            scale=2,
        )
    extract_json_chk = gr.Checkbox(label="2. Extract as Structured JSON", value=False)
    with gr.Row():
        submit_btn = gr.Button("3. Process Document", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop", visible=False)
        clear_btn = gr.Button("Clear", variant="secondary")
    output_file = gr.File(label="4. Download Result", interactive=False)
    with gr.Column():
        # Only one of the two result views is visible at a time; the initial
        # visibility mirrors the checkbox's initial value.
        output_markdown = gr.Markdown(label="OCR Result (Markdown)", visible=not extract_json_chk.value)
        output_json = gr.JSON(label="OCR Result (JSON)", visible=extract_json_chk.value)

    # Toggle visibility of output components
    def toggle_outputs(is_json):
        # Markdown view when unchecked, JSON view when checked.
        return (
            gr.update(visible=not is_json),
            gr.update(visible=is_json)
        )

    extract_json_chk.change(
        fn=toggle_outputs,
        inputs=[extract_json_chk],
        outputs=[output_markdown, output_json]
    )
    # Auto-generate preview on upload
    input_file.change(
        fn=generate_preview,
        inputs=[input_file],
        outputs=[preview_gallery]
    )
    # We use a trick to update the button state before starting the long-running task:
    # the first (fast) step disables the submit button and reveals "Stop",
    # then .then() chains the actual processing generator.
    submit_event = submit_btn.click(
        fn=start_processing,
        outputs=[submit_btn, stop_btn, output_file]
    ).then(
        fn=process_document,
        inputs=[input_file, extract_json_chk],
        outputs=[output_markdown, output_json, submit_btn, stop_btn, output_file]
    )
    # Implementation of stop button - sets the internal flag and cancels the Gradio event
    stop_btn.click(
        fn=handle_stop,
        inputs=None,
        outputs=[submit_btn, stop_btn],
        cancels=[submit_event]
    )
    # Clear button logic
    clear_btn.click(
        fn=clear_interface,
        inputs=None,
        outputs=[input_file, preview_gallery, output_file, output_markdown, output_json]
    )

if __name__ == "__main__":
    # queue() enables generator outputs and event cancellation.
    demo.queue().launch()