Spaces:

Didier
/

Docling_VLM_OCR

Running

App Files Files Community

Didier committed on Feb 28

Commit

60f1781

verified ·

1 Parent(s): 58ba391

Create app.py

Browse files

Files changed (1) hide show

app.py +275 -0

app.py ADDED Viewed

	@@ -0,0 +1,275 @@

"""
File: app.py
Gradio application that exposes a document processing interface built on
Docling and a VLM-based OCR provider.

:author: Didier Guillevic
:email: didier.guillevic@gmail.com
:date: 2026-02-27
:license: Apache License 2.0
"""
# Standard library
import json
import logging
import os
from pathlib import Path
from typing import Any, Optional

# Third-party
import gradio as gr
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream
from PIL import Image

# Local custom OCR provider
from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel

# Read once at import time; a missing variable raises KeyError immediately.
mistral_api_key = os.environ["MISTRAL_API_KEY"]

# Module-wide logging configuration and logger.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)
def generate_preview(file_path: str):
    """Build the list of images shown in the input preview gallery.

    Args:
        file_path: Path of the uploaded file. May be empty or ``None`` when
            the upload component has been cleared.

    Returns:
        A one-element list with the decoded image for image uploads, a list
        with one entry per page for PDF uploads, or ``None`` when there is no
        file, the extension is not handled, or the file cannot be read.
    """
    if not file_path:
        return None

    path = Path(file_path)
    suffix = path.suffix.lower()

    if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
        try:
            with Image.open(path) as img:
                # Image.open is lazy and keeps the file open; copy() reads the
                # pixel data so the result stays valid once the file is closed.
                return [img.copy()]
        except Exception as e:
            # Same best-effort policy as the PDF branch: log and show nothing.
            _log.exception("Error generating preview: %s", e)
            return None

    # If PDF, extract pages using Docling's backend (which is already a dependency)
    if suffix == ".pdf":
        from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
        from docling.datamodel.base_models import DocumentStream

        try:
            with open(path, "rb") as f:
                stream = DocumentStream(name=path.name, stream=f)
                # NOTE(review): the backend is constructed from a placeholder
                # Path plus a DocumentStream wrapping the open file handle;
                # confirm this matches the constructor and the
                # page_count()/get_page_image() API of the installed docling.
                backend = PyPdfiumDocumentBackend(Path(""), stream)
                return [
                    backend.get_page_image(page_index)
                    for page_index in range(backend.page_count())
                ]
        except Exception as e:
            _log.exception("Error generating preview: %s", e)
            return None

    return None
def _strip_code_fence(text: str) -> str:
    """Return *text* stripped of whitespace and of one enclosing Markdown fence.

    When the text starts with a fence, its first line (which may carry a
    language tag such as ``json``) is dropped; the last line is dropped only
    when it is a bare closing fence.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    body_lines = stripped.splitlines()[1:]
    if body_lines and body_lines[-1].strip() == "```":
        body_lines.pop()
    return "\n".join(body_lines).strip()


def _parse_json_result(raw_text: str) -> Any:
    """Decode the JSON payload embedded in the model output.

    The candidate is the content of the first fenced block, else the span from
    the first ``{`` to the last ``}``, else the whole text. It is parsed as-is
    first; only if that fails are Markdown-escaped underscores and ``//``
    comments removed before a second attempt, so valid JSON that contains
    ``//`` inside a string is never damaged by the clean-up.

    Raises:
        ValueError: If the text cannot be decoded even after clean-up.
    """
    import re

    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_text)
    if fenced:
        candidate = fenced.group(1).strip()
    else:
        braced = re.search(r"(\{[\s\S]*\})", raw_text)
        candidate = braced.group(1).strip() if braced else raw_text.strip()

    try:
        return json.loads(candidate)
    except ValueError:
        pass

    # Remove Markdown escaped underscores
    repaired = candidate.replace("\\_", "_")
    # Remove single line comments; the lookbehind skips "//" preceded by ":"
    # so that http:// style URLs are left alone.
    repaired = re.sub(r"(?<!:)\/\/.*", "", repaired)
    return json.loads(repaired)


def _idle_controls():
    """Return updates that re-enable the submit button and hide the stop button."""
    return (
        gr.update(value="3. Process Document", variant="primary", interactive=True),
        gr.update(visible=False),
    )


def process_document(file_path: str, extract_json: bool):
    """Run the VLM OCR pipeline on an uploaded file and report the result.

    This is a generator used as a Gradio event handler. Every yielded tuple
    has five items, in this order: Markdown text, JSON value, submit-button
    update, stop-button update, and the path of the downloadable result file
    (``None`` when there is none).

    Args:
        file_path: Path of the uploaded PDF or image; empty or ``None`` when
            nothing was uploaded.
        extract_json: When true, prompt the model for structured JSON and save
            the result with a ``.json`` extension instead of ``.md``.

    Yields:
        A single five-item tuple describing success, failure, or the
        "no file" case. Conversion errors are logged and reported in the
        first item rather than raised.
    """
    if not file_path:
        # Five values, like every other yield of this handler.
        yield ("No file uploaded.", None, *_idle_controls(), None)
        return

    _log.info("Processing file: %s, Extract JSON: %s", file_path, extract_json)
    # Presumably clears a stop request left over from an earlier run.
    reset_cancel()

    # Configure pipeline options
    if extract_json:
        prompt = (
            "Extract the information from this document into a structured JSON format. "
            "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', "
            "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). "
            "Return ONLY the JSON object."
        )
    else:
        prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."

    ocr_options = VlmOcrOptions(
        model="mistral-medium-latest",
        openai_base_url="https://api.mistral.ai/v1",
        openai_api_key=mistral_api_key,
        prompt=prompt,
        timeout=300.0,
    )
    pipeline_options = PdfPipelineOptions()
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_ocr = True

    # PDFs and images both go through the custom pipeline; each format gets
    # its own option object.
    converter = DocumentConverter(
        format_options={
            input_format: PdfFormatOption(
                pipeline_cls=LocalVlmPdfPipeline,
                pipeline_options=pipeline_options,
            )
            for input_format in (InputFormat.PDF, InputFormat.IMAGE)
        }
    )

    try:
        result = converter.convert(file_path)
        output_text = result.document.export_to_markdown()
        cleaned_text = _strip_code_fence(output_text)

        # Save the result next to the input, swapping only the extension.
        output_path = Path(file_path).with_suffix(".json" if extract_json else ".md")
        # Explicit encoding so the file does not depend on the platform default.
        output_path.write_text(cleaned_text, encoding="utf-8")
        _log.info("Result saved to %s", output_path)

        json_output = None
        if extract_json:
            try:
                json_output = _parse_json_result(output_text)
            except ValueError as je:
                _log.warning("Could not parse result as JSON: %s", je)
                # Fallback to a dictionary showing the failure
                json_output = {"error": "Invalid JSON format", "raw": output_text}
    except Exception as e:
        # UI boundary: report the failure in the interface instead of raising.
        _log.exception("Error processing document: %s", e)
        yield (f"Error: {e}", None, *_idle_controls(), None)
        return

    yield (cleaned_text, json_output, *_idle_controls(), str(output_path))
def start_processing():
    """Return the UI updates applied just before a processing run starts.

    Returns:
        A three-item tuple: an update that relabels and disables the submit
        button, an update that shows the stop button, and ``None`` to clear
        the previous download file.
    """
    busy_button = gr.update(value="Processing...", variant="secondary", interactive=False)
    stop_shown = gr.update(visible=True)
    cleared_download = None
    return busy_button, stop_shown, cleared_download
def handle_stop():
    """Request cancellation of the running job and restore the idle controls.

    Returns:
        A two-item tuple: an update that re-enables the submit button with the
        same label it is created with, and an update that hides the stop
        button.
    """
    request_cancel()
    # Use the button's original numbered label so the step numbering in the
    # interface stays consistent after a stop.
    submit_update = gr.update(value="3. Process Document", variant="primary", interactive=True)
    stop_update = gr.update(visible=False)
    return submit_update, stop_update
def clear_interface():
    """Return the blank values that reset the interface.

    Returns:
        A five-item tuple, in order: input file, preview gallery, output
        file, Markdown output, JSON output.
    """
    input_file_value = None
    gallery_items = []
    output_file_value = None
    markdown_text = ""
    json_value = None
    return (
        input_file_value,
        gallery_items,
        output_file_value,
        markdown_text,
        json_value,
    )
# Assemble the Gradio UI and wire its events.
with gr.Blocks(title="Docling VLM OCR", theme=gr.themes.Default()) as demo:
    gr.Markdown("# 📄 Docling VLM OCR")
    gr.Markdown("Upload an image or a PDF file to extract text or structured data.")

    # Step 1: upload widget beside a gallery that previews the uploaded pages.
    with gr.Row():
        input_file = gr.File(
            label="1. Upload File",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            scale=1,
        )
        preview_gallery = gr.Gallery(
            label="Input Preview",
            columns=1,
            height=250,
            object_fit="contain",
            preview=True,
            allow_preview=True,
            scale=2,
        )

    # Step 2: choose between plain transcription and structured JSON.
    extract_json_chk = gr.Checkbox(label="2. Extract as Structured JSON", value=False)

    # Step 3: action buttons; Stop stays hidden until a run is in progress.
    with gr.Row():
        submit_btn = gr.Button("3. Process Document", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop", visible=False)
        clear_btn = gr.Button("Clear", variant="secondary")

    # Step 4: downloadable result file and the two alternative result panes.
    output_file = gr.File(label="4. Download Result", interactive=False)
    with gr.Column():
        output_markdown = gr.Markdown(
            label="OCR Result (Markdown)",
            visible=not extract_json_chk.value,
        )
        output_json = gr.JSON(
            label="OCR Result (JSON)",
            visible=extract_json_chk.value,
        )

    def toggle_outputs(is_json):
        """Show the JSON pane when *is_json* is true, the Markdown pane otherwise."""
        markdown_pane = gr.update(visible=not is_json)
        json_pane = gr.update(visible=is_json)
        return markdown_pane, json_pane

    # Swap the visible result pane whenever the checkbox changes.
    extract_json_chk.change(
        fn=toggle_outputs,
        inputs=[extract_json_chk],
        outputs=[output_markdown, output_json],
    )

    # Refresh the preview gallery whenever the uploaded file changes.
    input_file.change(
        fn=generate_preview,
        inputs=[input_file],
        outputs=[preview_gallery],
    )

    # Two chained steps: first update the buttons, then run the long task.
    submit_event = submit_btn.click(
        fn=start_processing,
        outputs=[submit_btn, stop_btn, output_file],
    ).then(
        fn=process_document,
        inputs=[input_file, extract_json_chk],
        outputs=[output_markdown, output_json, submit_btn, stop_btn, output_file],
    )

    # Stop: run handle_stop and cancel the pending processing event.
    stop_btn.click(
        fn=handle_stop,
        inputs=None,
        outputs=[submit_btn, stop_btn],
        cancels=[submit_event],
    )

    # Clear: reset the input, the preview and every result component.
    clear_btn.click(
        fn=clear_interface,
        inputs=None,
        outputs=[input_file, preview_gallery, output_file, output_markdown, output_json],
    )
# Script entry point: turn on the event queue, then start the app.
if __name__ == "__main__":
    queued_demo = demo.queue()
    queued_demo.launch()