Didier committed on
Commit
60f1781
·
verified ·
1 Parent(s): 58ba391

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +275 -0
app.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: app.py
3
+
4
+ This module provides a document processing interface using Docling and VLM OCR.
5
+
6
+ :author: Didier Guillevic
7
+ :email: didier.guillevic@gmail.com
8
+ :date: 2026-02-27
9
+ :license: Apache License 2.0
10
+ """
11
+ import logging
12
+ import gradio as gr
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Optional, Any
16
+ import os
17
+
18
+ mistral_api_key = os.environ["MISTRAL_API_KEY"]
19
+
20
+ from docling.datamodel.base_models import InputFormat
21
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
22
+ from docling.document_converter import DocumentConverter, PdfFormatOption, DocumentStream
23
+
24
+ # Import our local custom provider
25
+ from vlm_ocr import VlmOcrModel, VlmOcrOptions, LocalVlmPdfPipeline, request_cancel, reset_cancel
26
+ from PIL import Image
27
+
28
+ # Setup logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ _log = logging.getLogger(__name__)
31
+
32
def _render_pdf_pages(path):
    """Render every page of the PDF at *path* to an image via Docling's pypdfium backend."""
    from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
    from docling.datamodel.document import InputDocument

    # The backend constructor expects an InputDocument (not a bare Path), so
    # let InputDocument build and own the backend.
    # NOTE(review): this reads the private `_backend` attribute, the same way
    # docling's own backend tests do — confirm against the installed version.
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )
    backend = in_doc._backend
    try:
        pages = []
        for page_no in range(backend.page_count()):
            # Page images are produced by the per-page backend, not by the
            # document backend.
            page = backend.load_page(page_no)
            try:
                pages.append(page.get_page_image())
            finally:
                page.unload()
        return pages
    finally:
        backend.unload()


def generate_preview(file_path: str):
    """Build the list of images shown in the input preview gallery.

    Args:
        file_path: Path of the uploaded file. May be empty or ``None`` when
            the upload component has been cleared.

    Returns:
        A list of images (a single entry for an image file, one entry per
        page for a PDF), or ``None`` when there is nothing to preview, the
        file type is not supported, or rendering failed.
    """
    if not file_path:
        return None

    path = Path(file_path)
    suffix = path.suffix.lower()

    try:
        if suffix in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
            # Image.open is lazy and keeps the file handle open; copy the
            # pixel data and let the context manager close the file.
            with Image.open(path) as img:
                return [img.copy()]

        if suffix == ".pdf":
            return _render_pdf_pages(path)
    except Exception:
        # Preview is best-effort: log the full traceback and show nothing
        # rather than breaking the upload.
        _log.exception("Error generating preview for %s", path.name)
        return None

    return None
60
+
61
def _build_converter(extract_json: bool):
    """Create a DocumentConverter whose PDF/image pipeline runs the VLM OCR prompt."""
    prompt = "Transcribe the text in this image. Return only the transcription. Use standard Markdown table syntax for any tables found. Be extremely accurate."
    if extract_json:
        prompt = (
            "Extract the information from this document into a structured JSON format. "
            "For a payroll document, include keys like 'employee_name', 'employee_id', 'period_start', 'period_end', "
            "'earnings' (a list of objects with type, hours, rate, amount), 'deductions', and 'summary' (gross_pay, net_pay). "
            "Return ONLY the JSON object."
        )

    ocr_options = VlmOcrOptions(
        model="mistral-medium-latest",
        openai_base_url="https://api.mistral.ai/v1",
        openai_api_key=mistral_api_key,
        prompt=prompt,
        timeout=300.0,
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.ocr_options = ocr_options
    pipeline_options.do_ocr = True

    # PDFs and images both go through the same custom pipeline.
    format_option = dict(
        pipeline_cls=LocalVlmPdfPipeline,
        pipeline_options=pipeline_options,
    )
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(**format_option),
            InputFormat.IMAGE: PdfFormatOption(**format_option),
        }
    )


def _strip_code_fence(text: str) -> str:
    """Return *text* stripped, without a wrapping triple-backtick fence if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (it may carry a language tag, e.g. ```json).
    lines = stripped.splitlines()[1:]
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    return "\n".join(lines).strip()


def _parse_json_output(raw_text: str) -> Any:
    """Extract and parse the JSON payload contained in the model output.

    Raises:
        ValueError: if no parsable JSON could be recovered.
    """
    import re

    # 1. Prefer content inside a fenced block; 2. otherwise take the span
    #    from the first '{' to the last '}'; 3. otherwise the whole text.
    fenced = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", raw_text)
    if fenced:
        candidate = fenced.group(1).strip()
    else:
        braced = re.search(r"(\{[\s\S]*\})", raw_text)
        candidate = braced.group(1).strip() if braced else raw_text.strip()

    # Try the text as-is first: the clean-up below is lossy (it would cut a
    # string value containing "//") and must not touch already-valid JSON.
    try:
        return json.loads(candidate)
    except ValueError:
        pass

    # Remove Markdown-escaped underscores, then single-line comments
    # ("//" not preceded by ":" so that http:// URLs survive).
    repaired = candidate.replace("\\_", "_")
    repaired = re.sub(r"(?<!:)\/\/.*", "", repaired)
    return json.loads(repaired)


def process_document(file_path: str, extract_json: bool):
    """Run VLM OCR on the uploaded document and stream the UI updates.

    This is a generator used as a Gradio event handler. Every yielded tuple
    has five items, one per output component the event is wired to:
    ``(markdown_text, json_value, submit_button_update, stop_button_update,
    download_path)``.

    Args:
        file_path: Path of the uploaded PDF or image; empty/``None`` when no
            file has been uploaded.
        extract_json: When true, prompt for structured JSON and save the
            result with a ``.json`` extension instead of ``.md``.

    Yields:
        A single 5-tuple as described above. On failure the first item is an
        ``"Error: ..."`` message and the JSON value and download path are
        ``None``.
    """
    submit_idle = gr.update(value="Process Document", variant="primary", interactive=True)
    stop_hidden = gr.update(visible=False)

    if not file_path:
        # One value per output component (five), including the JSON slot.
        yield "No file uploaded.", None, submit_idle, stop_hidden, None
        return

    _log.info(f"Processing file: {file_path}, Extract JSON: {extract_json}")
    reset_cancel()

    converter = _build_converter(extract_json)

    try:
        result = converter.convert(file_path)
        output_text = result.document.export_to_markdown()
        cleaned_text = _strip_code_fence(output_text)

        # Parse before writing so the .json download can hold real JSON.
        json_output = None
        file_text = cleaned_text
        if extract_json:
            try:
                json_output = _parse_json_output(output_text)
            except (ValueError, RecursionError) as je:
                _log.warning(f"Could not parse result as JSON: {je}")
                # Fallback to a dictionary showing the failure; the file
                # keeps the raw text so nothing is lost.
                json_output = {"error": "Invalid JSON format", "raw": output_text}
            else:
                file_text = json.dumps(json_output, ensure_ascii=False, indent=2)

        # The result is saved next to the input file, with the same stem.
        input_path = Path(file_path)
        ext = ".json" if extract_json else ".md"
        output_path = input_path.parent / (input_path.stem + ext)

        # Explicit UTF-8: OCR output routinely contains non-ASCII text and
        # the platform default encoding may not be able to represent it.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(file_text)

        _log.info(f"Result saved to {output_path}")

        yield (
            cleaned_text,
            json_output,
            submit_idle,
            stop_hidden,
            str(output_path),
        )
    except Exception as e:
        # Top-level boundary for the UI: keep the traceback in the log and
        # surface a short message to the user.
        _log.exception("Error processing document")
        yield f"Error: {e}", None, submit_idle, stop_hidden, None
173
+
174
def start_processing():
    """Return the UI updates applied right before a processing run starts.

    Returns:
        A 3-tuple: an update that relabels and disables the submit button,
        an update that makes the stop button visible, and ``None`` to clear
        the previous download file.
    """
    busy_submit = gr.update(value="Processing...", variant="secondary", interactive=False)
    visible_stop = gr.update(visible=True)
    cleared_download = None
    return busy_submit, visible_stop, cleared_download
180
+
181
def handle_stop():
    """Request cancellation of the running job and restore the buttons.

    Returns:
        A 2-tuple: an update that re-enables the submit button with its idle
        label, and an update that hides the stop button.
    """
    # Set the cancel flag first; the button updates are built afterwards.
    request_cancel()
    restored_submit = gr.update(value="Process Document", variant="primary", interactive=True)
    hidden_stop = gr.update(visible=False)
    return restored_submit, hidden_stop
184
+
185
def clear_interface():
    """Return the reset value for each component cleared by the Clear button.

    Returns:
        A 5-tuple of reset values, in this order: input file, preview
        gallery, output file, Markdown output, JSON output.
    """
    reset_values = {
        "input_file": None,
        "preview_gallery": [],
        "output_file": None,
        "output_markdown": "",
        "output_json": None,
    }
    # Dicts preserve insertion order, so the tuple follows the order above.
    return tuple(reset_values.values())
193
+
194
+ # Create Gradio interface
195
with gr.Blocks(title="Docling VLM OCR", theme=gr.themes.Default()) as demo:
    # --- Header -----------------------------------------------------------
    gr.Markdown("# 📄 Docling VLM OCR")
    gr.Markdown("Upload an image or a PDF file to extract text or structured data.")

    # --- Inputs: upload on the left, page preview on the right ------------
    with gr.Row():
        input_file = gr.File(
            label="1. Upload File",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            scale=1,
        )
        # Fixed height with preview mode enabled for better interaction.
        preview_gallery = gr.Gallery(
            label="Input Preview",
            columns=1,
            height=250,
            object_fit="contain",
            preview=True,
            allow_preview=True,
            scale=2,
        )

    extract_json_chk = gr.Checkbox(label="2. Extract as Structured JSON", value=False)

    # --- Action buttons (Stop stays hidden until a run is in progress) ----
    with gr.Row():
        submit_btn = gr.Button("3. Process Document", variant="primary")
        stop_btn = gr.Button("Stop", variant="stop", visible=False)
        clear_btn = gr.Button("Clear", variant="secondary")

    output_file = gr.File(label="4. Download Result", interactive=False)

    # --- Results: exactly one of the two views is visible at a time -------
    with gr.Column():
        output_markdown = gr.Markdown(
            label="OCR Result (Markdown)",
            visible=not extract_json_chk.value,
        )
        output_json = gr.JSON(
            label="OCR Result (JSON)",
            visible=extract_json_chk.value,
        )

    def toggle_outputs(is_json):
        """Show the JSON view when *is_json* is true, the Markdown view otherwise."""
        markdown_update = gr.update(visible=not is_json)
        json_update = gr.update(visible=is_json)
        return markdown_update, json_update

    # Switch the visible result view whenever the checkbox changes.
    extract_json_chk.change(
        fn=toggle_outputs,
        inputs=[extract_json_chk],
        outputs=[output_markdown, output_json],
    )

    # Regenerate the preview whenever the uploaded file changes.
    input_file.change(
        fn=generate_preview,
        inputs=[input_file],
        outputs=[preview_gallery],
    )

    # Two-step chain: first flip the buttons into their "busy" state, then
    # start the long-running OCR job.
    busy_state_event = submit_btn.click(
        fn=start_processing,
        outputs=[submit_btn, stop_btn, output_file],
    )
    submit_event = busy_state_event.then(
        fn=process_document,
        inputs=[input_file, extract_json_chk],
        outputs=[output_markdown, output_json, submit_btn, stop_btn, output_file],
    )

    # Stop button: sets the internal cancel flag and cancels the Gradio event.
    stop_btn.click(
        fn=handle_stop,
        inputs=None,
        outputs=[submit_btn, stop_btn],
        cancels=[submit_event],
    )

    # Clear button: reset the input, the preview and every output.
    clear_btn.click(
        fn=clear_interface,
        inputs=None,
        outputs=[input_file, preview_gallery, output_file, output_markdown, output_json],
    )

if __name__ == "__main__":
    demo.queue().launch()