heerjtdev committed
Commit 1d6b971 · verified · 1 Parent(s): 5e11387

Update app.py

Files changed (1)
  1. app.py +96 -600
app.py CHANGED
@@ -1,615 +1,111 @@
- import base64
- import io
- import json
- import os
- import re
- import time
- import tempfile
- from typing import Dict, List, Tuple, Any, Optional
- from urllib.parse import urlparse
-
  import gradio as gr
- import numpy as np
- import requests
- from PIL import Image, ImageDraw, ImageFont
  from pdf2image import convert_from_path

- # --- PADDLEOCR INTEGRATION ---
- try:
-     from paddleocr import PPStructureV3, draw_structure_result
-     # Initialize the model globally once to avoid re-loading on every call
-     # This uses the default layout and table recognition models (PP-StructureV3).
-     # Setting show_log=False keeps the console clean.
-     PADDLE_STRUCTURE_PIPELINE = PPStructureV3(
-         layout=True,
-         table=True,
-         ocr=True,
-         show_log=False
-     )
-     print("✅ Paddle Structure Model Initialized for Integrated Inference.")
- except ImportError:
-     PADDLE_STRUCTURE_PIPELINE = None
-     print("❌ PaddleOCR/PPStructureV3 not found. Inference will be disabled.")
- except Exception as e:
-     PADDLE_STRUCTURE_PIPELINE = None
-     print(f"❌ Error initializing PaddleOCR pipeline: {e}")
-
-
- # =========================
- # Config (API URLs are now obsolete but kept for reference)
- # =========================
- # DEFAULT_API_URL = os.environ.get("API_URL") # OBSOLETE
- # TOKEN = os.environ.get("TOKEN") # OBSOLETE
- LOGO_IMAGE_PATH = "./assets/logo.jpg"
- GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@400;700&display=swap' rel='stylesheet'>"
- LATEX_DELIMS = [
-     {"left": "$$", "right": "$$", "display": True},
-     {"left": "$", "right": "$", "display": False},
-     {"left": "\\(", "right": "\\)", "display": False},
-     {"left": "\\[", "right": "\\]", "display": True},
- ]
- # AUTH_HEADER and JSON_HEADERS are OBSOLETE but kept for file structure consistency
- AUTH_HEADER = {}
- JSON_HEADERS = {}
-
-
- # =========================
- # Utility Functions
- # =========================
-
- def _ensure_local_path(path_or_url: str) -> str:
-     """Ensures the input is a local file path, downloading from URL if necessary."""
-     if not path_or_url:
-         raise ValueError("Input path is empty.")
-
-     is_url = path_or_url.startswith(("http://", "https://"))
-     if not is_url:
-         return path_or_url # Already local file
-
-     # Download remote URL to a temporary file
-     try:
-         r = requests.get(path_or_url, timeout=600)
-         r.raise_for_status()
-
-         # Use filename extension if available, otherwise default to .jpg
-         ext = os.path.splitext(urlparse(path_or_url).path)[1].lower() or '.jpg'
-         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
-         temp_file.write(r.content)
-         temp_file.close()
-         return temp_file.name
-     except Exception as e:
-         raise gr.Error(f"Error downloading image from URL: {e}")
-
-
- def image_to_base64_data_url(filepath: str) -> str:
-     """Encodes a local image file to a Base64 data URL for HTML rendering."""
-     try:
-         # Prevent conversion attempt on PDFs which can be huge
-         if filepath.lower().endswith('.pdf'):
-             return ""
-
-         ext = os.path.splitext(filepath)[1].lower()
-         mime_types = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png", ".gif": "image/gif", ".webp": "image/webp", ".bmp": "image/bmp"}
-         mime_type = mime_types.get(ext, "image/jpeg")
-         with open(filepath, "rb") as image_file:
-             encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
-         return f"data:{mime_type};base64,{encoded_string}"
-     except Exception as e:
-         # print(f"Error encoding image to Base64: {e}")
-         return ""
-
- def _to_html_img(pil_img: Image.Image) -> str:
-     """Converts a PIL Image to a Base64 data URL string for HTML display."""
-     buffered = io.BytesIO()
-     pil_img.save(buffered, format="PNG")
-     img_str = base64.b64encode(buffered.getvalue()).decode()
-     return f'data:image/png;base64,{img_str}'
-
-
- def _escape_inequalities_in_math(md: str) -> str:
-     """Escapes < and > inside math blocks to prevent markdown misinterpretation."""
-     _MATH_PATTERNS = [
-         re.compile(r"\$\$([\s\S]+?)\$\$"),
-         re.compile(r"\$([^\$]+?)\$"),
-         re.compile(r"\\\[([\s\S]+?)\\\]"),
-         re.compile(r"\\\(([\s\S]+?)\\\)"),
-     ]
-
-     def fix(s: str) -> str:
-         s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
-         s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
-         s = s.replace("<", r" \lt ").replace(">", r" \gt ")
-         return s
-
-     for pat in _MATH_PATTERNS:
-         md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
-     return md
-
- def _get_examples_from_dir(dir_path: str) -> List[List[str]]:
-     """Loads example URLs (unchanged)."""
-     BASE_URL = "https://paddle-model-ecology.bj.bcebos.com/PPOCRVL/dataset/examples"
-     supported_exts = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
-     examples = []
-     if not os.path.exists(dir_path):
-         print(f"Warning: example dir {dir_path} not found.")
-         return []
-     for filename in sorted(os.listdir(dir_path)):
-         ext = os.path.splitext(filename)[1].lower()
-         if ext in supported_exts:
-             subdir = os.path.basename(dir_path.rstrip("/"))
-             img_url = f"{BASE_URL}/{subdir}/{filename}"
-             examples.append([img_url])
-     return examples
-
- TARGETED_EXAMPLES_DIR = "examples/targeted"
- COMPLEX_EXAMPLES_DIR = "examples/complex"
- targeted_recognition_examples = _get_examples_from_dir(TARGETED_EXAMPLES_DIR)
- complex_document_examples = _get_examples_from_dir(COMPLEX_EXAMPLES_DIR)
-
- # =========================
- # UI Helpers
- # =========================
- def render_uploaded_image_div(path_or_url: str) -> str:
-     """Renders the image or a PDF placeholder."""
-     if not path_or_url:
-         return ""
-
-     is_url = path_or_url.startswith(("http://", "https://"))
-     is_pdf = path_or_url.lower().endswith('.pdf')
-
-     if is_pdf:
-         return f"""<div style="text-align:center; padding: 20px; color:#888;">PDF file loaded. Use the page selector and click 'Extract...' to process.</div>"""
-
-     src = path_or_url if is_url else image_to_base64_data_url(path_or_url)
-     if not src:
-         return "" # Handle case where local image B64 conversion failed
-
-     return f"""
-     <div class="uploaded-image">
-         <img src="{src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
-     </div>
      """
-
- def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
-     if path_or_url:
-         html_content = render_uploaded_image_div(path_or_url)
-         return gr.update(value=html_content, visible=True)
-     else:
-         return gr.update(value="", visible=False)
-
-
- # =========================
- # Core Inference Logic (Replaces API Calls)
- # =========================
-
- def _run_paddle_structure(local_path: str, is_doc_parsing: bool = True) -> Tuple[str, str, str]:
-     """Runs PPStructureV3 prediction and formats the results."""
-
-     if PADDLE_STRUCTURE_PIPELINE is None:
-         raise gr.Error("PaddleOCR model is not loaded. Please check model initialization logs.")
-
-     start_time = time.time()
-
-     # 1. Run prediction
-     # Note: PPStructureV3 processes images, not PDFs. local_path should be an image path.
-     result_list = PADDLE_STRUCTURE_PIPELINE.predict(local_path)
-
-     end_time = time.time()
-     print(f"PaddleOCR Structure inference completed in {end_time - start_time:.2f} seconds.")
-
-     if not result_list:
-         return "No content recognized.", "<p>No visualization available.</p>", "{}"
-
-     # We only process the first page/image in the list
-     result = result_list[0]
-
-     # 2. Markdown Output
-     # PPStructureV3 can generate LaTeX/Markdown based on its components.
-     # This is a simplification; full VL-model output formatting is complex.
-     md_text = result.to_markdown()
-
-     # 3. Visualization Image
-     image = Image.open(local_path).convert('RGB')
-     # draw_structure_result requires a system font (e.g., simfang.ttf or arial.ttf) to be accessible.
-     try:
-         vis_image = draw_structure_result(image, result, font_path="arial.ttf")
-     except Exception:
-         # Fallback if font isn't found
-         vis_image = draw_structure_result(image, result)
-
-     output_html = f'<img src="{_to_html_img(vis_image)}" alt="Detection Visualization" loading="lazy">'
-
-     # 4. Raw JSON Output
-     raw_json = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
-
-     md_text = _escape_inequalities_in_math(md_text)
-     return md_text or "(Empty result)", output_html, raw_json
-
- # --- Inference Handlers for Tabs 1 & 2 ---
-
- def handle_complex_doc(path_or_url: str, use_chart_recognition: bool, use_doc_unwarping: bool, use_doc_orientation_classify: bool) -> Tuple[str, str, str]:
-     if not path_or_url:
-         raise gr.Error("Please upload an image first.")
-
-     local_path = _ensure_local_path(path_or_url)
-     if local_path.lower().endswith('.pdf'):
-         raise gr.Error("Document Parsing tab requires an image, not a PDF.")
-
-     # Note: The switches (chart, unwarping, orientation) are ignored here because
-     # the integrated PPStructureV3 pipeline does not expose simple toggles for them.
-     # The complexity is handled internally by the model version loaded.
-
-     return _run_paddle_structure(local_path, is_doc_parsing=True)
-
-
- def handle_targeted_recognition(path_or_url: str, prompt_choice: str) -> Tuple[str, str]:
-     if not path_or_url:
-         raise gr.Error("Please upload an image first.")
-
-     local_path = _ensure_local_path(path_or_url)
-     if local_path.lower().endswith('.pdf'):
-         raise gr.Error("Element-level Recognition tab requires an image, not a PDF.")
-
-     # Map the choice to the desired structure/recognition type (simplified mapping)
-     mapping = {
-         "Text Recognition": "text",
-         "Formula Recognition": "formula",
-         "Table Recognition": "table",
-         "Chart Recognition": "chart",
-     }
-     target_type = mapping.get(prompt_choice, "text")
-
-     # For integrated PPStructureV3, we run a full structure pass and let the model's
-     # internal logic prioritize the recognition based on the input image content.
-     md_preview, _, md_raw = _run_paddle_structure(local_path, is_doc_parsing=False)
-
-     # In a real VL system, we'd use the 'prompt_choice' to focus the model output.
-     # Here, we just return the full markdown and raw output.
-
-     return md_preview, md_raw
-
- # --- Inference Handler for Tab 3: PDF & Structured Extraction ---
-
- def _pdf_to_page_image(pdf_path: str, page_num: int) -> Image.Image:
-     """Converts a specific PDF page to a PIL Image."""
      try:
-         pages = convert_from_path(pdf_path, dpi=300, first_page=page_num + 1, last_page=page_num + 1)
-         if not pages:
-             raise ValueError(f"Could not convert page {page_num} of PDF.")
-         return pages[0]
-     except Exception as e:
-         raise gr.Error(f"Error processing PDF with pdf2image (Is Poppler installed?): {e}")
-
- def _draw_boxes_on_image(img: Image.Image, elements: List[Dict]) -> str:
-     """Draws bounding boxes onto the PIL Image based on PPStructureV3 results."""
-     draw = ImageDraw.Draw(img)
-
-     try:
-         # Use a common font or fall back
-         font = ImageFont.truetype("arial.ttf", 16)
-     except IOError:
-         font = ImageFont.load_default()
-
-     for item in elements:
-         # The coordinates are expected in the format [x1, y1, x2, y2]
-         bbox = item.get("box", []) # PPStructureV3 often uses 'box' key
-         item_type = item.get("type", "text")
-
-         if len(bbox) == 4:
-             x1, y1, x2, y2 = bbox
-
-             # Draw different colors for different types
-             if item_type in ["figure", "title"]:
-                 color = "purple"
-                 width = 3
-             elif item_type in ["table", "formula"]:
-                 color = "red"
-                 width = 2
-             else: # text
-                 color = "green"
-                 width = 1
-
-             draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=width)
-
-             # Optional: Add type label
-             # draw.text((x1 + 5, y1 - 15), item_type, fill=color, font=font)
-
-     return _to_html_img(img)
-
- def handle_structured_extraction(pdf_path: Optional[str], page_num: int) -> Tuple[str, str, str]:
-     if PADDLE_STRUCTURE_PIPELINE is None:
-         raise gr.Error("PaddleOCR model is not loaded.")
-
-     if not pdf_path or not pdf_path.lower().endswith('.pdf'):
-         raise gr.Error("Please upload a PDF file for this feature.")
-
-     print(f"Processing PDF: {pdf_path}, Page: {page_num}")
-
-     # --- 1. Convert PDF Page to Image ---
-     try:
-         page_img = _pdf_to_page_image(pdf_path, page_num)
      except Exception as e:
-         return f"Error: {e}", "Error during PDF conversion.", json.dumps({"error": str(e)}, indent=2)
-
-     # --- 2. Save image to temp file for PPStructureV3 ---
-     temp_img_path = tempfile.mktemp(suffix=".png")
-     page_img.save(temp_img_path)
-
-     try:
-         # --- 3. Run PPStructureV3 inference ---
-         result_list = PADDLE_STRUCTURE_PIPELINE.predict(temp_img_path)
-
-         if not result_list:
-             return "No content recognized on this PDF page.", "", "{}"
-
-         # --- 4. Process Results ---
-         result = result_list[0]
-         elements = result.to_dict().get("res", [])
-
-         # Extract LaTeX/Formulas
-         all_latex = []
-         for item in elements:
-             if item.get("type") == "formula" and item.get("text"):
-                 # Wrap text with $$. PPStructureV3 often outputs raw LaTeX in the 'text' field.
-                 all_latex.append(f"$${item['text']}$$")
-
-         latex_output = "\n\n".join(all_latex) if all_latex else "No formulas (LaTeX) found on this page."
-
-         # --- 5. Draw Boxes for Visualization ---
-         box_html = f'<img src="{_draw_boxes_on_image(page_img, elements)}" alt="Image with Bounding Boxes" loading="lazy">'
-
-         # --- 6. Return Results ---
-         return box_html, latex_output, json.dumps(result.to_dict(), indent=2, ensure_ascii=False)
-
-     except Exception as e:
-         raise gr.Error(f"PaddleOCR inference failed during PDF processing: {e}")
-     finally:
-         if os.path.exists(temp_img_path):
-             os.remove(temp_img_path)
-
- def get_pdf_page_count(pdf_path):
-     """Gets the total number of pages in the PDF."""
-     if not pdf_path or not pdf_path.lower().endswith('.pdf'):
-         return gr.update(maximum=0, value=0, interactive=False)
-     try:
-         # Load the whole PDF to get the exact count (inefficient but reliable with pdf2image)
-         pages = convert_from_path(pdf_path, use_pdftocairo=True)
-         count = len(pages)
-         return gr.update(maximum=max(0, count - 1), value=0, interactive=True)
-     except Exception as e:
-         print(f"Warning: Could not determine PDF page count: {e}")
-         return gr.update(maximum=0, value=0, interactive=False)
-
-
- # =========================
- # CSS & UI (Unchanged)
- # =========================
389
- body, .gradio-container { font-family: "Noto Sans SC", "Microsoft YaHei", "PingFang SC", sans-serif; }
390
- .app-header { text-align: center; max-width: 900px; margin: 0 auto 8px !important; }
391
- .gradio-container { padding: 4px 0 !important; }
392
- .gradio-container [data-testid="tabs"], .gradio-container .tabs { margin-top: 0 !important; }
393
- .gradio-container [data-testid="tabitem"], .gradio-container .tabitem { padding-top: 4px !important; }
394
- .quick-links { text-align: center; padding: 8px 0; border: 1px solid #e5e7eb; border-radius: 8px; margin: 8px auto; max-width: 900px; }
395
- .quick-links a { margin: 0 12px; font-size: 14px; font-weight: 600; color: #3b82f6; text-decoration: none; }
396
- .quick-links a:hover { text-decoration: underline; }
397
- .prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
398
- .prompt-grid button { height: 40px !important; padding: 0 12px !important; border-radius: 8px !important; font-weight: 600 !important; font-size: 13px !important; letter-spacing: 0.2px; }
399
- #image_preview_vl, #image_preview_doc, #image_preview_pdf { height: 400px !important; overflow: auto; }
400
- #image_preview_vl img, #image_preview_doc img, #vis_image_doc img, #box_vis_html img { width: 100% !important; height: auto !important; object-fit: contain !important; display: block; }
401
- #md_preview_vl, #md_preview_doc { max-height: 540px; min-height: 180px; overflow: auto; scrollbar-gutter: stable both-edges; }
402
- #md_preview_vl .prose, #md_preview_doc .prose { line-height: 1.7 !important; }
403
- #md_preview_vl .prose img, #md_preview_doc .prose img { display: block; margin: 0 auto; max-width: 100%; height: auto; }
404
- .notice { margin: 8px auto 0; max-width: 900px; padding: 10px 12px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f8fafc; font-size: 14px; line-height: 1.6; }
405
- .notice strong { font-weight: 700; }
406
- .notice a { color: #3b82f6; text-decoration: none; }
407
- .notice a:hover { text-decoration: underline; }
408
- .checkbox-row .gradio-checkbox { flex-grow: 1; text-align: center; }
409
- """
410
-
411
- with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
412
- logo_data_url = image_to_base64_data_url(LOGO_IMAGE_PATH) if os.path.exists(LOGO_IMAGE_PATH) else ""
413
- gr.HTML(f"""<div class="app-header"><img src="{logo_data_url}" alt="App Logo" style="max-height:10%; width: auto; margin: 10px auto; display: block;"></div>""")
414
- gr.HTML("""<div class="notice"><strong>Heads up:</strong> The Hugging Face demo can be slow at times. For a faster experience, please try <a href="https://aistudio.baidu.com/application/detail/98365" target="_blank" rel="noopener noreferrer">Baidu AI Studio</a> or <a href="https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary" target="_blank" rel="noopener noreferrer">ModelScope</a>.</div>""")
415
-
416
- gr.HTML("""<div class="quick-links"><a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">GitHub</a> | <a href="https://ernie.baidu.com/blog/publication/PaddleOCR-VL_Technical_Report.pdf" target="_blank">Technical Report</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL" target="_blank">Model</a> | <a href="https://aistudio.baidu.com/paddleocr" target="_blank">Official Website</a></div>""")
417
-
418
- with gr.Tabs():
419
- # ===================== Tab 1: Document Parsing =====================
420
- with gr.Tab("Document Parsing"):
421
- with gr.Row():
422
- with gr.Column(scale=5):
423
- file_doc = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
424
- preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
425
- gr.Markdown("_( Use this mode for recognizing full-page documents with structured layouts, such as reports, papers, or magazines.)_")
426
- gr.Markdown("💡 *To recognize a single, pre-cropped element (e.g., a table or formula), switch to the 'Element-level Recognition' tab for better results.*")
427
-
428
- example_url_doc = gr.State(value=None)
429
-
430
- with gr.Row(variant="panel"):
431
- with gr.Column(scale=2):
432
- btn_parse = gr.Button("Parse Document", variant="primary")
433
- with gr.Column(scale=3):
434
- with gr.Row(elem_classes=["checkbox-row"]):
435
- chart_parsing_switch = gr.Checkbox(label="Enable chart parsing", value=False, min_width=10)
436
- doc_unwarping_switch = gr.Checkbox(label="Enable document unwarping", value=False, min_width=10)
437
- doc_orientation_switch = gr.Checkbox(label="Enable orientation classification", value=False, min_width=10)
438
-
439
- if complex_document_examples:
440
- complex_paths = [e[0] for e in complex_document_examples]
441
- complex_state = gr.State(complex_paths)
442
-
443
- gallery_complex = gr.Gallery(
444
- value=complex_paths, columns=4, height=400,
445
- preview=False, label="Example Documents (Select to Load)", allow_preview=False
446
- )
447
-
448
- def on_gallery_select_for_doc(paths, evt: gr.SelectData):
449
- idx = evt.index
450
- if isinstance(idx, (list, tuple)):
451
- idx = idx[0]
452
- try:
453
- url = paths[int(idx)]
454
- except Exception:
455
- raise gr.Error(f"Invalid index from gallery: {evt.index}")
456
-
457
- return url, update_preview_visibility(url)
458
-
459
- gallery_complex.select(
460
- fn=on_gallery_select_for_doc,
461
- inputs=[complex_state],
462
- outputs=[example_url_doc, preview_doc_html],
463
- )
464
-
465
- gr.Markdown("""
466
- <div class="notice">
467
- <h3>History Updates</h3>
468
- <ul>
469
- <li><strong>Nov 4, 2025:</strong> Application converted to run PaddleOCR inference locally (integrated mode), removing API dependency.</li>
470
- <li><strong>Oct 30, 2025:</strong> Added two advanced control options under the "Document Parsing" tab.</li>
471
- <li><strong>Oct 16, 2025:</strong> Initial release of the demo.</li>
472
- </ul>
473
- </div>
474
- """)
475
-
476
- with gr.Column(scale=7):
477
- with gr.Tabs():
478
- with gr.Tab("Markdown Preview"):
479
- md_preview_doc = gr.Markdown("Please upload an image and click 'Parse Document'.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_doc")
480
- with gr.Tab("Visualization"):
481
- vis_image_doc = gr.HTML(label="Detection Visualization", elem_id="vis_image_doc")
482
- with gr.Tab("Markdown Source"):
483
- md_raw_doc = gr.Code(label="Markdown Source Code", language="markdown")
484
-
485
- def on_file_doc_change(fp):
486
- return None, update_preview_visibility(fp)
487
-
488
- file_doc.change(fn=on_file_doc_change, inputs=[file_doc], outputs=[example_url_doc, preview_doc_html])
489
-
490
- def parse_doc_router(fp, example_url, use_chart, use_unwarping, use_orientation):
491
- src = fp if fp else example_url
492
- if not src:
493
- raise gr.Error("Please upload an image or pick an example first.")
494
- return handle_complex_doc(src, use_chart, use_unwarping, use_orientation)
495
-
496
- btn_parse.click(fn=parse_doc_router, inputs=[file_doc, example_url_doc, chart_parsing_switch, doc_unwarping_switch, doc_orientation_switch],
497
- outputs=[md_preview_doc, vis_image_doc, md_raw_doc])
498
-
499
- # ===================== Tab 2: Element-level Recognition =====================
500
- with gr.Tab("Element-level Recognition"):
501
- with gr.Row():
502
- with gr.Column(scale=5):
503
- file_vl = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
504
- preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
505
- gr.Markdown("_(Best for images with a **simple, single-column layout** (e.g., pure text), or for a **pre-cropped single element** like a table, formula, or chart.)_")
506
- gr.Markdown("Choose a recognition type:")
507
-
508
- with gr.Row(elem_classes=["prompt-grid"]):
509
- btn_ocr = gr.Button("Text Recognition", variant="secondary")
510
- btn_formula = gr.Button("Formula Recognition", variant="secondary")
511
- with gr.Row(elem_classes=["prompt-grid"]):
512
- btn_table = gr.Button("Table Recognition", variant="secondary")
513
- btn_chart = gr.Button("Chart Recognition", variant="secondary")
514
-
515
- example_url_vl = gr.State(value=None)
516
-
517
- if targeted_recognition_examples:
518
- targeted_paths = [e[0] for e in targeted_recognition_examples]
519
- targeted_state = gr.State(targeted_paths)
520
-
521
- gallery_targeted = gr.Gallery(
522
- value=targeted_paths, columns=4, height=400,
523
- preview=False, label="Example Elements (Select to Load)", allow_preview=False
524
- )
525
-
526
- def on_gallery_select_for_vl(paths, evt: gr.SelectData):
527
- idx = evt.index
528
- if isinstance(idx, (list, tuple)):
529
- idx = idx[0]
530
- try:
531
- url = paths[int(idx)]
532
- except Exception:
533
- raise gr.Error(f"Invalid index from gallery: {evt.index}")
534
- return url, update_preview_visibility(url)
535
-
536
- gallery_targeted.select(
537
- fn=on_gallery_select_for_vl,
538
- inputs=[targeted_state],
539
- outputs=[example_url_vl, preview_vl_html],
540
- )
541
-
542
- with gr.Column(scale=7):
543
- with gr.Tabs():
544
- with gr.Tab("Recognition Result"):
545
- md_preview_vl = gr.Markdown("Please upload an image and click a recognition type.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_vl")
546
- with gr.Tab("Raw Output"):
547
- md_raw_vl = gr.Code(label="Raw Output", language="markdown")
548
-
549
- def on_file_vl_change(fp):
550
- return None, update_preview_visibility(fp)
551
-
552
- file_vl.change(fn=on_file_vl_change, inputs=[file_vl], outputs=[example_url_vl, preview_vl_html])
553
-
554
- def parse_vl_router(fp, example_url, prompt_choice):
555
- src = fp if fp else example_url
556
- if not src:
557
- raise gr.Error("Please upload an image or pick an example first.")
558
- return handle_targeted_recognition(src, prompt_choice)
559
-
560
- btn_ocr.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Text Recognition")], outputs=[md_preview_vl, md_raw_vl])
561
- btn_formula.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Formula Recognition")], outputs=[md_preview_vl, md_raw_vl])
562
- btn_table.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Table Recognition")], outputs=[md_preview_vl, md_raw_vl])
563
- btn_chart.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Chart Recognition")], outputs=[md_preview_vl, md_raw_vl])
564
-
565
-
566
- # ===================== Tab 3: PDF & Structured Extraction (NEW) =====================
567
- with gr.Tab("PDF & Structured Extraction"):
568
- gr.Markdown("## 📑 PDF Bounding Box & LaTeX Extractor")
569
- gr.Markdown("Upload a PDF to extract structured elements, visualize bounding boxes, and retrieve LaTeX code (Formulas) on a per-page basis.")
570
-
571
- with gr.Row():
572
- with gr.Column(scale=5):
573
- file_pdf = gr.File(label="Upload PDF", file_count="single", type="filepath", file_types=[".pdf"], elem_id="file_pdf_input")
574
- preview_pdf_html = gr.HTML(value="", elem_id="image_preview_pdf", visible=False)
575
-
576
- page_selector = gr.Slider(
577
- minimum=0, maximum=0, step=1, value=0, label="Select Page (0-indexed)", interactive=False
578
- )
579
-
580
- btn_extract_boxes = gr.Button("Extract Bounding Boxes & LaTeX", variant="primary")
581
-
582
- with gr.Column(scale=7):
583
- with gr.Tabs():
584
- with gr.Tab("Image with Bounding Boxes"):
585
- box_vis_html = gr.HTML(label="Bounding Box Visualization", elem_id="box_vis_html", value="Upload a PDF and click the button to see the result.")
586
- with gr.Tab("Extracted LaTeX"):
587
- latex_output = gr.Markdown(label="Extracted LaTeX/Formulas", elem_id="latex_output", value="No LaTeX extracted yet.")
588
- with gr.Tab("Raw Structured Data"):
589
- raw_json_output = gr.Code(label="Raw Structured Output (JSON)", language="json", elem_id="raw_json_output")
590
-
591
- # Logic for PDF input
592
- def on_file_pdf_change(fp):
593
- # Update page selector when a new PDF is uploaded
594
- page_update = get_pdf_page_count(fp)
595
- # Update preview
596
- preview_update = update_preview_visibility(fp)
597
- return page_update, preview_update
598
-
599
- file_pdf.change(
600
- fn=on_file_pdf_change,
601
- inputs=[file_pdf],
602
- outputs=[page_selector, preview_pdf_html]
603
- )
604
-
605
- # Logic for processing
606
- btn_extract_boxes.click(
607
- fn=handle_structured_extraction,
608
- inputs=[file_pdf, page_selector],
609
- outputs=[box_vis_html, latex_output, raw_json_output]
610
- )
611
-
612
- if __name__ == "__main__":
613
- port = int(os.getenv("PORT", "7860"))
614
- # Use queue() for better handling of long-running model inference
615
- demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port, share=False)
 
 
 
 
 
 
 
 
 
 
 
  import gradio as gr
+ import pytesseract
+ from PIL import Image
  from pdf2image import convert_from_path
+ import os
+ import tempfile
+
+ # ----------------------------------------------------------------------
+ # 1. OCR Core Function
+ # ----------------------------------------------------------------------
+
+ def perform_ocr_on_pdf(pdf_file_path, language="eng"):
      """
+     Converts a PDF file to images and performs OCR on each page.
+
+     Args:
+         pdf_file_path (str): The file path to the uploaded PDF.
+         language (str): The Tesseract language code (e.g., 'eng', 'fra+deu').
+
+     Returns:
+         str: The combined extracted text from all PDF pages.
+     """
+     if pdf_file_path is None:
+         return "Please upload a PDF file."
+
+     extracted_text = []
+
      try:
+         # 1. Convert PDF pages to PIL images (requires poppler-utils, installed via Dockerfile).
+         # Setting a high DPI (300) improves OCR accuracy for scanned documents.
+         images = convert_from_path(pdf_file_path, dpi=300)
+
+         # 2. Iterate through each page image and perform OCR
+         for i, image in enumerate(images):
+             # convert_from_path returns PIL Image objects, which pytesseract accepts
+             # directly, so there is no need to save each page to a temporary file first.
+
+             # Perform OCR on the page image
+             page_text = pytesseract.image_to_string(image, lang=language)
+
+             extracted_text.append(f"--- PAGE {i+1} ---\n{page_text}\n")
+
+         return "\n".join(extracted_text)
+
+     except pytesseract.TesseractNotFoundError:
+         return "Error: Tesseract is not installed or not in PATH. This should be handled by the Dockerfile."
      except Exception as e:
+         return f"An error occurred during OCR processing: {str(e)}"
+
+ # ----------------------------------------------------------------------
+ # 2. Gradio Interface
+ # ----------------------------------------------------------------------
+
+ # Define the supported languages for the dropdown
+ LANGUAGES = {
+     "English": "eng",
+     "Spanish": "spa",
+     "French": "fra",
+     "German": "deu",
+     "Japanese": "jpn",
+     "Chinese (Simplified)": "chi_sim"
+ }
+
+ # Create the Gradio interface components
+ pdf_input = gr.File(
+     label="Upload PDF Document",
+     file_types=[".pdf"],
+     type="filepath",
+     interactive=True
+ )
+
+ lang_dropdown = gr.Dropdown(
+     label="Select OCR Language",
+     choices=list(LANGUAGES.keys()),
+     value="English",
+     type="value",
+     interactive=True
+ )
+
+ ocr_output = gr.Textbox(
+     label="Extracted Text (Output)",
+     lines=25,
+     max_lines=30,
+     show_copy_button=True,
+     placeholder="Extracted text will appear here...",
+ )
+
+ # Custom wrapper to map the dropdown name back to the Tesseract code
+ def lang_wrapper(file_path, lang_name):
+     lang_code = LANGUAGES.get(lang_name, "eng")
+     return perform_ocr_on_pdf(file_path, lang_code)
+
+ # Create the Gradio Interface
+ gr.Interface(
+     fn=lang_wrapper,
+     inputs=[pdf_input, lang_dropdown],
+     outputs=ocr_output,
+     title="PDF Optical Character Recognition (OCR) App",
+     description=(
+         "Upload a PDF file to extract text from it using Tesseract OCR. "
+         "Select the primary language to improve accuracy. "
+         "Note: Requires Tesseract and Poppler system dependencies."
+     ),
+     allow_flagging="never",
+     theme=gr.themes.Soft(primary_hue="blue").set(
+         body_background_fill="#f5f7fa",
+         background_fill_primary="#ffffff",
+         shadow_drop_lg="0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05)",
+     )
+ ).launch(server_name="0.0.0.0", server_port=7860)
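
For quick local verification of the new pipeline outside Gradio, a smoke test along the following lines mirrors what perform_ocr_on_pdf does for a single page. This is a minimal sketch, not part of the commit: "sample.pdf" is a placeholder path, pytesseract.get_languages is only available in pytesseract 0.3.8 or newer, and the first_page/last_page arguments of convert_from_path (the same ones the removed _pdf_to_page_image helper used) limit conversion to one page.

# smoke_test.py - illustrative sketch only; "sample.pdf" is a placeholder path.
import pytesseract
from pdf2image import convert_from_path

if __name__ == "__main__":
    # Confirm the system dependencies the app expects (Tesseract binary and language packs).
    print("Tesseract version:", pytesseract.get_tesseract_version())
    print("Installed languages:", pytesseract.get_languages(config=""))

    # Render only the first page (first_page/last_page are 1-indexed in pdf2image),
    # then OCR it the same way perform_ocr_on_pdf does for every page.
    pages = convert_from_path("sample.pdf", dpi=300, first_page=1, last_page=1)
    text = pytesseract.image_to_string(pages[0], lang="eng")
    print(f"--- PAGE 1 ---\n{text}")

As the in-code comments note, Poppler must still be installed and on PATH for convert_from_path to work, and Tesseract language packs beyond English must be present for the non-English dropdown choices to succeed.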