heerjtdev committed on
Commit
f547ea0
·
verified ·
1 Parent(s): 5c7cada

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +614 -642
app.py CHANGED
@@ -1,643 +1,615 @@
1
- import base64
2
- import io
3
- import json
4
- import os
5
- from typing import Dict, List, Tuple, Any, Optional
6
- import time
7
- import requests
8
- from PIL import Image, ImageDraw, ImageFont # Added ImageDraw, ImageFont
9
- import gradio as gr
10
- import re
11
- import tempfile
12
- from urllib.parse import urlparse
13
- from pdf2image import convert_from_path
14
- import numpy as np # For potential image manipulation
15
-
16
- # =========================
17
- # Config
18
- # =========================
19
- DEFAULT_API_URL = os.environ.get("API_URL")
20
- TOKEN = os.environ.get("TOKEN")
21
- LOGO_IMAGE_PATH = "./assets/logo.jpg"
22
- GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@400;700&display=swap' rel='stylesheet'>"
23
- LATEX_DELIMS = [
24
- {"left": "$$", "right": "$$", "display": True},
25
- {"left": "$", "right": "$", "display": False},
26
- {"left": "\\(", "right": "\\)", "display": False},
27
- {"left": "\\[", "right": "\\]", "display": True},
28
- ]
29
- AUTH_HEADER = {"Authorization": f"bearer {TOKEN}"} if TOKEN else {}
30
- JSON_HEADERS = {**AUTH_HEADER, "Content-Type": "application/json"} if AUTH_HEADER else {"Content-Type": "application/json"}
31
-
32
- # Placeholder for Bounding Box API URL (You MUST define this)
33
- BOUNDING_BOX_API_URL = os.environ.get("BB_API_URL", DEFAULT_API_URL)
34
-
35
- # =========================
36
- # Base64 & Examples (URL直链渲染)
37
- # =========================
38
- def image_to_base64_data_url(filepath: str) -> str:
39
- """仅用于本地上传预览的兼容方案;URL 预览不会用到它。"""
40
- try:
41
- ext = os.path.splitext(filepath)[1].lower()
42
- mime_types = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png", ".gif": "image/gif", ".webp": "image/webp", ".bmp": "image/bmp"}
43
- mime_type = mime_types.get(ext, "image/jpeg")
44
- with open(filepath, "rb") as image_file:
45
- encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
46
- return f"data:{mime_type};base64,{encoded_string}"
47
- except Exception as e:
48
- print(f"Error encoding image to Base64: {e}")
49
- return ""
50
-
51
- def _escape_inequalities_in_math(md: str) -> str:
52
- """把数学块中的 < > 替换为 \\lt \\gt,避免被 Markdown 误解析。"""
53
- _MATH_PATTERNS = [
54
- re.compile(r"\$\$([\s\S]+?)\$\$"),
55
- re.compile(r"\$([^\$]+?)\$"),
56
- re.compile(r"\\\[([\s\S]+?)\\\]"),
57
- re.compile(r"\\\(([\s\S]+?)\\\)"),
58
- ]
59
-
60
- def fix(s: str) -> str:
61
- s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
62
- s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
63
- s = s.replace("<", r" \lt ").replace(">", r" \gt ")
64
- return s
65
-
66
- for pat in _MATH_PATTERNS:
67
- md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
68
- return md
69
-
70
- def _get_examples_from_dir(dir_path: str) -> List[List[str]]:
71
- """
72
- 从本地目录读取文件名,拼出远程直链 URL(不下载、不转码),用于 <img src="URL"> 直接渲染。
73
- 你原来使用的 BOS 基础路径保留。
74
- """
75
- BASE_URL = "https://paddle-model-ecology.bj.bcebos.com/PPOCRVL/dataset/examples"
76
- supported_exts = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
77
- examples = []
78
- if not os.path.exists(dir_path):
79
- print(f"Warning: example dir {dir_path} not found.")
80
- return []
81
- for filename in sorted(os.listdir(dir_path)):
82
- ext = os.path.splitext(filename)[1].lower()
83
- if ext in supported_exts:
84
- subdir = os.path.basename(dir_path.rstrip("/"))
85
- img_url = f"{BASE_URL}/{subdir}/{filename}"
86
- examples.append([img_url])
87
- return examples
88
-
89
- TARGETED_EXAMPLES_DIR = "examples/targeted"
90
- COMPLEX_EXAMPLES_DIR = "examples/complex"
91
- targeted_recognition_examples = _get_examples_from_dir(TARGETED_EXAMPLES_DIR)
92
- complex_document_examples = _get_examples_from_dir(COMPLEX_EXAMPLES_DIR)
93
-
94
- # =========================
95
- # UI Helpers(URL直链渲染)
96
- # =========================
97
- def render_uploaded_image_div(path_or_url: str) -> str:
98
- """
99
- 支持两种输入:
100
- - 远程 URL:直接用 <img src="URL"> 渲染
101
- - 本地文件:为兼容旧逻辑,依然转 data: URL 预览(也可以改为 File 组件,这里先保持一致)
102
- """
103
- if not path_or_url:
104
- return ""
105
- is_url = isinstance(path_or_url, str) and path_or_url.startswith(("http://", "https://"))
106
- if is_url:
107
- src = path_or_url # 直接远程URL
108
- else:
109
- # Check for PDF to avoid attempting Base64 conversion on large files
110
- if path_or_url.lower().endswith('.pdf'):
111
- return f"""<div style="text-align:center; padding: 20px; color:#888;">PDF file uploaded. Click 'Extract Bounding Boxes & LaTeX' to process page 0.</div>"""
112
- src = image_to_base64_data_url(path_or_url) # 本地上传时的兼容
113
-
114
- return f"""
115
- <div class="uploaded-image">
116
- <img src="{src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
117
- </div>
118
- """
119
-
120
- def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
121
- if path_or_url:
122
- html_content = render_uploaded_image_div(path_or_url)
123
- return gr.update(value=html_content, visible=True)
124
- else:
125
- return gr.update(value="", visible=False)
126
-
127
- # =========================
128
- # API 调用逻辑(支持URL或本地文件)
129
- # =========================
130
- def _file_to_b64_image_only(path_or_url: str) -> Tuple[str, int]:
131
- """
132
- 输入可以是本地文件路径或远程URL。
133
- - URL:仅在发请求给后端时下载字节转Base64(不影响前端渲染)。
134
- - 本地:读取文件字节。
135
- """
136
- if not path_or_url:
137
- raise ValueError("Please upload an image first.")
138
-
139
- is_url = isinstance(path_or_url, str) and path_or_url.startswith(("http://", "https://"))
140
- content: bytes
141
- if is_url:
142
- r = requests.get(path_or_url, timeout=600)
143
- r.raise_for_status()
144
- content = r.content
145
- ext = os.path.splitext(urlparse(path_or_url).path)[1].lower()
146
- else:
147
- ext = os.path.splitext(path_or_url)[1].lower()
148
- with open(path_or_url, "rb") as f:
149
- content = f.read()
150
-
151
- # 放宽后缀限制:有些URL可能没有后缀,这里仅在极端情况下提示
152
- supported = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
153
- if ext and (ext not in supported):
154
- print(f"Warning: file extension {ext} not in supported set {supported}, continue anyway.")
155
-
156
- return base64.b64encode(content).decode("utf-8"), 1 # 1 = image 类型
157
-
158
- def _call_api(api_url: str, path_or_url: str, use_layout_detection: bool,
159
- prompt_label: Optional[str], use_chart_recognition: bool = False,
160
- use_doc_unwarping: bool = True, use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
161
- b64, file_type = _file_to_b64_image_only(path_or_url)
162
- payload = {
163
- "file": b64,
164
- "useLayoutDetection": bool(use_layout_detection),
165
- "fileType": file_type,
166
- "useDocUnwarping": use_doc_unwarping,
167
- "useDocOrientationClassify": use_doc_orientation_classify
168
- }
169
- if not use_layout_detection:
170
- if not prompt_label:
171
- raise ValueError("Please select a recognition type.")
172
- payload["promptLabel"] = prompt_label.strip().lower()
173
- if use_layout_detection and use_chart_recognition:
174
- payload["useChartRecognition"] = True
175
-
176
- try:
177
- print(f"Sending API request to {api_url}...")
178
- start_time = time.time()
179
- resp = requests.post(api_url, json=payload, headers=JSON_HEADERS, timeout=600)
180
- end_time = time.time()
181
- print(f"Received API response in {end_time - start_time:.2f} seconds.")
182
- resp.raise_for_status()
183
- data = resp.json()
184
- except requests.exceptions.RequestException as e:
185
- raise gr.Error(f"API request failed: {e}")
186
- except json.JSONDecodeError:
187
- raise gr.Error(f"Invalid JSON response from server:\n{getattr(resp, 'text', '')}")
188
-
189
- if data.get("errorCode", -1) != 0:
190
- raise gr.Error("API returned an error:")
191
- return data
192
-
193
- def _process_api_response_page(result: Dict[str, Any]) -> Tuple[str, str, str]:
194
- """
195
- 处理后端返回结果:
196
- 1) 把 markdown 里的占位图路径替换为真实URL
197
- 2) 构造一个可视化<img>(如果有)
198
- """
199
- layout_results = (result or {}).get("layoutParsingResults", [])
200
- if not layout_results:
201
- return "No content was recognized.", "<p>No visualization available.</p>", ""
202
-
203
- page0 = layout_results[0] or {}
204
- md_data = page0.get("markdown") or {}
205
- md_text = md_data.get("text", "") or ""
206
- md_images_map = md_data.get("images", {})
207
-
208
- if md_images_map:
209
- for placeholder_path, image_url in md_images_map.items():
210
- md_text = md_text.replace(f'src="{placeholder_path}"', f'src="{image_url}"') \
211
- .replace(f']({placeholder_path})', f']({image_url})')
212
-
213
- output_html = "<p style='text-align:center; color:#888;'>No visualization image available.</p>"
214
- out_imgs = page0.get("outputImages") or {}
215
- sorted_urls = [img_url for _, img_url in sorted(out_imgs.items()) if img_url]
216
-
217
- output_image_url: Optional[str] = None
218
- if len(sorted_urls) >= 2:
219
- output_image_url = sorted_urls[1]
220
- elif sorted_urls:
221
- output_image_url = sorted_urls[0]
222
-
223
- if output_image_url:
224
- print(f"Found visualization image URL: {output_image_url}")
225
- output_html = f'<img src="{output_image_url}" alt="Detection Visualization" loading="lazy">'
226
-
227
- md_text = _escape_inequalities_in_math(md_text)
228
- return md_text or "(Empty result)", output_html, md_text
229
-
230
- def handle_complex_doc(path_or_url: str, use_chart_recognition: bool, use_doc_unwarping: bool, use_doc_orientation_classify: bool) -> Tuple[str, str, str]:
231
- if not path_or_url:
232
- raise gr.Error("Please upload an image first.")
233
- data = _call_api(DEFAULT_API_URL, path_or_url, use_layout_detection=True,
234
- prompt_label=None, use_chart_recognition=use_chart_recognition,
235
- use_doc_unwarping=use_doc_unwarping, use_doc_orientation_classify=use_doc_orientation_classify)
236
- result = data.get("result", {})
237
- return _process_api_response_page(result)
238
-
239
- def handle_targeted_recognition(path_or_url: str, prompt_choice: str) -> Tuple[str, str]:
240
- if not path_or_url:
241
- raise gr.Error("Please upload an image first.")
242
- mapping = {
243
- "Text Recognition": "ocr",
244
- "Formula Recognition": "formula",
245
- "Table Recognition": "table",
246
- "Chart Recognition": "chart",
247
- }
248
- label = mapping.get(prompt_choice, "ocr")
249
- data = _call_api(DEFAULT_API_URL, path_or_url, use_layout_detection=False, prompt_label=label, use_doc_unwarping=False, use_doc_orientation_classify=False)
250
- result = data.get("result", {})
251
- md_preview, _, md_raw = _process_api_response_page(result)
252
- return md_preview, md_raw
253
-
254
- # =========================================================
255
- # NEW LOGIC FOR PDF, BOUNDING BOXES, AND LATEX EXTRACTION
256
- # =========================================================
257
-
258
- def _pdf_to_page_image(pdf_path: str, page_num: int) -> Image.Image:
259
- """Converts a specific PDF page to a PIL Image."""
260
- try:
261
- # Use a high DPI for better quality OCR/BB detection
262
- pages = convert_from_path(pdf_path, dpi=300, first_page=page_num + 1, last_page=page_num + 1)
263
- if not pages:
264
- raise ValueError(f"Could not convert page {page_num} of PDF.")
265
- return pages[0]
266
- except Exception as e:
267
- raise gr.Error(f"Error processing PDF with pdf2image (Is Poppler installed?): {e}")
268
-
269
- def _draw_boxes_on_image(img: Image.Image, boxes_data: List[Dict]) -> str:
270
- """Draws bounding boxes onto the PIL Image and returns a data URL."""
271
- draw = ImageDraw.Draw(img)
272
-
273
- # You might need to adjust the font path based on your system
274
- try:
275
- font = ImageFont.truetype("arial.ttf", 16)
276
- except IOError:
277
- font = ImageFont.load_default()
278
-
279
- for item in boxes_data:
280
- # Expected bbox format: [x1, y1, x2, y2]
281
- bbox = item.get("bbox", [])
282
- text = item.get("text", "")
283
- item_type = item.get("type", "text")
284
-
285
- if len(bbox) == 4:
286
- x1, y1, x2, y2 = bbox
287
-
288
- if item_type == "formula":
289
- color = "red"
290
- draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=3)
291
- elif item_type == "table":
292
- color = "blue"
293
- draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=3)
294
- else: # Text/Other
295
- color = "green"
296
- draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=1)
297
-
298
- # Optionally draw text label
299
- draw.text((x1 + 5, y1 - 15), text[:20] + "...", fill=color, font=font)
300
-
301
- # Convert the resulting image back to a Base64 Data URL for Gradio HTML display
302
- buffered = io.BytesIO()
303
- img.save(buffered, format="PNG")
304
- img_str = base64.b64encode(buffered.getvalue()).decode()
305
- return f"data:image/png;base64,{img_str}"
306
-
307
- def handle_structured_extraction(pdf_path: Optional[str], page_num: int) -> Tuple[str, str, str]:
308
- if not pdf_path:
309
- raise gr.Error("Please upload a PDF file.")
310
- if not pdf_path.lower().endswith('.pdf'):
311
- raise gr.Error("File must be a PDF for this tab.")
312
-
313
- print(f"Processing PDF: {pdf_path}, Page: {page_num}")
314
-
315
- # --- 1. Convert PDF Page to Image ---
316
- try:
317
- page_img = _pdf_to_page_image(pdf_path, page_num)
318
- except Exception as e:
319
- return f"Error: {e}", "", json.dumps({"error": str(e)}, indent=2)
320
-
321
-
322
- # --- 2. Call Bounding Box API (Requires your backend implementation) ---
323
- # This is a conceptual call. Your API needs to handle the image input and return
324
- # the structured data, including bounding boxes and LaTeX.
325
-
326
- temp_img_path = tempfile.mktemp(suffix=".png")
327
- page_img.save(temp_img_path)
328
-
329
- try:
330
- # Use the file path for b64 conversion
331
- b64_img, _ = _file_to_b64_image_only(temp_img_path)
332
-
333
- payload = {
334
- "file": b64_img,
335
- "page_num": page_num, # Optional: may help the backend context
336
- "extract_mode": "structured_boxes" # Custom key for your backend
337
- }
338
-
339
- # NOTE: Using a separate, custom BB_API_URL is highly recommended
340
- resp = requests.post(BOUNDING_BOX_API_URL, json=payload, headers=JSON_HEADERS, timeout=600)
341
- resp.raise_for_status()
342
- structured_data = resp.json()
343
-
344
- except requests.exceptions.RequestException as e:
345
- return (f"API request failed (BB Extraction): {e}",
346
- "",
347
- json.dumps({"error": f"API request failed: {str(e)}", "url": BOUNDING_BOX_API_URL}, indent=2))
348
- finally:
349
- if os.path.exists(temp_img_path):
350
- os.remove(temp_img_path)
351
-
352
-
353
- # --- 3. Process Structured Data ---
354
-
355
- # Expected structure from your API:
356
- # structured_data = {
357
- # "elements": [
358
- # {"text": "The equation is:", "bbox": [100, 100, 500, 120], "type": "text"},
359
- # {"text": r"E=mc^2", "bbox": [200, 150, 400, 200], "type": "formula", "latex": r"$E=mc^2$"},
360
- # # ... other elements ...
361
- # ]
362
- # }
363
-
364
- elements = structured_data.get("elements", [])
365
-
366
- # Extract all formulas into a single LaTeX block
367
- all_latex = [
368
- item.get("latex") for item in elements
369
- if item.get("type") in ["formula", "equation"] and item.get("latex")
370
- ]
371
-
372
- latex_output = "\n\n".join(all_latex) if all_latex else "No formulas (LaTeX) found on this page."
373
-
374
- # --- 4. Draw Boxes for Visualization ---
375
- box_html = f'<img src="{_draw_boxes_on_image(page_img, elements)}" alt="Image with Bounding Boxes" loading="lazy">'
376
-
377
- # --- 5. Return Results ---
378
- return box_html, latex_output, json.dumps(structured_data, indent=2)
379
-
380
- def get_pdf_page_count(pdf_path):
381
- """Placeholder to get the total number of pages in the PDF."""
382
- if not pdf_path or not pdf_path.lower().endswith('.pdf'):
383
- return gr.update(maximum=0, value=0, interactive=False)
384
- try:
385
- # Quick way to get the number of pages using a temporary call
386
- images = convert_from_path(pdf_path, last_page=1, first_page=1, use_pdftocairo=True)
387
- # This is a bit inefficient, a proper PDF library like PyMuPDF (fitz) is better for metadata.
388
- # Fallback to a brute-force count for simplicity if metadata read fails:
389
- pages = convert_from_path(pdf_path, last_page=50, use_pdftocairo=True)
390
- count = len(pages)
391
- if count == 50:
392
- print("Warning: Page count capped at 50 for performance.")
393
- return gr.update(maximum=max(0, count - 1), value=0, interactive=True)
394
- except Exception as e:
395
- print(f"Warning: Could not determine PDF page count: {e}")
396
- return gr.update(maximum=0, value=0, interactive=False)
397
-
398
-
399
- # =========================
400
- # CSS & UI
401
- # =========================
402
- custom_css = """
403
- body, .gradio-container { font-family: "Noto Sans SC", "Microsoft YaHei", "PingFang SC", sans-serif; }
404
- .app-header { text-align: center; max-width: 900px; margin: 0 auto 8px !important; }
405
- .gradio-container { padding: 4px 0 !important; }
406
- .gradio-container [data-testid="tabs"], .gradio-container .tabs { margin-top: 0 !important; }
407
- .gradio-container [data-testid="tabitem"], .gradio-container .tabitem { padding-top: 4px !important; }
408
- .quick-links { text-align: center; padding: 8px 0; border: 1px solid #e5e7eb; border-radius: 8px; margin: 8px auto; max-width: 900px; }
409
- .quick-links a { margin: 0 12px; font-size: 14px; font-weight: 600; color: #3b82f6; text-decoration: none; }
410
- .quick-links a:hover { text-decoration: underline; }
411
- .prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
412
- .prompt-grid button { height: 40px !important; padding: 0 12px !important; border-radius: 8px !important; font-weight: 600 !important; font-size: 13px !important; letter-spacing: 0.2px; }
413
- #image_preview_vl, #image_preview_doc, #image_preview_pdf { height: 400px !important; overflow: auto; }
414
- #image_preview_vl img, #image_preview_doc img, #vis_image_doc img, #box_vis_html img { width: 100% !important; height: auto !important; object-fit: contain !important; display: block; }
415
- #md_preview_vl, #md_preview_doc { max-height: 540px; min-height: 180px; overflow: auto; scrollbar-gutter: stable both-edges; }
416
- #md_preview_vl .prose, #md_preview_doc .prose { line-height: 1.7 !important; }
417
- #md_preview_vl .prose img, #md_preview_doc .prose img { display: block; margin: 0 auto; max-width: 100%; height: auto; }
418
- .notice { margin: 8px auto 0; max-width: 900px; padding: 10px 12px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f8fafc; font-size: 14px; line-height: 1.6; }
419
- .notice strong { font-weight: 700; }
420
- .notice a { color: #3b82f6; text-decoration: none; }
421
- .notice a:hover { text-decoration: underline; }
422
- .checkbox-row .gradio-checkbox { flex-grow: 1; text-align: center; }
423
- """
424
-
425
- with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
426
- logo_data_url = image_to_base64_data_url(LOGO_IMAGE_PATH) if os.path.exists(LOGO_IMAGE_PATH) else ""
427
- gr.HTML(f"""<div class="app-header"><img src="{logo_data_url}" alt="App Logo" style="max-height:10%; width: auto; margin: 10px auto; display: block;"></div>""")
428
- gr.HTML("""<div class="notice"><strong>Heads up:</strong> The Hugging Face demo can be slow at times. For a faster experience, please try <a href="https://aistudio.baidu.com/application/detail/98365" target="_blank" rel="noopener noreferrer">Baidu AI Studio</a> or <a href="https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary" target="_blank" rel="noopener noreferrer">ModelScope</a>.</div>""")
429
-
430
- gr.HTML("""<div class="quick-links"><a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">GitHub</a> | <a href="https://ernie.baidu.com/blog/publication/PaddleOCR-VL_Technical_Report.pdf" target="_blank">Technical Report</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL" target="_blank">Model</a> | <a href="https://aistudio.baidu.com/paddleocr" target="_blank">Official Website</a></div>""")
431
-
432
- with gr.Tabs():
433
- # ===================== Tab 1: Document Parsing =====================
434
- with gr.Tab("Document Parsing"):
435
- with gr.Row():
436
- with gr.Column(scale=5):
437
- file_doc = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
438
- preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
439
- gr.Markdown("_( Use this mode for recognizing full-page documents with structured layouts, such as reports, papers, or magazines.)_")
440
- gr.Markdown("💡 *To recognize a single, pre-cropped element (e.g., a table or formula), switch to the 'Element-level Recognition' tab for better results.*")
441
-
442
- example_url_doc = gr.State(value=None)
443
-
444
- with gr.Row(variant="panel"):
445
- with gr.Column(scale=2):
446
- btn_parse = gr.Button("Parse Document", variant="primary")
447
- with gr.Column(scale=3):
448
- with gr.Row(elem_classes=["checkbox-row"]):
449
- chart_parsing_switch = gr.Checkbox(label="Enable chart parsing", value=False, min_width=10)
450
- doc_unwarping_switch = gr.Checkbox(label="Enable document unwarping", value=False, min_width=10)
451
- doc_orientation_switch = gr.Checkbox(label="Enable orientation classification", value=False, min_width=10)
452
-
453
- if complex_document_examples:
454
- complex_paths = [e[0] for e in complex_document_examples]
455
- complex_state = gr.State(complex_paths)
456
-
457
- gallery_complex = gr.Gallery(
458
- value=complex_paths, columns=4, height=400,
459
- preview=False, label="Example Documents (Select to Load)", allow_preview=False
460
- )
461
-
462
- def on_gallery_select_for_doc(paths, evt: gr.SelectData):
463
- idx = evt.index
464
- if isinstance(idx, (list, tuple)):
465
- idx = idx[0]
466
- try:
467
- url = paths[int(idx)]
468
- except Exception:
469
- raise gr.Error(f"Invalid index from gallery: {evt.index}")
470
-
471
- return url, update_preview_visibility(url)
472
-
473
- gallery_complex.select(
474
- fn=on_gallery_select_for_doc,
475
- inputs=[complex_state],
476
- outputs=[example_url_doc, preview_doc_html],
477
- )
478
-
479
- # ===================== 更新日志模块 =====================
480
- gr.Markdown("""
481
- <div class="notice">
482
- <h3>History Updates</h3>
483
- <ul>
484
- <li>
485
- <strong>Oct 30, 2025:</strong>
486
- Added two advanced control options under the "Document Parsing" tab. These features were previously enabled by default (set to true) but are now user-configurable and default to false.
487
- <ul>
488
- <li><strong>Enable document unwarping:</strong> Corrects distortions in bent or poorly photographed documents.</li>
489
- <li><strong>Enable orientation classification:</strong> Automatically corrects the orientation of rotated or upside-down images.</li>
490
- </ul>
491
- </li>
492
- <li>
493
- <strong>Oct 16, 2025:</strong> Initial release of the demo.
494
- </li>
495
- </ul>
496
- </div>
497
- """)
498
-
499
- with gr.Column(scale=7):
500
- with gr.Tabs():
501
- with gr.Tab("Markdown Preview"):
502
- md_preview_doc = gr.Markdown("Please upload an image and click 'Parse Document'.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_doc")
503
- with gr.Tab("Visualization"):
504
- vis_image_doc = gr.HTML(label="Detection Visualization", elem_id="vis_image_doc")
505
- with gr.Tab("Markdown Source"):
506
- md_raw_doc = gr.Code(label="Markdown Source Code", language="markdown")
507
-
508
- def on_file_doc_change(fp):
509
- return None, update_preview_visibility(fp)
510
-
511
- file_doc.change(fn=on_file_doc_change, inputs=[file_doc], outputs=[example_url_doc, preview_doc_html])
512
-
513
- def parse_doc_router(fp, example_url, use_chart, use_unwarping, use_orientation):
514
- src = fp if fp else example_url
515
- if not src:
516
- raise gr.Error("Please upload an image or pick an example first.")
517
- return handle_complex_doc(src, use_chart, use_unwarping, use_orientation)
518
-
519
- btn_parse.click(fn=parse_doc_router, inputs=[file_doc, example_url_doc, chart_parsing_switch, doc_unwarping_switch, doc_orientation_switch],
520
- outputs=[md_preview_doc, vis_image_doc, md_raw_doc])
521
-
522
- # ===================== Tab 2: Element-level Recognition =====================
523
- with gr.Tab("Element-level Recognition"):
524
- with gr.Row():
525
- with gr.Column(scale=5):
526
- file_vl = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
527
- preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
528
- gr.Markdown("_(Best for images with a **simple, single-column layout** (e.g., pure text), or for a **pre-cropped single element** like a table, formula, or chart.)_")
529
- gr.Markdown("Choose a recognition type:")
530
-
531
- with gr.Row(elem_classes=["prompt-grid"]):
532
- btn_ocr = gr.Button("Text Recognition", variant="secondary")
533
- btn_formula = gr.Button("Formula Recognition", variant="secondary")
534
- with gr.Row(elem_classes=["prompt-grid"]):
535
- btn_table = gr.Button("Table Recognition", variant="secondary")
536
- btn_chart = gr.Button("Chart Recognition", variant="secondary")
537
-
538
- example_url_vl = gr.State(value=None)
539
-
540
- if targeted_recognition_examples:
541
- targeted_paths = [e[0] for e in targeted_recognition_examples]
542
- targeted_state = gr.State(targeted_paths)
543
-
544
- gallery_targeted = gr.Gallery(
545
- value=targeted_paths, columns=4, height=400,
546
- preview=False, label="Example Elements (Select to Load)", allow_preview=False
547
- )
548
-
549
- def on_gallery_select_for_vl(paths, evt: gr.SelectData):
550
- idx = evt.index
551
- if isinstance(idx, (list, tuple)):
552
- idx = idx[0]
553
- try:
554
- url = paths[int(idx)]
555
- except Exception:
556
- raise gr.Error(f"Invalid index from gallery: {evt.index}")
557
- return url, update_preview_visibility(url)
558
-
559
- gallery_targeted.select(
560
- fn=on_gallery_select_for_vl,
561
- inputs=[targeted_state],
562
- outputs=[example_url_vl, preview_vl_html],
563
- )
564
-
565
- with gr.Column(scale=7):
566
- with gr.Tabs():
567
- with gr.Tab("Recognition Result"):
568
- md_preview_vl = gr.Markdown("Please upload an image and click a recognition type.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_vl")
569
- with gr.Tab("Raw Output"):
570
- md_raw_vl = gr.Code(label="Raw Output", language="markdown")
571
-
572
- def on_file_vl_change(fp):
573
- return None, update_preview_visibility(fp)
574
-
575
- file_vl.change(fn=on_file_vl_change, inputs=[file_vl], outputs=[example_url_vl, preview_vl_html])
576
-
577
- def parse_vl_router(fp, example_url, prompt_choice):
578
- src = fp if fp else example_url
579
- if not src:
580
- raise gr.Error("Please upload an image or pick an example first.")
581
- return handle_targeted_recognition(src, prompt_choice)
582
-
583
- btn_ocr.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Text Recognition")], outputs=[md_preview_vl, md_raw_vl])
584
- btn_formula.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Formula Recognition")], outputs=[md_preview_vl, md_raw_vl])
585
- btn_table.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Table Recognition")], outputs=[md_preview_vl, md_raw_vl])
586
- btn_chart.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Chart Recognition")], outputs=[md_preview_vl, md_raw_vl])
587
-
588
-
589
- # ===================== Tab 3: PDF & Structured Extraction (NEW) =====================
590
- with gr.Tab("PDF & Structured Extraction"):
591
- gr.Markdown("## 📑 PDF Bounding Box & LaTeX Extractor (New Feature)")
592
- gr.Markdown("Upload a PDF to extract word-level bounding boxes and full LaTeX code for formulas.")
593
-
594
- with gr.Row():
595
- with gr.Column(scale=5):
596
- file_pdf = gr.File(label="Upload PDF", file_count="single", type="filepath", file_types=[".pdf"], elem_id="file_pdf_input")
597
- preview_pdf_html = gr.HTML(value="", elem_id="image_preview_pdf", visible=False)
598
-
599
- page_selector = gr.Slider(
600
- minimum=0, maximum=0, step=1, value=0, label="Select Page (0-indexed)", interactive=False
601
- )
602
-
603
- btn_extract_boxes = gr.Button("Extract Bounding Boxes & LaTeX", variant="primary")
604
-
605
- with gr.Column(scale=7):
606
- with gr.Tabs():
607
- with gr.Tab("Image with Bounding Boxes"):
608
- box_vis_html = gr.HTML(label="Bounding Box Visualization", elem_id="box_vis_html", value="Upload a PDF and click the button to see the result.")
609
- with gr.Tab("Extracted LaTeX"):
610
- latex_output = gr.Markdown(label="Extracted LaTeX/Formulas", elem_id="latex_output", value="No LaTeX extracted yet.")
611
- with gr.Tab("Raw Structured Data"):
612
- raw_json_output = gr.Code(label="Raw Structured Output (JSON)", language="json", elem_id="raw_json_output")
613
-
614
- # Logic for PDF input
615
- def on_file_pdf_change(fp):
616
- # Update page selector when a new PDF is uploaded
617
- page_update = get_pdf_page_count(fp)
618
- # Update preview
619
- preview_update = update_preview_visibility(fp)
620
- return fp, page_update, preview_update
621
-
622
- file_pdf.change(
623
- fn=on_file_pdf_change,
624
- inputs=[file_pdf],
625
- outputs=[gr.State(value=None), page_selector, preview_pdf_html]
626
- )
627
-
628
- # Logic for processing
629
- btn_extract_boxes.click(
630
- fn=handle_structured_extraction,
631
- inputs=[file_pdf, page_selector],
632
- outputs=[box_vis_html, latex_output, raw_json_output]
633
- )
634
- # End of new tab
635
-
636
- if __name__ == "__main__":
637
- if not DEFAULT_API_URL:
638
- print("Warning: API_URL environment variable is not set. Demo will likely fail when calling the backend.")
639
- if not BOUNDING_BOX_API_URL:
640
- print("Warning: BB_API_URL is also not set. Structured extraction will fail.")
641
-
642
- port = int(os.getenv("PORT", "7860"))
643
  demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port, share=False)
 
1
+ import base64
2
+ import io
3
+ import json
4
+ import os
5
+ import re
6
+ import time
7
+ import tempfile
8
+ from typing import Dict, List, Tuple, Any, Optional
9
+ from urllib.parse import urlparse
10
+
11
+ import gradio as gr
12
+ import numpy as np
13
+ import requests
14
+ from PIL import Image, ImageDraw, ImageFont
15
+ from pdf2image import convert_from_path
16
+
17
# --- PaddleOCR integration ---
# The PP-StructureV3 pipeline is built exactly once, at import time, so that
# individual requests do not reload the models. The module-level handle stays
# None whenever the package cannot be imported or construction fails; the
# reason is printed to the console in either case.
# NOTE(review): confirm that the installed paddleocr release accepts the
# layout/table/ocr/show_log keyword arguments used below.
PADDLE_STRUCTURE_PIPELINE = None
try:
    from paddleocr import PPStructureV3, draw_structure_result

    PADDLE_STRUCTURE_PIPELINE = PPStructureV3(
        layout=True,
        table=True,
        ocr=True,
        show_log=False,  # keep the console clean
    )
    print("✅ Paddle Structure Model Initialized for Integrated Inference.")
except ImportError:
    # Package (or one of the two imported names) is unavailable.
    print("❌ PaddleOCR/PPStructureV3 not found. Inference will be disabled.")
except Exception as init_error:
    # Anything raised while constructing the pipeline lands here.
    print(f"❌ Error initializing PaddleOCR pipeline: {init_error}")
36
+
37
+
38
# =========================
# Config
# =========================
# The remote-API settings formerly read from the API_URL and TOKEN
# environment variables are obsolete and are no longer read here.
LOGO_IMAGE_PATH = "./assets/logo.jpg"
GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@400;700&display=swap' rel='stylesheet'>"

# Each row is (left delimiter, right delimiter, display flag); the list is
# expanded into dictionaries with "left" / "right" / "display" keys.
LATEX_DELIMS = [
    {"left": opening, "right": closing, "display": is_display}
    for opening, closing, is_display in (
        ("$$", "$$", True),
        ("$", "$", False),
        ("\\(", "\\)", False),
        ("\\[", "\\]", True),
    )
]

# Obsolete as well, but kept (empty) for file structure consistency.
AUTH_HEADER = {}
JSON_HEADERS = {}
54
+
55
+
56
+ # =========================
57
+ # Utility Functions
58
+ # =========================
59
+
60
+ def _ensure_local_path(path_or_url: str) -> str:
61
+ """Ensures the input is a local file path, downloading from URL if necessary."""
62
+ if not path_or_url:
63
+ raise ValueError("Input path is empty.")
64
+
65
+ is_url = path_or_url.startswith(("http://", "https://"))
66
+ if not is_url:
67
+ return path_or_url # Already local file
68
+
69
+ # Download remote URL to a temporary file
70
+ try:
71
+ r = requests.get(path_or_url, timeout=600)
72
+ r.raise_for_status()
73
+
74
+ # Use filename extension if available, otherwise default to .jpg
75
+ ext = os.path.splitext(urlparse(path_or_url).path)[1].lower() or '.jpg'
76
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
77
+ temp_file.write(r.content)
78
+ temp_file.close()
79
+ return temp_file.name
80
+ except Exception as e:
81
+ raise gr.Error(f"Error downloading image from URL: {e}")
82
+
83
+
84
def image_to_base64_data_url(filepath: str) -> str:
    """Encode a local image file as a Base64 ``data:`` URL for HTML rendering.

    The MIME type is chosen from the file extension and falls back to
    ``image/jpeg`` for unknown extensions.

    Returns:
        The data URL, or ``""`` when *filepath* is empty, is not a string,
        names a PDF, or cannot be read.
    """
    if not isinstance(filepath, str) or not filepath:
        return ""
    # PDFs are never inlined: they can be huge and are not images.
    if filepath.lower().endswith(".pdf"):
        return ""

    mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }
    ext = os.path.splitext(filepath)[1].lower()
    mime_type = mime_types.get(ext, "image/jpeg")

    try:
        with open(filepath, "rb") as image_file:
            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
    except (OSError, ValueError):
        # Best effort: an unreadable or invalid path simply yields no preview.
        return ""
    return f"data:{mime_type};base64,{encoded_string}"
100
+
101
+ def _to_html_img(pil_img: Image.Image) -> str:
102
+ """Converts a PIL Image to a Base64 data URL string for HTML display."""
103
+ buffered = io.BytesIO()
104
+ pil_img.save(buffered, format="PNG")
105
+ img_str = base64.b64encode(buffered.getvalue()).decode()
106
+ return f'data:image/png;base64,{img_str}'
107
+
108
+
109
+ def _escape_inequalities_in_math(md: str) -> str:
110
+ """Escapes < and > inside math blocks to prevent markdown misinterpretation."""
111
+ _MATH_PATTERNS = [
112
+ re.compile(r"\$\$([\s\S]+?)\$\$"),
113
+ re.compile(r"\$([^\$]+?)\$"),
114
+ re.compile(r"\\\[([\s\S]+?)\\\]"),
115
+ re.compile(r"\\\(([\s\S]+?)\\\)"),
116
+ ]
117
+
118
+ def fix(s: str) -> str:
119
+ s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
120
+ s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
121
+ s = s.replace("<", r" \lt ").replace(">", r" \gt ")
122
+ return s
123
+
124
+ for pat in _MATH_PATTERNS:
125
+ md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
126
+ return md
127
+
128
+ def _get_examples_from_dir(dir_path: str) -> List[List[str]]:
129
+ """Loads example URLs (unchanged)."""
130
+ BASE_URL = "https://paddle-model-ecology.bj.bcebos.com/PPOCRVL/dataset/examples"
131
+ supported_exts = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
132
+ examples = []
133
+ if not os.path.exists(dir_path):
134
+ print(f"Warning: example dir {dir_path} not found.")
135
+ return []
136
+ for filename in sorted(os.listdir(dir_path)):
137
+ ext = os.path.splitext(filename)[1].lower()
138
+ if ext in supported_exts:
139
+ subdir = os.path.basename(dir_path.rstrip("/"))
140
+ img_url = f"{BASE_URL}/{subdir}/{filename}"
141
+ examples.append([img_url])
142
+ return examples
143
+
144
# Local directories whose file names select the remote example images.
TARGETED_EXAMPLES_DIR = "examples/targeted"
COMPLEX_EXAMPLES_DIR = "examples/complex"
# Example URL lists, computed once at import; empty when a directory is absent.
targeted_recognition_examples = _get_examples_from_dir(TARGETED_EXAMPLES_DIR)
complex_document_examples = _get_examples_from_dir(COMPLEX_EXAMPLES_DIR)
148
+
149
+ # =========================
150
+ # UI Helpers
151
+ # =========================
152
def render_uploaded_image_div(path_or_url: str) -> str:
    """Build the preview HTML for an uploaded file or example URL.

    Returns an empty string for empty input or when a local image cannot be
    encoded, a text placeholder for PDFs, and an ``<img>`` wrapper otherwise.
    Remote URLs are referenced directly; local files are inlined as data URLs.
    """
    if not path_or_url:
        return ""

    # PDFs get a hint instead of an image preview.
    if path_or_url.lower().endswith('.pdf'):
        return """<div style="text-align:center; padding: 20px; color:#888;">PDF file loaded. Use the page selector and click 'Extract...' to process.</div>"""

    if path_or_url.startswith(("http://", "https://")):
        src = path_or_url
    else:
        src = image_to_base64_data_url(path_or_url)

    if not src:
        return ""  # Local image could not be converted to Base64

    return f"""
    <div class="uploaded-image">
        <img src="{src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
    </div>
    """
172
+
173
def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
    """Return a Gradio update that shows the preview for a file, or hides it.

    With a non-empty *path_or_url* the preview component receives the rendered
    HTML and becomes visible; otherwise it is cleared and hidden.
    """
    if not path_or_url:
        return gr.update(value="", visible=False)

    preview_markup = render_uploaded_image_div(path_or_url)
    return gr.update(value=preview_markup, visible=True)
179
+
180
+
181
+ # =========================
182
+ # Core Inference Logic (Replaces API Calls)
183
+ # =========================
184
+
185
def _run_paddle_structure(local_path: str, is_doc_parsing: bool = True) -> Tuple[str, str, str]:
    """Run the global structure pipeline on one image and format the output.

    Args:
        local_path: Path of an image file on disk (PDFs are rejected by the
            callers before this point).
        is_doc_parsing: Accepted for call-site symmetry; currently unused.

    Returns:
        ``(markdown, visualization_html, raw_json)``.  Only the first entry of
        the pipeline's result list is used.

    Raises:
        gr.Error: if the pipeline failed to initialise at import time.
    """
    if PADDLE_STRUCTURE_PIPELINE is None:
        raise gr.Error("PaddleOCR model is not loaded. Please check model initialization logs.")

    # 1. Run prediction
    start_time = time.time()
    result_list = PADDLE_STRUCTURE_PIPELINE.predict(local_path)
    elapsed = time.time() - start_time
    print(f"PaddleOCR Structure inference completed in {elapsed:.2f} seconds.")

    if not result_list:
        return "No content recognized.", "<p>No visualization available.</p>", "{}"

    # We only process the first page/image in the list
    result = result_list[0]

    # 2. Markdown output
    # NOTE(review): assumes the result object exposes to_markdown()/to_dict();
    # confirm against the installed paddleocr version.
    md_text = result.to_markdown()

    # 3. Visualization image.  Open inside a context manager so the underlying
    # file handle is released as soon as the RGB copy has been made.
    with Image.open(local_path) as source_image:
        image = source_image.convert('RGB')
    try:
        vis_image = draw_structure_result(image, result, font_path="arial.ttf")
    except Exception:
        # Fallback if the font isn't found
        vis_image = draw_structure_result(image, result)

    output_html = f'<img src="{_to_html_img(vis_image)}" alt="Detection Visualization" loading="lazy">'

    # 4. Raw JSON output
    raw_json = json.dumps(result.to_dict(), indent=2, ensure_ascii=False)

    # Only escape when there is text; a None/empty result falls through to the
    # placeholder below instead of crashing the regex substitution.
    md_text = _escape_inequalities_in_math(md_text) if md_text else ""
    return md_text or "(Empty result)", output_html, raw_json
227
+
228
+ # --- Inference Handlers for Tabs 1 & 2 ---
229
+
230
def handle_complex_doc(path_or_url: str, use_chart_recognition: bool, use_doc_unwarping: bool, use_doc_orientation_classify: bool) -> Tuple[str, str, str]:
    """Parse a full-page document image for the "Document Parsing" tab.

    Args:
        path_or_url: Local image path or http(s) URL of an example image.
        use_chart_recognition, use_doc_unwarping, use_doc_orientation_classify:
            UI switches; they are accepted but not forwarded, because this
            handler calls the pipeline without per-request options.

    Returns:
        ``(markdown, visualization_html, raw_json)`` from
        ``_run_paddle_structure``.

    Raises:
        gr.Error: if no input was given or the input is a PDF.
    """
    if not path_or_url:
        raise gr.Error("Please upload an image first.")

    local_path = _ensure_local_path(path_or_url)
    # A differing path means _ensure_local_path downloaded into a temp file
    # that this handler now owns and must delete.
    is_temp_download = local_path != path_or_url
    try:
        if local_path.lower().endswith('.pdf'):
            raise gr.Error("Document Parsing tab requires an image, not a PDF.")
        return _run_paddle_structure(local_path, is_doc_parsing=True)
    finally:
        if is_temp_download:
            try:
                os.remove(local_path)
            except OSError:
                pass  # Best-effort cleanup; never mask the real outcome.
243
+
244
+
245
def handle_targeted_recognition(path_or_url: str, prompt_choice: str) -> Tuple[str, str]:
    """Recognise a single element image for the "Element-level Recognition" tab.

    Args:
        path_or_url: Local image path or http(s) URL of an example image.
        prompt_choice: Label of the button that was clicked (e.g.
            "Text Recognition").  It is accepted for interface compatibility
            but does not influence inference: the same full structure pass is
            run for every choice.

    Returns:
        ``(markdown_preview, raw_output)`` where *raw_output* is the third
        value returned by ``_run_paddle_structure`` (its raw JSON string).

    Raises:
        gr.Error: if no input was given or the input is a PDF.
    """
    if not path_or_url:
        raise gr.Error("Please upload an image first.")

    local_path = _ensure_local_path(path_or_url)
    # A differing path means a temp download that must be removed afterwards.
    is_temp_download = local_path != path_or_url
    try:
        if local_path.lower().endswith('.pdf'):
            raise gr.Error("Element-level Recognition tab requires an image, not a PDF.")

        # The visualization HTML (second value) is not shown on this tab.
        md_preview, _, raw_output = _run_paddle_structure(local_path, is_doc_parsing=False)
        return md_preview, raw_output
    finally:
        if is_temp_download:
            try:
                os.remove(local_path)
            except OSError:
                pass  # Best-effort cleanup; never mask the real outcome.
270
+
271
+
272
+ # --- Inference Handler for Tab 3: PDF & Structured Extraction ---
273
+
274
def _pdf_to_page_image(pdf_path: str, page_num: int) -> Image.Image:
    """Render one page of a PDF to a PIL image at 300 dpi.

    Args:
        pdf_path: Path of the PDF file.
        page_num: Zero-based page index.

    Raises:
        gr.Error: if the index is negative, the page cannot be rendered, or
            pdf2image fails.
    """
    try:
        # The value comes from a UI slider and may arrive as a float
        # (presumably 3.0 rather than 3) — normalise before building the range.
        page_index = int(page_num)
        if page_index < 0:
            raise ValueError(f"Page index must be non-negative, got {page_num}.")

        one_based = page_index + 1  # convert_from_path counts pages from 1
        pages = convert_from_path(pdf_path, dpi=300, first_page=one_based, last_page=one_based)
        if not pages:
            raise ValueError(f"Could not convert page {page_num} of PDF.")
        return pages[0]
    except Exception as e:
        raise gr.Error(f"Error processing PDF with pdf2image (Is Poppler installed?): {e}") from e
283
+
284
def _draw_boxes_on_image(img: Image.Image, elements: List[Dict]) -> str:
    """Draw element bounding boxes on *img* and return it as a PNG data URL.

    *img* is modified in place.  Each element is expected to carry a ``box``
    of four numbers ``[x1, y1, x2, y2]`` and a ``type``; elements without a
    four-value box are skipped.
    NOTE(review): the ``box``/``type`` key names are assumptions about the
    pipeline's result schema — confirm against real output.
    """
    # (outline colour, line width) per element type; anything else is "text".
    box_styles = {
        "figure": ("purple", 3),
        "title": ("purple", 3),
        "table": ("red", 2),
        "formula": ("red", 2),
    }
    default_style = ("green", 1)

    draw = ImageDraw.Draw(img)
    for item in elements or []:
        bbox = item.get("box")
        if bbox is None or len(bbox) != 4:
            continue

        x1, y1, x2, y2 = bbox
        # Normalise corner order: rectangle() rejects boxes whose second
        # corner lies above/left of the first.
        left, right = min(x1, x2), max(x1, x2)
        top, bottom = min(y1, y2), max(y1, y2)

        color, width = box_styles.get(item.get("type", "text"), default_style)
        draw.rectangle([(left, top), (right, bottom)], outline=color, width=width)

    return _to_html_img(img)
319
+
320
def handle_structured_extraction(pdf_path: Optional[str], page_num: int) -> Tuple[str, str, str]:
    """Extract boxes and formulas from one PDF page for the PDF tab.

    The page is rendered to an image, written to a temporary PNG, run through
    the global structure pipeline, and the temporary file is always removed.

    Returns:
        ``(box_visualization_html, latex_markdown, raw_json)``.  If the page
        cannot be rendered, the three slots carry error text instead.

    Raises:
        gr.Error: if the pipeline is not loaded, the input is not a PDF, or
            inference fails.
    """
    if PADDLE_STRUCTURE_PIPELINE is None:
        raise gr.Error("PaddleOCR model is not loaded.")

    if not pdf_path or not pdf_path.lower().endswith('.pdf'):
        raise gr.Error("Please upload a PDF file for this feature.")

    print(f"Processing PDF: {pdf_path}, Page: {page_num}")

    # --- 1. Convert PDF page to image ---
    try:
        page_img = _pdf_to_page_image(pdf_path, page_num)
    except Exception as e:
        return f"Error: {e}", "Error during PDF conversion.", json.dumps({"error": str(e)}, indent=2)

    # --- 2. Reserve a temp file for the pipeline ---
    # mkstemp creates the file atomically; mktemp only returns a name and is
    # open to a race between choosing the name and creating the file.
    fd, temp_img_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)

    try:
        page_img.save(temp_img_path)

        # --- 3. Run inference ---
        result_list = PADDLE_STRUCTURE_PIPELINE.predict(temp_img_path)
        if not result_list:
            return "No content recognized on this PDF page.", "", "{}"

        # --- 4. Process results ---
        # NOTE(review): assumes to_dict() yields a mapping whose "res" entry
        # is a list of element dicts — confirm against real output.
        result_dict = result_list[0].to_dict()
        elements = result_dict.get("res", [])

        # Formula elements are wrapped in $$ so the Markdown pane renders them.
        all_latex = [
            f"$${item['text']}$$"
            for item in elements
            if item.get("type") == "formula" and item.get("text")
        ]
        latex_output = "\n\n".join(all_latex) if all_latex else "No formulas (LaTeX) found on this page."

        # --- 5. Draw boxes for visualization ---
        box_html = f'<img src="{_draw_boxes_on_image(page_img, elements)}" alt="Image with Bounding Boxes" loading="lazy">'

        # --- 6. Return results ---
        return box_html, latex_output, json.dumps(result_dict, indent=2, ensure_ascii=False)

    except Exception as e:
        raise gr.Error(f"PaddleOCR inference failed during PDF processing: {e}") from e
    finally:
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)
370
+
371
def get_pdf_page_count(pdf_path):
    """Return a Gradio update that sizes the page slider to the PDF.

    For a readable PDF the slider becomes interactive with range
    ``0 .. page_count - 1``; for anything else (no file, not a PDF, or a
    failure reading it) the slider is reset to 0 and disabled.
    """
    if not pdf_path or not pdf_path.lower().endswith('.pdf'):
        return gr.update(maximum=0, value=0, interactive=False)
    try:
        # Read the page count from the PDF metadata instead of rasterising
        # every page just to count them.
        from pdf2image import pdfinfo_from_path

        count = int(pdfinfo_from_path(pdf_path)["Pages"])
        return gr.update(maximum=max(0, count - 1), value=0, interactive=True)
    except Exception as e:
        print(f"Warning: Could not determine PDF page count: {e}")
        return gr.update(maximum=0, value=0, interactive=False)
383
+
384
+
385
+ # =========================
386
+ # CSS & UI (Unchanged)
387
+ # =========================
388
# Stylesheet passed to gr.Blocks(css=...).  The #id selectors correspond to
# the elem_id values given to components in the UI definition in this file.
custom_css = """
body, .gradio-container { font-family: "Noto Sans SC", "Microsoft YaHei", "PingFang SC", sans-serif; }
.app-header { text-align: center; max-width: 900px; margin: 0 auto 8px !important; }
.gradio-container { padding: 4px 0 !important; }
.gradio-container [data-testid="tabs"], .gradio-container .tabs { margin-top: 0 !important; }
.gradio-container [data-testid="tabitem"], .gradio-container .tabitem { padding-top: 4px !important; }
.quick-links { text-align: center; padding: 8px 0; border: 1px solid #e5e7eb; border-radius: 8px; margin: 8px auto; max-width: 900px; }
.quick-links a { margin: 0 12px; font-size: 14px; font-weight: 600; color: #3b82f6; text-decoration: none; }
.quick-links a:hover { text-decoration: underline; }
.prompt-grid { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 6px; }
.prompt-grid button { height: 40px !important; padding: 0 12px !important; border-radius: 8px !important; font-weight: 600 !important; font-size: 13px !important; letter-spacing: 0.2px; }
#image_preview_vl, #image_preview_doc, #image_preview_pdf { height: 400px !important; overflow: auto; }
#image_preview_vl img, #image_preview_doc img, #vis_image_doc img, #box_vis_html img { width: 100% !important; height: auto !important; object-fit: contain !important; display: block; }
#md_preview_vl, #md_preview_doc { max-height: 540px; min-height: 180px; overflow: auto; scrollbar-gutter: stable both-edges; }
#md_preview_vl .prose, #md_preview_doc .prose { line-height: 1.7 !important; }
#md_preview_vl .prose img, #md_preview_doc .prose img { display: block; margin: 0 auto; max-width: 100%; height: auto; }
.notice { margin: 8px auto 0; max-width: 900px; padding: 10px 12px; border: 1px solid #e5e7eb; border-radius: 8px; background: #f8fafc; font-size: 14px; line-height: 1.6; }
.notice strong { font-weight: 700; }
.notice a { color: #3b82f6; text-decoration: none; }
.notice a:hover { text-decoration: underline; }
.checkbox-row .gradio-checkbox { flex-grow: 1; text-align: center; }
"""
410
+
411
# Gradio UI: a header, two notices, and three tabs (full-document parsing,
# single-element recognition, and per-page PDF extraction).
with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
    # Inline the logo as a data URL when the asset exists; otherwise src is empty.
    logo_data_url = image_to_base64_data_url(LOGO_IMAGE_PATH) if os.path.exists(LOGO_IMAGE_PATH) else ""
    gr.HTML(f"""<div class="app-header"><img src="{logo_data_url}" alt="App Logo" style="max-height:10%; width: auto; margin: 10px auto; display: block;"></div>""")
    gr.HTML("""<div class="notice"><strong>Heads up:</strong> The Hugging Face demo can be slow at times. For a faster experience, please try <a href="https://aistudio.baidu.com/application/detail/98365" target="_blank" rel="noopener noreferrer">Baidu AI Studio</a> or <a href="https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary" target="_blank" rel="noopener noreferrer">ModelScope</a>.</div>""")

    gr.HTML("""<div class="quick-links"><a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">GitHub</a> | <a href="https://ernie.baidu.com/blog/publication/PaddleOCR-VL_Technical_Report.pdf" target="_blank">Technical Report</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL" target="_blank">Model</a> | <a href="https://aistudio.baidu.com/paddleocr" target="_blank">Official Website</a></div>""")

    with gr.Tabs():
        # ===================== Tab 1: Document Parsing =====================
        with gr.Tab("Document Parsing"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_doc = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
                    preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
                    gr.Markdown("_( Use this mode for recognizing full-page documents with structured layouts, such as reports, papers, or magazines.)_")
                    gr.Markdown("💡 *To recognize a single, pre-cropped element (e.g., a table or formula), switch to the 'Element-level Recognition' tab for better results.*")

                    # Holds the URL of the gallery example picked by the user, if any.
                    example_url_doc = gr.State(value=None)

                    with gr.Row(variant="panel"):
                        with gr.Column(scale=2):
                            btn_parse = gr.Button("Parse Document", variant="primary")
                        with gr.Column(scale=3):
                            with gr.Row(elem_classes=["checkbox-row"]):
                                chart_parsing_switch = gr.Checkbox(label="Enable chart parsing", value=False, min_width=10)
                                doc_unwarping_switch = gr.Checkbox(label="Enable document unwarping", value=False, min_width=10)
                                doc_orientation_switch = gr.Checkbox(label="Enable orientation classification", value=False, min_width=10)

                    # The example gallery is only built when example URLs were found.
                    if complex_document_examples:
                        complex_paths = [e[0] for e in complex_document_examples]
                        complex_state = gr.State(complex_paths)

                        gallery_complex = gr.Gallery(
                            value=complex_paths, columns=4, height=400,
                            preview=False, label="Example Documents (Select to Load)", allow_preview=False
                        )

                        def on_gallery_select_for_doc(paths, evt: gr.SelectData):
                            """Map a gallery click to (example URL, preview update)."""
                            idx = evt.index
                            # The event index may be a sequence; use its first entry.
                            if isinstance(idx, (list, tuple)):
                                idx = idx[0]
                            try:
                                url = paths[int(idx)]
                            except Exception:
                                raise gr.Error(f"Invalid index from gallery: {evt.index}")

                            return url, update_preview_visibility(url)

                        gallery_complex.select(
                            fn=on_gallery_select_for_doc,
                            inputs=[complex_state],
                            outputs=[example_url_doc, preview_doc_html],
                        )

                    gr.Markdown("""
                    <div class="notice">
                    <h3>History Updates</h3>
                    <ul>
                    <li><strong>Nov 4, 2025:</strong> Application converted to run PaddleOCR inference locally (integrated mode), removing API dependency.</li>
                    <li><strong>Oct 30, 2025:</strong> Added two advanced control options under the "Document Parsing" tab.</li>
                    <li><strong>Oct 16, 2025:</strong> Initial release of the demo.</li>
                    </ul>
                    </div>
                    """)

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("Markdown Preview"):
                            md_preview_doc = gr.Markdown("Please upload an image and click 'Parse Document'.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_doc")
                        with gr.Tab("Visualization"):
                            vis_image_doc = gr.HTML(label="Detection Visualization", elem_id="vis_image_doc")
                        with gr.Tab("Markdown Source"):
                            md_raw_doc = gr.Code(label="Markdown Source Code", language="markdown")

            def on_file_doc_change(fp):
                """Clear the selected example and refresh the preview for the upload."""
                return None, update_preview_visibility(fp)

            file_doc.change(fn=on_file_doc_change, inputs=[file_doc], outputs=[example_url_doc, preview_doc_html])

            def parse_doc_router(fp, example_url, use_chart, use_unwarping, use_orientation):
                """Prefer the uploaded file over the selected example, then parse it."""
                src = fp if fp else example_url
                if not src:
                    raise gr.Error("Please upload an image or pick an example first.")
                return handle_complex_doc(src, use_chart, use_unwarping, use_orientation)

            btn_parse.click(fn=parse_doc_router, inputs=[file_doc, example_url_doc, chart_parsing_switch, doc_unwarping_switch, doc_orientation_switch],
                            outputs=[md_preview_doc, vis_image_doc, md_raw_doc])

        # ===================== Tab 2: Element-level Recognition =====================
        with gr.Tab("Element-level Recognition"):
            with gr.Row():
                with gr.Column(scale=5):
                    file_vl = gr.File(label="Upload Image", file_count="single", type="filepath", file_types=["image"])
                    preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
                    gr.Markdown("_(Best for images with a **simple, single-column layout** (e.g., pure text), or for a **pre-cropped single element** like a table, formula, or chart.)_")
                    gr.Markdown("Choose a recognition type:")

                    with gr.Row(elem_classes=["prompt-grid"]):
                        btn_ocr = gr.Button("Text Recognition", variant="secondary")
                        btn_formula = gr.Button("Formula Recognition", variant="secondary")
                    with gr.Row(elem_classes=["prompt-grid"]):
                        btn_table = gr.Button("Table Recognition", variant="secondary")
                        btn_chart = gr.Button("Chart Recognition", variant="secondary")

                    # Holds the URL of the gallery example picked by the user, if any.
                    example_url_vl = gr.State(value=None)

                    # The example gallery is only built when example URLs were found.
                    if targeted_recognition_examples:
                        targeted_paths = [e[0] for e in targeted_recognition_examples]
                        targeted_state = gr.State(targeted_paths)

                        gallery_targeted = gr.Gallery(
                            value=targeted_paths, columns=4, height=400,
                            preview=False, label="Example Elements (Select to Load)", allow_preview=False
                        )

                        def on_gallery_select_for_vl(paths, evt: gr.SelectData):
                            """Map a gallery click to (example URL, preview update)."""
                            idx = evt.index
                            # The event index may be a sequence; use its first entry.
                            if isinstance(idx, (list, tuple)):
                                idx = idx[0]
                            try:
                                url = paths[int(idx)]
                            except Exception:
                                raise gr.Error(f"Invalid index from gallery: {evt.index}")
                            return url, update_preview_visibility(url)

                        gallery_targeted.select(
                            fn=on_gallery_select_for_vl,
                            inputs=[targeted_state],
                            outputs=[example_url_vl, preview_vl_html],
                        )

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("Recognition Result"):
                            md_preview_vl = gr.Markdown("Please upload an image and click a recognition type.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_vl")
                        with gr.Tab("Raw Output"):
                            md_raw_vl = gr.Code(label="Raw Output", language="markdown")

            def on_file_vl_change(fp):
                """Clear the selected example and refresh the preview for the upload."""
                return None, update_preview_visibility(fp)

            file_vl.change(fn=on_file_vl_change, inputs=[file_vl], outputs=[example_url_vl, preview_vl_html])

            def parse_vl_router(fp, example_url, prompt_choice):
                """Prefer the uploaded file over the selected example, then recognise it."""
                src = fp if fp else example_url
                if not src:
                    raise gr.Error("Please upload an image or pick an example first.")
                return handle_targeted_recognition(src, prompt_choice)

            # Each button passes its own label to the router through a constant gr.State.
            btn_ocr.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Text Recognition")], outputs=[md_preview_vl, md_raw_vl])
            btn_formula.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Formula Recognition")], outputs=[md_preview_vl, md_raw_vl])
            btn_table.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Table Recognition")], outputs=[md_preview_vl, md_raw_vl])
            btn_chart.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Chart Recognition")], outputs=[md_preview_vl, md_raw_vl])


        # ===================== Tab 3: PDF & Structured Extraction (NEW) =====================
        with gr.Tab("PDF & Structured Extraction"):
            gr.Markdown("## 📑 PDF Bounding Box & LaTeX Extractor")
            gr.Markdown("Upload a PDF to extract structured elements, visualize bounding boxes, and retrieve LaTeX code (Formulas) on a per-page basis.")

            with gr.Row():
                with gr.Column(scale=5):
                    file_pdf = gr.File(label="Upload PDF", file_count="single", type="filepath", file_types=[".pdf"], elem_id="file_pdf_input")
                    preview_pdf_html = gr.HTML(value="", elem_id="image_preview_pdf", visible=False)

                    # Starts disabled with range 0..0; resized when a PDF is uploaded.
                    page_selector = gr.Slider(
                        minimum=0, maximum=0, step=1, value=0, label="Select Page (0-indexed)", interactive=False
                    )

                    btn_extract_boxes = gr.Button("Extract Bounding Boxes & LaTeX", variant="primary")

                with gr.Column(scale=7):
                    with gr.Tabs():
                        with gr.Tab("Image with Bounding Boxes"):
                            box_vis_html = gr.HTML(label="Bounding Box Visualization", elem_id="box_vis_html", value="Upload a PDF and click the button to see the result.")
                        with gr.Tab("Extracted LaTeX"):
                            latex_output = gr.Markdown(label="Extracted LaTeX/Formulas", elem_id="latex_output", value="No LaTeX extracted yet.")
                        with gr.Tab("Raw Structured Data"):
                            raw_json_output = gr.Code(label="Raw Structured Output (JSON)", language="json", elem_id="raw_json_output")

            # Logic for PDF input
            def on_file_pdf_change(fp):
                """Return (page-slider update, preview update) for a newly chosen PDF."""
                # Update page selector when a new PDF is uploaded
                page_update = get_pdf_page_count(fp)
                # Update preview
                preview_update = update_preview_visibility(fp)
                return page_update, preview_update

            file_pdf.change(
                fn=on_file_pdf_change,
                inputs=[file_pdf],
                outputs=[page_selector, preview_pdf_html]
            )

            # Logic for processing
            btn_extract_boxes.click(
                fn=handle_structured_extraction,
                inputs=[file_pdf, page_selector],
                outputs=[box_vis_html, latex_output, raw_json_output]
            )
611
+
612
if __name__ == "__main__":
    # Listening port comes from the PORT environment variable (default 7860).
    port = int(os.getenv("PORT", "7860"))
    # Use queue() for better handling of long-running model inference
    demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port, share=False)