root committed on
Commit
51a671a
·
1 Parent(s): 58c51e0
Files changed (1) hide show
  1. app.py +190 -94
app.py CHANGED
@@ -8,14 +8,15 @@ import requests
8
  from PIL import Image
9
  import gradio as gr
10
  import re
11
-
 
12
 
13
  # =========================
14
  # Config
15
  # =========================
16
  DEFAULT_API_URL = os.environ.get("API_URL")
17
  TOKEN = os.environ.get("TOKEN")
18
- LOGO_IMAGE_PATH = './assets/logo.jpg'
19
  GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@400;700&display=swap' rel='stylesheet'>"
20
  LATEX_DELIMS = [
21
  {"left": "$$", "right": "$$", "display": True},
@@ -23,19 +24,18 @@ LATEX_DELIMS = [
23
  {"left": "\\(", "right": "\\)", "display": False},
24
  {"left": "\\[", "right": "\\]", "display": True},
25
  ]
26
- AUTH_HEADER = {"Authorization": f"bearer {TOKEN}"}
27
- JSON_HEADERS = {**AUTH_HEADER, "Content-Type": "application/json"}
28
-
29
 
30
  # =========================
31
- # Base64 and Example Loading Logic
32
  # =========================
33
  def image_to_base64_data_url(filepath: str) -> str:
34
- """Reads a local image file and encodes it into a Base64 Data URL."""
35
  try:
36
  ext = os.path.splitext(filepath)[1].lower()
37
- mime_types = {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png', '.gif': 'image/gif'}
38
- mime_type = mime_types.get(ext, 'image/jpeg')
39
  with open(filepath, "rb") as image_file:
40
  encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
41
  return f"data:{mime_type};base64,{encoded_string}"
@@ -44,77 +44,121 @@ def image_to_base64_data_url(filepath: str) -> str:
44
  return ""
45
 
46
  def _escape_inequalities_in_math(md: str) -> str:
47
- """
48
- Finds math blocks in a Markdown string and replaces < and > with
49
- their LaTeX equivalents, \lt and \gt, to prevent markdown parsing errors.
50
- """
51
  _MATH_PATTERNS = [
52
  re.compile(r"\$\$([\s\S]+?)\$\$"),
53
  re.compile(r"\$([^\$]+?)\$"),
54
  re.compile(r"\\\[([\s\S]+?)\\\]"),
55
  re.compile(r"\\\(([\s\S]+?)\\\)"),
56
  ]
 
57
  def fix(s: str) -> str:
58
  s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
59
  s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
60
- s = s.replace("<", r" \lt ").replace(">", r" \gt ")
61
  return s
 
62
  for pat in _MATH_PATTERNS:
63
  md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
64
  return md
65
 
66
  def _get_examples_from_dir(dir_path: str) -> List[List[str]]:
 
 
 
 
 
67
  supported_exts = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
68
  examples = []
69
- if not os.path.exists(dir_path): return []
 
 
70
  for filename in sorted(os.listdir(dir_path)):
71
- if os.path.splitext(filename)[1].lower() in supported_exts:
72
- examples.append([os.path.join(dir_path, filename)])
 
 
 
73
  return examples
74
 
 
 
 
 
 
 
 
 
 
 
75
  TARGETED_EXAMPLES_DIR = "examples/targeted"
76
  COMPLEX_EXAMPLES_DIR = "examples/complex"
77
  targeted_recognition_examples = _get_examples_from_dir(TARGETED_EXAMPLES_DIR)
78
  complex_document_examples = _get_examples_from_dir(COMPLEX_EXAMPLES_DIR)
79
 
80
  # =========================
81
- # UI Helpers
82
  # =========================
83
- def render_uploaded_image_div(file_path: str) -> str:
84
- data_url = image_to_base64_data_url(file_path)
 
 
 
 
 
 
 
 
 
 
 
85
  return f"""
86
  <div class="uploaded-image">
87
- <img src="{data_url}" alt="Uploaded image" style="width:100%;height:100%;object-fit:contain;"/>
88
  </div>
89
  """
90
 
91
- def update_preview_visibility(file_path: Optional[str]) -> Dict:
92
- if file_path:
93
- html_content = render_uploaded_image_div(file_path)
94
  return gr.update(value=html_content, visible=True)
95
  else:
96
  return gr.update(value="", visible=False)
97
 
98
- def _on_gallery_select(example_paths: List[str], evt: gr.SelectData):
99
- try:
100
- idx = evt.index
101
- return example_paths[idx]
102
- except Exception:
103
- return None
104
-
105
  # =========================
106
- # API Call Logic
107
  # =========================
108
- def _file_to_b64_image_only(file_path: str) -> Tuple[str, int]:
109
- if not file_path: raise ValueError("Please upload an image first.")
110
- ext = os.path.splitext(file_path)[1].lower()
111
- if ext not in {".png", ".jpg", ".jpeg", ".bmp", ".webp"}: raise ValueError("Only image files are supported.")
112
- with open(file_path, "rb") as f:
113
- return base64.b64encode(f.read()).decode("utf-8"), 1
114
-
115
- def _call_api(api_url: str, file_path: str, use_layout_detection: bool,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  prompt_label: Optional[str], use_chart_recognition: bool = False) -> Dict[str, Any]:
117
- b64, file_type = _file_to_b64_image_only(file_path)
118
  payload = {
119
  "file": b64,
120
  "useLayoutDetection": bool(use_layout_detection),
@@ -133,13 +177,11 @@ def _call_api(api_url: str, file_path: str, use_layout_detection: bool,
133
  start_time = time.time()
134
  resp = requests.post(api_url, json=payload, headers=JSON_HEADERS, timeout=600)
135
  end_time = time.time()
136
- duration = end_time - start_time
137
- print(f"Received API response in {duration:.2f} seconds.")
138
-
139
  resp.raise_for_status()
140
  data = resp.json()
141
  except requests.exceptions.RequestException as e:
142
- raise gr.Error(f"API request failed:{e}")
143
  except json.JSONDecodeError:
144
  raise gr.Error(f"Invalid JSON response from server:\n{getattr(resp, 'text', '')}")
145
 
@@ -147,68 +189,62 @@ def _call_api(api_url: str, file_path: str, use_layout_detection: bool,
147
  raise gr.Error("API returned an error:")
148
  return data
149
 
150
-
151
  def _process_api_response_page(result: Dict[str, Any]) -> Tuple[str, str, str]:
152
  """
153
- Processes the API response.
154
- 1. Replaces markdown image placeholders with their direct URLs.
155
- 2. Constructs an HTML <img> tag string for the visualization image URL.
156
  """
157
  layout_results = (result or {}).get("layoutParsingResults", [])
158
  if not layout_results:
159
  return "No content was recognized.", "<p>No visualization available.</p>", ""
160
 
161
  page0 = layout_results[0] or {}
162
-
163
- # Step 1: Process Markdown content (unchanged from previous optimization)
164
  md_data = page0.get("markdown") or {}
165
  md_text = md_data.get("text", "") or ""
166
  md_images_map = md_data.get("images", {})
 
167
  if md_images_map:
168
  for placeholder_path, image_url in md_images_map.items():
169
  md_text = md_text.replace(f'src="{placeholder_path}"', f'src="{image_url}"') \
170
  .replace(f']({placeholder_path})', f']({image_url})')
171
 
172
- # 【核心改动点】 Step 2: Process Visualization images by creating an HTML string
173
  output_html = "<p style='text-align:center; color:#888;'>No visualization image available.</p>"
174
  out_imgs = page0.get("outputImages") or {}
175
-
176
- # Get all image URLs and sort them
177
  sorted_urls = [img_url for _, img_url in sorted(out_imgs.items()) if img_url]
178
 
179
- # Logic to select the final visualization image URL
180
  output_image_url: Optional[str] = None
181
  if len(sorted_urls) >= 2:
182
  output_image_url = sorted_urls[1]
183
  elif sorted_urls:
184
  output_image_url = sorted_urls[0]
185
 
186
- # If a URL was found, create the <img> tag
187
  if output_image_url:
188
  print(f"Found visualization image URL: {output_image_url}")
189
- # The CSS will style this `img` tag because of the `#vis_image_doc img` selector
190
- output_html = f'<img src="{output_image_url}" alt="Detection Visualization">'
191
- else:
192
- print("Warning: No visualization image URL found in the API response.")
193
 
194
  md_text = _escape_inequalities_in_math(md_text)
195
  return md_text or "(Empty result)", output_html, md_text
196
 
197
- # =========================
198
- # Handlers
199
- # =========================
200
- def handle_complex_doc(file_path: str, use_chart_recognition: bool) -> Tuple[str, str, str]:
201
- if not file_path: raise gr.Error("Please upload an image first.")
202
- data = _call_api(DEFAULT_API_URL, file_path, use_layout_detection=True, prompt_label=None, use_chart_recognition=use_chart_recognition)
203
  result = data.get("result", {})
204
- # Note the return types now align with the new function signature
205
  return _process_api_response_page(result)
206
 
207
- def handle_targeted_recognition(file_path: str, prompt_choice: str) -> Tuple[str, str]:
208
- if not file_path: raise gr.Error("Please upload an image first.")
209
- mapping = {"Text Recognition": "ocr", "Formula Recognition": "formula", "Table Recognition": "table", "Chart Recognition": "chart"}
 
 
 
 
 
 
210
  label = mapping.get(prompt_choice, "ocr")
211
- data = _call_api(DEFAULT_API_URL, file_path, use_layout_detection=False, prompt_label=label)
212
  result = data.get("result", {})
213
  md_preview, _, md_raw = _process_api_response_page(result)
214
  return md_preview, md_raw
@@ -217,11 +253,7 @@ def handle_targeted_recognition(file_path: str, prompt_choice: str) -> Tuple[str
217
  # CSS & UI
218
  # =========================
219
  custom_css = """
220
- /* 全局字体 */
221
- body, .gradio-container {
222
- font-family: "Noto Sans SC", "Microsoft YaHei", "PingFang SC", sans-serif;
223
- }
224
- /* ... (rest of the CSS is unchanged) ... */
225
  .app-header { text-align: center; max-width: 900px; margin: 0 auto 8px !important; }
226
  .gradio-container { padding: 4px 0 !important; }
227
  .gradio-container [data-testid="tabs"], .gradio-container .tabs { margin-top: 0 !important; }
@@ -243,12 +275,13 @@ body, .gradio-container {
243
  """
244
 
245
  with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
 
246
  logo_data_url = image_to_base64_data_url(LOGO_IMAGE_PATH) if os.path.exists(LOGO_IMAGE_PATH) else ""
247
  gr.HTML(f"""<div class="app-header"><img src="{logo_data_url}" alt="App Logo" style="max-height:10%; width: auto; margin: 10px auto; display: block;"></div>""")
248
- gr.HTML("""<div class="notice"><strong>Heads up:</strong> The Hugging Face demo can be slow at times. For a faster experience, please try <a href="https://aistudio.baidu.com/application/detail/98365" target="_blank" rel="noopener noreferrer">Baidu AI Studio</a> or <a href="https://modelscope.cn/studios/PaddlePaddle/PaddleOCR-VL_Online_Demo/summary" target="_blank" rel="noopener noreferrer">ModelScope</a>.</div>""")
249
- gr.HTML("""<div class="quick-links"><a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">GitHub</a> | <a href="https://ernie.baidu.com/blog/publication/PaddleOCR-VL_Technical_Report.pdf" target="_blank">Technical Report</a> | <a href="https://huggingface.co/PaddlePaddle/PaddleOCR-VL" target="_blank">Model</a></div>""")
250
-
251
  with gr.Tabs():
 
252
  with gr.Tab("Document Parsing"):
253
  with gr.Row():
254
  with gr.Column(scale=5):
@@ -256,29 +289,69 @@ with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as
256
  preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
257
  gr.Markdown("_( Use this mode for recognizing full-page documents with structured layouts, such as reports, papers, or magazines.)_")
258
  gr.Markdown("💡 *To recognize a single, pre-cropped element (e.g., a table or formula), switch to the 'Element-level Recognition' tab for better results.*")
 
 
 
259
  with gr.Row(variant="panel"):
260
  chart_parsing_switch = gr.Checkbox(label="Enable chart parsing", value=False, scale=1)
261
  btn_parse = gr.Button("Parse Document", variant="primary", scale=2)
 
262
  if complex_document_examples:
263
- complex_paths = [e[0] for e in complex_document_examples]
264
  complex_state = gr.State(complex_paths)
265
- gr.Markdown("**Document Examples (Click an image to load)**")
266
- gallery_complex = gr.Gallery(value=complex_paths, columns=4, height=400, preview=False, label=None, allow_preview=False)
267
- gallery_complex.select(fn=_on_gallery_select, inputs=[complex_state], outputs=[file_doc])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  with gr.Column(scale=7):
270
  with gr.Tabs():
271
  with gr.Tab("Markdown Preview"):
272
  md_preview_doc = gr.Markdown("Please upload an image and click 'Parse Document'.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_doc")
273
  with gr.Tab("Visualization"):
274
- # 【核心改动点】: 将 gr.Image 替换为 gr.HTML
275
  vis_image_doc = gr.HTML(label="Detection Visualization", elem_id="vis_image_doc")
276
  with gr.Tab("Markdown Source"):
277
  md_raw_doc = gr.Code(label="Markdown Source Code", language="markdown")
278
 
279
- file_doc.change(fn=update_preview_visibility, inputs=[file_doc], outputs=[preview_doc_html])
280
- btn_parse.click(fn=handle_complex_doc, inputs=[file_doc, chart_parsing_switch], outputs=[md_preview_doc, vis_image_doc, md_raw_doc])
 
 
 
 
 
 
 
 
281
 
 
 
 
 
282
  with gr.Tab("Element-level Recognition"):
283
  with gr.Row():
284
  with gr.Column(scale=5):
@@ -286,18 +359,31 @@ with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as
286
  preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
287
  gr.Markdown("_(Best for images with a **simple, single-column layout** (e.g., pure text), or for a **pre-cropped single element** like a table, formula, or chart.)_")
288
  gr.Markdown("Choose a recognition type:")
 
289
  with gr.Row(elem_classes=["prompt-grid"]):
290
  btn_ocr = gr.Button("Text Recognition", variant="secondary")
291
- btn_formula = gr.Button("Formula Recognition", "secondary")
292
  with gr.Row(elem_classes=["prompt-grid"]):
293
  btn_table = gr.Button("Table Recognition", variant="secondary")
294
  btn_chart = gr.Button("Chart Recognition", variant="secondary")
 
 
 
295
  if targeted_recognition_examples:
296
  targeted_paths = [e[0] for e in targeted_recognition_examples]
297
  targeted_state = gr.State(targeted_paths)
298
  gr.Markdown("**Element-level Recognition Examples (Click an image to load)**")
299
  gallery_targeted = gr.Gallery(value=targeted_paths, columns=4, height=400, preview=False, label=None, allow_preview=False)
300
- gallery_targeted.select(fn=_on_gallery_select, inputs=[targeted_state], outputs=[file_vl])
 
 
 
 
 
 
 
 
 
301
 
302
  with gr.Column(scale=7):
303
  with gr.Tabs():
@@ -306,12 +392,22 @@ with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as
306
  with gr.Tab("Raw Output"):
307
  md_raw_vl = gr.Code(label="Raw Output", language="markdown")
308
 
309
- file_vl.change(fn=update_preview_visibility, inputs=[file_vl], outputs=[preview_vl_html])
310
- btn_ocr.click(fn=handle_targeted_recognition, inputs=[file_vl, gr.State("Text Recognition")], outputs=[md_preview_vl, md_raw_vl])
311
- btn_formula.click(fn=handle_targeted_recognition, inputs=[file_vl, gr.State("Formula Recognition")], outputs=[md_preview_vl, md_raw_vl])
312
- btn_table.click(fn=handle_targeted_recognition, inputs=[file_vl, gr.State("Table Recognition")], outputs=[md_preview_vl, md_raw_vl])
313
- btn_chart.click(fn=handle_targeted_recognition, inputs=[file_vl, gr.State("Chart Recognition")], outputs=[md_preview_vl, md_raw_vl])
 
 
 
 
 
 
 
 
 
 
314
 
315
  if __name__ == "__main__":
316
  port = int(os.getenv("PORT", "7860"))
317
- demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port,share=False)
 
8
  from PIL import Image
9
  import gradio as gr
10
  import re
11
+ import tempfile
12
+ from urllib.parse import urlparse
13
 
14
  # =========================
15
  # Config
16
  # =========================
17
  DEFAULT_API_URL = os.environ.get("API_URL")
18
  TOKEN = os.environ.get("TOKEN")
19
+ LOGO_IMAGE_PATH = "./assets/logo.jpg"
20
  GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@400;700&display=swap' rel='stylesheet'>"
21
  LATEX_DELIMS = [
22
  {"left": "$$", "right": "$$", "display": True},
 
24
  {"left": "\\(", "right": "\\)", "display": False},
25
  {"left": "\\[", "right": "\\]", "display": True},
26
  ]
27
# Send the bearer token only when TOKEN is set; otherwise no auth header at all.
AUTH_HEADER = {"Authorization": f"bearer {TOKEN}"} if TOKEN else {}
# Unpacking an empty AUTH_HEADER adds nothing, so one expression covers both cases.
JSON_HEADERS = {**AUTH_HEADER, "Content-Type": "application/json"}
 
29
 
30
  # =========================
31
+ # Base64 & Examples (URL直链渲染)
32
  # =========================
33
  def image_to_base64_data_url(filepath: str) -> str:
34
+ """仅用于本地上传预览的兼容方案;URL 预览不会用到它。"""
35
  try:
36
  ext = os.path.splitext(filepath)[1].lower()
37
+ mime_types = {".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png", ".gif": "image/gif", ".webp": "image/webp", ".bmp": "image/bmp"}
38
+ mime_type = mime_types.get(ext, "image/jpeg")
39
  with open(filepath, "rb") as image_file:
40
  encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
41
  return f"data:{mime_type};base64,{encoded_string}"
 
44
  return ""
45
 
46
  def _escape_inequalities_in_math(md: str) -> str:
47
+ """把数学块中的 < > 替换为 \\lt \\gt,避免被 Markdown 误解析。"""
 
 
 
48
  _MATH_PATTERNS = [
49
  re.compile(r"\$\$([\s\S]+?)\$\$"),
50
  re.compile(r"\$([^\$]+?)\$"),
51
  re.compile(r"\\\[([\s\S]+?)\\\]"),
52
  re.compile(r"\\\(([\s\S]+?)\\\)"),
53
  ]
54
+
55
  def fix(s: str) -> str:
56
  s = s.replace("<=", r" \le ").replace(">=", r" \ge ")
57
  s = s.replace("≤", r" \le ").replace("≥", r" \ge ")
58
+ s = s.replace("<", r" \lt ").replace(">", r" \gt ")
59
  return s
60
+
61
  for pat in _MATH_PATTERNS:
62
  md = pat.sub(lambda m: m.group(0).replace(m.group(1), fix(m.group(1))), md)
63
  return md
64
 
65
  def _get_examples_from_dir(dir_path: str) -> List[List[str]]:
66
+ """
67
+ 从本地目录读取文件名,拼出远程直链 URL(不下载、不转码),用于 <img src="URL"> 直接渲染。
68
+ 你原来使用的 BOS 基础路径保留。
69
+ """
70
+ BASE_URL = "https://paddle-model-ecology.bj.bcebos.com/PPOCRVL/dataset/examples"
71
  supported_exts = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
72
  examples = []
73
+ if not os.path.exists(dir_path):
74
+ print(f"Warning: example dir {dir_path} not found.")
75
+ return []
76
  for filename in sorted(os.listdir(dir_path)):
77
+ ext = os.path.splitext(filename)[1].lower()
78
+ if ext in supported_exts:
79
+ subdir = os.path.basename(dir_path.rstrip("/"))
80
+ img_url = f"{BASE_URL}/{subdir}/{filename}"
81
+ examples.append([img_url])
82
  return examples
83
 
84
def _on_gallery_select(example_paths: List[str], evt: gr.SelectData):
    """Return the example URL for the gallery item that was clicked.

    The URL is returned as-is; nothing is downloaded to a temporary file.

    Args:
        example_paths: Example URLs in gallery order. An entry may itself be a
            list, in which case its first element is used.
        evt: Gradio selection event; ``evt.index`` identifies the clicked item.

    Returns:
        The selected URL, or ``None`` when the index cannot be resolved.
    """
    idx = evt.index
    # evt.index may arrive as a (row, col) pair or list rather than an int;
    # the document-tab gallery callback guards against the same shape.
    if isinstance(idx, (list, tuple)):
        idx = idx[0] if idx else None
    try:
        selected = example_paths[int(idx)]
    except (IndexError, TypeError, ValueError):
        # Unusable index: report "nothing selected" instead of crashing the UI.
        return None
    if isinstance(selected, list):
        selected = selected[0] if selected else None
    return selected
93
+
94
  TARGETED_EXAMPLES_DIR = "examples/targeted"
95
  COMPLEX_EXAMPLES_DIR = "examples/complex"
96
  targeted_recognition_examples = _get_examples_from_dir(TARGETED_EXAMPLES_DIR)
97
  complex_document_examples = _get_examples_from_dir(COMPLEX_EXAMPLES_DIR)
98
 
99
  # =========================
100
+ # UI Helpers(URL直链渲染)
101
  # =========================
102
def render_uploaded_image_div(path_or_url: str) -> str:
    """Return an HTML preview block for an image given as a URL or a local path.

    Args:
        path_or_url: An ``http(s)://`` URL, used directly as the ``<img>``
            source, or a local file path, which is embedded through
            ``image_to_base64_data_url``. Empty or ``None`` yields no markup.

    Returns:
        An HTML snippet wrapping the image in ``<div class="uploaded-image">``,
        or ``""`` when there is nothing to preview.
    """
    import html  # local import: only needed for attribute escaping

    if not path_or_url:
        return ""
    is_url = isinstance(path_or_url, str) and path_or_url.startswith(("http://", "https://"))
    src = path_or_url if is_url else image_to_base64_data_url(path_or_url)
    # Escape before interpolating: a quote or angle bracket in the source must
    # not be able to close the src attribute and inject markup.
    safe_src = html.escape(src, quote=True)
    return f"""
    <div class="uploaded-image">
        <img src="{safe_src}" alt="Preview image" style="width:100%;height:100%;object-fit:contain;" loading="lazy"/>
    </div>
    """
120
 
121
def update_preview_visibility(path_or_url: Optional[str]) -> Dict:
    """Build the Gradio update that shows or hides the image preview.

    Args:
        path_or_url: Image URL or local path to preview; falsy means "nothing
            selected".

    Returns:
        A ``gr.update`` carrying the preview HTML with ``visible=True`` when an
        image is given, otherwise an empty value with ``visible=False``.
    """
    if not path_or_url:
        return gr.update(value="", visible=False)
    preview_html = render_uploaded_image_div(path_or_url)
    return gr.update(value=preview_html, visible=True)
127
 
 
 
 
 
 
 
 
128
  # =========================
129
+ # API 调用逻辑(支持URL或本地文件)
130
  # =========================
131
def _file_to_b64_image_only(path_or_url: str) -> Tuple[str, int]:
    """Load an image from a URL or local path and return it Base64-encoded.

    Args:
        path_or_url: An ``http(s)://`` URL, downloaded with ``requests``, or a
            local file path, read from disk.

    Returns:
        ``(b64, file_type)``: the Base64 text of the image bytes and the
        constant ``1``, which marks the payload as an image.

    Raises:
        ValueError: If no input is given or the image contains no bytes.
        requests.exceptions.RequestException: If the download fails or the
            server answers with an error status.
        OSError: If the local file cannot be read.
    """
    if not path_or_url:
        raise ValueError("Please upload an image first.")

    is_url = isinstance(path_or_url, str) and path_or_url.startswith(("http://", "https://"))
    if is_url:
        r = requests.get(path_or_url, timeout=600)
        r.raise_for_status()
        content = r.content
        # Take the extension from the URL path so a query string is ignored.
        ext = os.path.splitext(urlparse(path_or_url).path)[1].lower()
    else:
        ext = os.path.splitext(path_or_url)[1].lower()
        with open(path_or_url, "rb") as f:
            content = f.read()

    # An empty payload can never be recognized; fail here with a clear message
    # instead of sending an empty "file" field to the backend.
    if not content:
        raise ValueError("The selected image is empty.")

    # The extension is advisory only: URLs may have none, so an unknown one
    # produces a warning rather than a rejection.
    supported = {".png", ".jpg", ".jpeg", ".bmp", ".webp"}
    if ext and (ext not in supported):
        print(f"Warning: file extension {ext} not in supported set {supported}, continue anyway.")

    return base64.b64encode(content).decode("utf-8"), 1  # 1 = image type
158
+
159
+ def _call_api(api_url: str, path_or_url: str, use_layout_detection: bool,
160
  prompt_label: Optional[str], use_chart_recognition: bool = False) -> Dict[str, Any]:
161
+ b64, file_type = _file_to_b64_image_only(path_or_url)
162
  payload = {
163
  "file": b64,
164
  "useLayoutDetection": bool(use_layout_detection),
 
177
  start_time = time.time()
178
  resp = requests.post(api_url, json=payload, headers=JSON_HEADERS, timeout=600)
179
  end_time = time.time()
180
+ print(f"Received API response in {end_time - start_time:.2f} seconds.")
 
 
181
  resp.raise_for_status()
182
  data = resp.json()
183
  except requests.exceptions.RequestException as e:
184
+ raise gr.Error(f"API request failed: {e}")
185
  except json.JSONDecodeError:
186
  raise gr.Error(f"Invalid JSON response from server:\n{getattr(resp, 'text', '')}")
187
 
 
189
  raise gr.Error("API returned an error:")
190
  return data
191
 
 
192
  def _process_api_response_page(result: Dict[str, Any]) -> Tuple[str, str, str]:
193
  """
194
+ 处理后端返回结果:
195
+ 1) markdown 里的占位图路径替换为真实URL
196
+ 2) 构造一个可视化<img>(如果有)
197
  """
198
  layout_results = (result or {}).get("layoutParsingResults", [])
199
  if not layout_results:
200
  return "No content was recognized.", "<p>No visualization available.</p>", ""
201
 
202
  page0 = layout_results[0] or {}
 
 
203
  md_data = page0.get("markdown") or {}
204
  md_text = md_data.get("text", "") or ""
205
  md_images_map = md_data.get("images", {})
206
+
207
  if md_images_map:
208
  for placeholder_path, image_url in md_images_map.items():
209
  md_text = md_text.replace(f'src="{placeholder_path}"', f'src="{image_url}"') \
210
  .replace(f']({placeholder_path})', f']({image_url})')
211
 
 
212
  output_html = "<p style='text-align:center; color:#888;'>No visualization image available.</p>"
213
  out_imgs = page0.get("outputImages") or {}
 
 
214
  sorted_urls = [img_url for _, img_url in sorted(out_imgs.items()) if img_url]
215
 
 
216
  output_image_url: Optional[str] = None
217
  if len(sorted_urls) >= 2:
218
  output_image_url = sorted_urls[1]
219
  elif sorted_urls:
220
  output_image_url = sorted_urls[0]
221
 
 
222
  if output_image_url:
223
  print(f"Found visualization image URL: {output_image_url}")
224
+ output_html = f'<img src="{output_image_url}" alt="Detection Visualization" loading="lazy">'
 
 
 
225
 
226
  md_text = _escape_inequalities_in_math(md_text)
227
  return md_text or "(Empty result)", output_html, md_text
228
 
229
def handle_complex_doc(path_or_url: str, use_chart_recognition: bool) -> Tuple[str, str, str]:
    """Parse a full document image with layout detection enabled.

    Args:
        path_or_url: Image URL or local file path to parse.
        use_chart_recognition: Forwarded to the backend call unchanged.

    Returns:
        The ``(markdown_preview, visualization_html, markdown_source)`` triple
        produced by ``_process_api_response_page``.

    Raises:
        gr.Error: If no image was provided.
    """
    if not path_or_url:
        raise gr.Error("Please upload an image first.")
    response = _call_api(
        DEFAULT_API_URL,
        path_or_url,
        use_layout_detection=True,
        prompt_label=None,
        use_chart_recognition=use_chart_recognition,
    )
    return _process_api_response_page(response.get("result", {}))
236
 
237
def handle_targeted_recognition(path_or_url: str, prompt_choice: str) -> Tuple[str, str]:
    """Recognize a single element (text, formula, table or chart) in an image.

    Args:
        path_or_url: Image URL or local file path to recognize.
        prompt_choice: Button label naming the recognition type.

    Returns:
        ``(markdown_preview, markdown_source)``; the visualization HTML from
        the processed response is discarded.

    Raises:
        gr.Error: If no image was provided.
    """
    if not path_or_url:
        raise gr.Error("Please upload an image first.")

    if prompt_choice == "Formula Recognition":
        label = "formula"
    elif prompt_choice == "Table Recognition":
        label = "table"
    elif prompt_choice == "Chart Recognition":
        label = "chart"
    else:
        # "Text Recognition" and any unrecognized choice both use plain OCR.
        label = "ocr"

    response = _call_api(DEFAULT_API_URL, path_or_url, use_layout_detection=False, prompt_label=label)
    page = _process_api_response_page(response.get("result", {}))
    return page[0], page[2]
 
253
  # CSS & UI
254
  # =========================
255
  custom_css = """
256
+ body, .gradio-container { font-family: "Noto Sans SC", "Microsoft YaHei", "PingFang SC", sans-serif; }
 
 
 
 
257
  .app-header { text-align: center; max-width: 900px; margin: 0 auto 8px !important; }
258
  .gradio-container { padding: 4px 0 !important; }
259
  .gradio-container [data-testid="tabs"], .gradio-container .tabs { margin-top: 0 !important; }
 
275
  """
276
 
277
  with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
278
+ # 顶部 Logo
279
  logo_data_url = image_to_base64_data_url(LOGO_IMAGE_PATH) if os.path.exists(LOGO_IMAGE_PATH) else ""
280
  gr.HTML(f"""<div class="app-header"><img src="{logo_data_url}" alt="App Logo" style="max-height:10%; width: auto; margin: 10px auto; display: block;"></div>""")
281
+ gr.HTML("""<div class="quick-links"><a href="https://github.com/PaddlePaddle/PaddleOCR" target="_blank">GitHub</a> | <a href="https://ernie.baidu.com/blog/publication/PaddleOCR-VL_Technical_Report.pdf" target="_blank">Technical Report</a> | <a href="https://www.modelscope.cn/models/PaddlePaddle/PaddleOCR-VL" target="_blank">Model</a></div>""")
282
+
 
283
  with gr.Tabs():
284
+ # ===================== Document Parsing =====================
285
  with gr.Tab("Document Parsing"):
286
  with gr.Row():
287
  with gr.Column(scale=5):
 
289
  preview_doc_html = gr.HTML(value="", elem_id="image_preview_doc", visible=False)
290
  gr.Markdown("_( Use this mode for recognizing full-page documents with structured layouts, such as reports, papers, or magazines.)_")
291
  gr.Markdown("💡 *To recognize a single, pre-cropped element (e.g., a table or formula), switch to the 'Element-level Recognition' tab for better results.*")
292
+
293
+ example_url_doc = gr.State(value=None)
294
+
295
  with gr.Row(variant="panel"):
296
  chart_parsing_switch = gr.Checkbox(label="Enable chart parsing", value=False, scale=1)
297
  btn_parse = gr.Button("Parse Document", variant="primary", scale=2)
298
+
299
  if complex_document_examples:
300
+ complex_paths = [e[0] for e in complex_document_examples] # 这里是 List[str]
301
  complex_state = gr.State(complex_paths)
302
+
303
+ gallery_complex = gr.Gallery(
304
+ value=complex_paths, columns=4, height=400,
305
+ preview=False, label=None, allow_preview=False
306
+ )
307
+
308
+ # 2) 回调:用 evt.index 到 paths(State)里取 URL
309
def on_gallery_select_for_doc(paths, evt: gr.SelectData):
    """Store the clicked example URL and show its preview.

    Args:
        paths: Example URLs in gallery order (the value of the State input).
        evt: Gradio selection event; ``evt.index`` identifies the clicked item.

    Returns:
        ``(url, preview_update)`` for the example-URL state and the preview
        HTML component.

    Raises:
        gr.Error: If ``evt.index`` cannot be resolved to an entry of ``paths``.
    """
    idx = evt.index
    try:
        # evt.index may arrive as a (row, col) pair or list instead of an int;
        # use its first component. Kept inside the try so an empty sequence is
        # reported like any other invalid index.
        if isinstance(idx, (list, tuple)):
            idx = idx[0]
        url = paths[int(idx)]
    except (IndexError, TypeError, ValueError) as err:
        raise gr.Error(f"Invalid index from gallery: {evt.index}") from err

    # Update the state and the preview together.
    return url, update_preview_visibility(url)
322
+
323
+ # 3) 绑定:把 State 作为 inputs 传给回调,outputs 写入 example_url_doc 和预览 HTML
324
+ gallery_complex.select(
325
+ fn=on_gallery_select_for_doc,
326
+ inputs=[complex_state],
327
+ outputs=[example_url_doc, preview_doc_html],
328
+ )
329
+
330
 
331
  with gr.Column(scale=7):
332
  with gr.Tabs():
333
  with gr.Tab("Markdown Preview"):
334
  md_preview_doc = gr.Markdown("Please upload an image and click 'Parse Document'.", latex_delimiters=LATEX_DELIMS, elem_id="md_preview_doc")
335
  with gr.Tab("Visualization"):
 
336
  vis_image_doc = gr.HTML(label="Detection Visualization", elem_id="vis_image_doc")
337
  with gr.Tab("Markdown Source"):
338
  md_raw_doc = gr.Code(label="Markdown Source Code", language="markdown")
339
 
340
+ def on_file_doc_change(fp):
341
+ return None, update_preview_visibility(fp)
342
+
343
+ file_doc.change(fn=on_file_doc_change, inputs=[file_doc], outputs=[example_url_doc, preview_doc_html])
344
+
345
def parse_doc_router(fp, example_url, use_chart):
    """Parse whichever image the user chose most recently.

    The file-change handler resets the example URL to ``None`` on every
    upload, so a non-empty ``example_url`` means an example was picked after
    the last upload. It must therefore win over ``fp``; otherwise the preview
    shows the example while a stale upload is parsed.

    Args:
        fp: Value of the file input, if any.
        example_url: URL of the selected example, or ``None``.
        use_chart: Whether chart parsing is enabled.

    Returns:
        The outputs of ``handle_complex_doc`` for the chosen image.

    Raises:
        gr.Error: If neither an upload nor an example is available.
    """
    src = example_url or fp
    if not src:
        raise gr.Error("Please upload an image or pick an example first.")
    return handle_complex_doc(src, use_chart)
350
 
351
+ btn_parse.click(fn=parse_doc_router, inputs=[file_doc, example_url_doc, chart_parsing_switch],
352
+ outputs=[md_preview_doc, vis_image_doc, md_raw_doc])
353
+
354
+ # ===================== Element-level Recognition =====================
355
  with gr.Tab("Element-level Recognition"):
356
  with gr.Row():
357
  with gr.Column(scale=5):
 
359
  preview_vl_html = gr.HTML(value="", elem_id="image_preview_vl", visible=False)
360
  gr.Markdown("_(Best for images with a **simple, single-column layout** (e.g., pure text), or for a **pre-cropped single element** like a table, formula, or chart.)_")
361
  gr.Markdown("Choose a recognition type:")
362
+
363
  with gr.Row(elem_classes=["prompt-grid"]):
364
  btn_ocr = gr.Button("Text Recognition", variant="secondary")
365
+ btn_formula = gr.Button("Formula Recognition", variant="secondary")
366
  with gr.Row(elem_classes=["prompt-grid"]):
367
  btn_table = gr.Button("Table Recognition", variant="secondary")
368
  btn_chart = gr.Button("Chart Recognition", variant="secondary")
369
+
370
+ example_url_vl = gr.State(value=None)
371
+
372
  if targeted_recognition_examples:
373
  targeted_paths = [e[0] for e in targeted_recognition_examples]
374
  targeted_state = gr.State(targeted_paths)
375
  gr.Markdown("**Element-level Recognition Examples (Click an image to load)**")
376
  gallery_targeted = gr.Gallery(value=targeted_paths, columns=4, height=400, preview=False, label=None, allow_preview=False)
377
+
378
def on_gallery_select_for_vl(paths, evt: gr.SelectData):
    """Store the clicked example URL and show its preview.

    The ``gr.SelectData`` annotation is required: Gradio injects the event
    payload only into parameters annotated with an event-data type. Without it
    ``evt`` counts as an ordinary input, and since the listener passes a single
    input component the callback would be called one argument short.

    Args:
        paths: Example URLs in gallery order (the value of the State input).
        evt: Gradio selection event for the clicked gallery item.

    Returns:
        ``(url, preview_update)`` for the example-URL state and the preview
        HTML component.
    """
    url = _on_gallery_select(paths, evt)
    return url, update_preview_visibility(url)
381
+
382
+ gallery_targeted.select(
383
+ fn=on_gallery_select_for_vl,
384
+ inputs=[targeted_state],
385
+ outputs=[example_url_vl, preview_vl_html]
386
+ )
387
 
388
  with gr.Column(scale=7):
389
  with gr.Tabs():
 
392
  with gr.Tab("Raw Output"):
393
  md_raw_vl = gr.Code(label="Raw Output", language="markdown")
394
 
395
+ def on_file_vl_change(fp):
396
+ return None, update_preview_visibility(fp)
397
+
398
+ file_vl.change(fn=on_file_vl_change, inputs=[file_vl], outputs=[example_url_vl, preview_vl_html])
399
+
400
def parse_vl_router(fp, example_url, prompt_choice):
    """Run element-level recognition on the image the user chose most recently.

    The file-change handler resets the example URL to ``None`` on every
    upload, so a non-empty ``example_url`` means an example was picked after
    the last upload. It must therefore win over ``fp``; otherwise the preview
    shows the example while a stale upload is recognized.

    Args:
        fp: Value of the file input, if any.
        example_url: URL of the selected example, or ``None``.
        prompt_choice: Button label naming the recognition type.

    Returns:
        The outputs of ``handle_targeted_recognition`` for the chosen image.

    Raises:
        gr.Error: If neither an upload nor an example is available.
    """
    src = example_url or fp
    if not src:
        raise gr.Error("Please upload an image or pick an example first.")
    return handle_targeted_recognition(src, prompt_choice)
405
+
406
+ btn_ocr.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Text Recognition")], outputs=[md_preview_vl, md_raw_vl])
407
+ btn_formula.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Formula Recognition")], outputs=[md_preview_vl, md_raw_vl])
408
+ btn_table.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Table Recognition")], outputs=[md_preview_vl, md_raw_vl])
409
+ btn_chart.click(fn=parse_vl_router, inputs=[file_vl, example_url_vl, gr.State("Chart Recognition")], outputs=[md_preview_vl, md_raw_vl])
410
 
411
  if __name__ == "__main__":
412
  port = int(os.getenv("PORT", "7860"))
413
+ demo.queue(max_size=64).launch(server_name="0.0.0.0", server_port=port, share=False)