devappsmi committed on
Commit
334822d
·
verified ·
1 Parent(s): 063decf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -173
app.py CHANGED
@@ -1,13 +1,7 @@
1
  """
2
  PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
  ====================================================
4
- Returns full JSON response matching the official Baidu API format, including:
5
- - layoutParsingResults[].prunedResult (blocks, labels, bboxes, polygon points)
6
- - layoutParsingResults[].markdown (text + images)
7
- - layoutParsingResults[].outputImages (visualization URLs)
8
- - layoutParsingResults[].inputImage
9
- - preprocessedImages
10
- - dataInfo
11
 
12
  Architecture:
13
  Gradio App → This Bridge (port 7860) → vLLM Docker (117.54.141.62:8000)
@@ -15,12 +9,13 @@ Architecture:
15
 
16
  import base64
17
  import json
 
18
  import os
19
  import shutil
20
  import tempfile
21
  import traceback
22
  import uuid
23
- from typing import Any, Dict, List, Optional
24
 
25
  import uvicorn
26
  from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
@@ -74,8 +69,8 @@ def get_pipeline():
74
  # =============================================================================
75
  app = FastAPI(
76
  title="PaddleOCR-VL-1.5 Bridge API",
77
- description="Full document parsing API matching official Baidu API format",
78
- version="1.0.0"
79
  )
80
 
81
  app.add_middleware(
@@ -99,7 +94,167 @@ def verify_auth(authorization: Optional[str] = None):
99
 
100
 
101
  # =============================================================================
102
- # Helpers
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # =============================================================================
104
  TASK_PROMPTS = {
105
  "ocr": "OCR:",
@@ -138,7 +293,6 @@ def save_temp_image(file_data: str) -> str:
138
 
139
 
140
  def serve_file(src_path: str, request_id: str, filename: str) -> str:
141
- """Copy a file to the static dir and return its public URL."""
142
  static_subdir = os.path.join(STATIC_DIR, request_id)
143
  os.makedirs(static_subdir, exist_ok=True)
144
  dst_path = os.path.join(static_subdir, filename)
@@ -147,7 +301,6 @@ def serve_file(src_path: str, request_id: str, filename: str) -> str:
147
 
148
 
149
  def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
150
- """Find all images in a directory and serve them. Returns {filename: url}."""
151
  result = {}
152
  if not os.path.exists(directory):
153
  return result
@@ -156,7 +309,6 @@ def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
156
  ext = os.path.splitext(fname)[1].lower()
157
  if ext in IMAGE_EXTENSIONS:
158
  src = os.path.join(root, fname)
159
- # Preserve subdirectory structure in the filename
160
  rel_path = os.path.relpath(src, directory)
161
  safe_name = rel_path.replace(os.sep, "_")
162
  url = serve_file(src, request_id, safe_name)
@@ -164,100 +316,107 @@ def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
164
  return result
165
 
166
 
167
- def extract_pruned_result(res_obj, page_index: int = 0) -> Dict[str, Any]:
 
 
 
 
168
  """
169
- Extract the full prunedResult from a PaddleOCR result object,
170
- matching the official Baidu API format.
171
  """
172
- pruned = {}
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
- try:
175
- # Try to get the raw dict/json from the result object
176
- if hasattr(res_obj, 'json'):
177
- raw = res_obj.json if isinstance(res_obj.json, dict) else {}
178
- elif hasattr(res_obj, '_result'):
179
- raw = res_obj._result if isinstance(res_obj._result, dict) else {}
180
- elif hasattr(res_obj, 'to_dict'):
181
- raw = res_obj.to_dict()
182
- else:
183
- raw = {}
184
 
185
- # Try multiple attribute paths to find the parsing results
186
- parsing_res_list = []
187
- layout_det_res = {"boxes": []}
188
 
189
- # Check common attribute names
190
- for attr in ['parsing_res_list', 'parsing_result', 'blocks']:
191
- if hasattr(res_obj, attr):
192
- parsing_res_list = getattr(res_obj, attr, [])
193
- break
194
 
195
- # Check for layout detection results
196
- for attr in ['layout_det_res', 'layout_result', 'det_res']:
197
- if hasattr(res_obj, attr):
198
- layout_det_res = getattr(res_obj, attr, {})
199
- break
200
 
201
- # Get image dimensions
202
- width = 0
203
- height = 0
204
- for attr in ['img_width', 'width']:
205
- if hasattr(res_obj, attr):
206
- width = getattr(res_obj, attr, 0)
207
- break
208
- for attr in ['img_height', 'height']:
209
- if hasattr(res_obj, attr):
210
- height = getattr(res_obj, attr, 0)
211
- break
212
-
213
- # If we got raw dict, try to extract from it
214
- if raw and not parsing_res_list:
215
- parsing_res_list = raw.get('parsing_res_list', raw.get('blocks', []))
216
- layout_det_res = raw.get('layout_det_res', {"boxes": []})
217
- width = raw.get('width', width)
218
- height = raw.get('height', height)
219
-
220
- pruned = {
221
- "page_count": 1,
222
- "width": width,
223
- "height": height,
224
- "model_settings": {
225
- "use_doc_preprocessor": False,
226
- "use_layout_detection": True,
227
- "use_chart_recognition": False,
228
- "use_seal_recognition": True,
229
- "use_ocr_for_image_block": False,
230
- "format_block_content": True,
231
- "merge_layout_blocks": True,
232
- "markdown_ignore_labels": [
233
- "number", "footnote", "header",
234
- "header_image", "footer", "footer_image", "aside_text"
235
- ],
236
- "return_layout_polygon_points": True
237
- },
238
- "parsing_res_list": parsing_res_list if isinstance(parsing_res_list, list) else [],
239
- "layout_det_res": layout_det_res if isinstance(layout_det_res, dict) else {"boxes": []}
240
- }
241
 
242
- except Exception as e:
243
- print(f"Warning: Could not extract prunedResult: {e}")
244
- traceback.print_exc()
245
- pruned = {
246
- "page_count": 1,
247
- "width": 0,
248
- "height": 0,
249
- "model_settings": {},
250
- "parsing_res_list": [],
251
- "layout_det_res": {"boxes": []}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  }
 
253
 
254
- return pruned
255
 
 
 
 
256
 
257
  def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
258
  use_doc_unwarping: bool = True,
259
- use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
260
- """Full document parsing β€” returns response matching official Baidu API format."""
 
 
 
 
 
261
  tmp_path = save_temp_image(file_data)
262
  request_id = str(uuid.uuid4())[:12]
263
 
@@ -296,27 +455,24 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
296
  with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
297
  md_text = f.read()
298
 
299
- # --- Read JSON (contains prunedResult data) ---
300
  json_data = {}
301
  json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
302
  if json_files:
303
  with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
304
  json_data = json.load(f)
305
 
306
- # --- Collect and serve all images ---
307
  all_images = collect_images_from_dir(output_dir, page_id)
308
 
309
- # --- Build outputImages ---
310
  output_images = {}
311
  for rel_path, url in all_images.items():
312
  name = os.path.splitext(os.path.basename(rel_path))[0]
313
- # Identify layout detection visualization
314
  if "layout" in name.lower() or "det" in name.lower() or "vis" in name.lower():
315
  output_images["layout_det_res"] = url
316
  else:
317
  output_images[name] = url
318
 
319
- # --- Build markdown images map ---
320
  md_images = {}
321
  imgs_dir = os.path.join(output_dir, "imgs")
322
  if os.path.exists(imgs_dir):
@@ -327,17 +483,14 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
327
  url = serve_file(src, page_id, fname)
328
  local_ref = f"imgs/{fname}"
329
  md_images[local_ref] = url
330
- # Replace references in markdown
331
  md_text = md_text.replace(f'src="{local_ref}"', f'src="{url}"')
332
  md_text = md_text.replace(f']({local_ref})', f']({url})')
333
 
334
- # --- Serve input image ---
335
  input_image_url = serve_file(tmp_path, page_id, f"input_img_{i}.jpg")
336
 
337
- # --- Build prunedResult from JSON data or result object ---
338
  pruned_result = {}
339
  if json_data:
340
- # Try to use the saved JSON directly
341
  pruned_result = {
342
  "page_count": json_data.get("page_count", 1),
343
  "width": json_data.get("width", img_width),
@@ -362,14 +515,47 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
362
  json_data.get("det_res", {"boxes": []}))
363
  }
364
  else:
365
- pruned_result = extract_pruned_result(res, i)
 
 
 
 
 
 
 
366
 
367
- # Ensure dimensions are set
368
  if not pruned_result.get("width"):
369
  pruned_result["width"] = img_width
370
  if not pruned_result.get("height"):
371
  pruned_result["height"] = img_height
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  # --- Build page result ---
374
  page_result = {
375
  "prunedResult": pruned_result,
@@ -378,9 +564,12 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
378
  "images": md_images
379
  },
380
  "outputImages": output_images,
381
- "inputImage": input_image_url
382
  }
383
 
 
 
 
384
  layout_parsing_results.append(page_result)
385
  preprocessed_images.append(input_image_url)
386
  data_info_pages.append({
@@ -393,11 +582,8 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
393
  "result": {
394
  "layoutParsingResults": layout_parsing_results if layout_parsing_results else [{
395
  "prunedResult": {
396
- "page_count": 0,
397
- "width": 0,
398
- "height": 0,
399
- "parsing_res_list": [],
400
- "layout_det_res": {"boxes": []}
401
  },
402
  "markdown": {"text": "", "images": {}},
403
  "outputImages": {},
@@ -417,58 +603,6 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
417
  os.unlink(tmp_path)
418
 
419
 
420
- def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
421
- """Element-level recognition via direct vLLM call."""
422
- if file_data.startswith(("http://", "https://")):
423
- image_url = file_data
424
- else:
425
- image_url = f"data:image/png;base64,{file_data}"
426
-
427
- task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
428
-
429
- response = openai_client.chat.completions.create(
430
- model=VLLM_MODEL_NAME,
431
- messages=[{
432
- "role": "user",
433
- "content": [
434
- {"type": "image_url", "image_url": {"url": image_url}},
435
- {"type": "text", "text": task_prompt}
436
- ]
437
- }],
438
- temperature=0.0
439
- )
440
-
441
- result_text = response.choices[0].message.content
442
-
443
- return {
444
- "errorCode": 0,
445
- "result": {
446
- "layoutParsingResults": [{
447
- "prunedResult": {
448
- "page_count": 1,
449
- "width": 0,
450
- "height": 0,
451
- "parsing_res_list": [{
452
- "block_label": prompt_label,
453
- "block_content": result_text,
454
- "block_bbox": [],
455
- "block_id": 0,
456
- "block_order": 0,
457
- "group_id": 0,
458
- "global_block_id": 0,
459
- "global_group_id": 0,
460
- "block_polygon_points": []
461
- }],
462
- "layout_det_res": {"boxes": []}
463
- },
464
- "markdown": {"text": result_text, "images": {}},
465
- "outputImages": {},
466
- "prunedResult.spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
467
- }]
468
- }
469
- }
470
-
471
-
472
  def _parse_spotting(text: str) -> dict:
473
  try:
474
  return json.loads(text)
@@ -485,6 +619,7 @@ async def root():
485
  return {
486
  "service": "PaddleOCR-VL-1.5 Bridge API",
487
  "status": "running",
 
488
  "endpoints": ["/health", "/api/ocr", "/api/parse", "/api/parse/markdown", "/v1/chat/completions", "/docs"]
489
  }
490
 
@@ -498,7 +633,7 @@ async def health():
498
  async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
499
  """
500
  Main OCR endpoint β€” compatible with the Gradio app.
501
- Returns full JSON matching official Baidu API format.
502
 
503
  Body:
504
  {
@@ -507,7 +642,34 @@ async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(N
507
  "promptLabel": "ocr|formula|table|chart|spotting|seal",
508
  "useChartRecognition": false,
509
  "useDocUnwarping": true,
510
- "useDocOrientationClassify": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  }
512
  """
513
  verify_auth(authorization)
@@ -526,10 +688,14 @@ async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(N
526
  use_chart = body.get("useChartRecognition", False)
527
  use_unwarp = body.get("useDocUnwarping", True)
528
  use_orient = body.get("useDocOrientationClassify", True)
 
529
 
530
  try:
531
  if use_layout:
532
- return full_document_parsing(file_data, use_chart, use_unwarp, use_orient)
 
 
 
533
  else:
534
  return element_level_recognition(file_data, prompt_label)
535
  except Exception as e:
@@ -542,16 +708,17 @@ async def parse_file(
542
  file: UploadFile = File(...),
543
  use_layout_detection: bool = True,
544
  prompt_label: str = "ocr",
 
545
  authorization: Optional[str] = Header(None)
546
  ):
547
- """File upload endpoint."""
548
  verify_auth(authorization)
549
  content = await file.read()
550
  b64 = base64.b64encode(content).decode("utf-8")
551
 
552
  try:
553
  if use_layout_detection:
554
- return full_document_parsing(b64)
555
  else:
556
  return element_level_recognition(b64, prompt_label)
557
  except Exception as e:
@@ -570,7 +737,7 @@ async def parse_to_markdown(
570
  b64 = base64.b64encode(content).decode("utf-8")
571
 
572
  try:
573
- result = full_document_parsing(b64)
574
  pages = result.get("result", {}).get("layoutParsingResults", [])
575
  markdown_parts = [p.get("markdown", {}).get("text", "") for p in pages if p.get("markdown", {}).get("text")]
576
  return {
@@ -585,7 +752,7 @@ async def parse_to_markdown(
585
 
586
  @app.post("/v1/chat/completions")
587
  async def proxy_chat_completions(request: Request, authorization: Optional[str] = Header(None)):
588
- """Proxy to vLLM for direct OpenAI-compatible calls."""
589
  verify_auth(authorization)
590
 
591
  import httpx
@@ -607,18 +774,18 @@ if __name__ == "__main__":
607
  print(f"""
608
  ╔══════════════════════════════════════════════════════════════╗
609
  β•‘ PaddleOCR-VL-1.5 Bridge Server (HF Spaces) β•‘
 
610
  ╠══════════════════════════════════════════════════════════════╣
611
  β•‘ Bridge API: http://0.0.0.0:{BRIDGE_PORT} β•‘
612
  β•‘ vLLM backend: {VLLM_SERVER_URL:<44s}β•‘
613
  β•‘ Model: {VLLM_MODEL_NAME:<44s}β•‘
614
  β•‘ Auth: {"ENABLED" if API_KEY else "DISABLED":<44s}β•‘
615
- β•‘ Static URL: {PUBLIC_BASE_URL:<44s}β•‘
616
  ╠══════════════════════════════════════════════════════════════╣
617
  β•‘ Endpoints: β•‘
618
  β•‘ GET /health - Health check β•‘
619
  β•‘ GET /docs - Swagger UI β•‘
620
- β•‘ POST /api/ocr - Gradio-compatible API β•‘
621
- β•‘ POST /api/parse - File upload API β•‘
622
  β•‘ POST /api/parse/markdown - Simple markdown output β•‘
623
  β•‘ POST /v1/chat/completions - vLLM proxy (OpenAI format) β•‘
624
  β•‘ GET /static/... - Output images β•‘
 
1
  """
2
  PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
3
  ====================================================
4
+ With per-token and per-word confidence scores via vLLM logprobs.
 
 
 
 
 
 
5
 
6
  Architecture:
7
  Gradio App → This Bridge (port 7860) → vLLM Docker (117.54.141.62:8000)
 
9
 
10
  import base64
11
  import json
12
+ import math
13
  import os
14
  import shutil
15
  import tempfile
16
  import traceback
17
  import uuid
18
+ from typing import Any, Dict, List, Optional, Tuple
19
 
20
  import uvicorn
21
  from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
 
69
  # =============================================================================
70
  app = FastAPI(
71
  title="PaddleOCR-VL-1.5 Bridge API",
72
+ description="Full document parsing API with per-token/word confidence scores",
73
+ version="1.1.0"
74
  )
75
 
76
  app.add_middleware(
 
94
 
95
 
96
  # =============================================================================
97
+ # Confidence Score Helpers
98
+ # =============================================================================
99
+
100
def _token_confidence_entry(token_str: Any, logprob_val: Optional[float]) -> Dict[str, Any]:
    """Build one ``{token, logprob, confidence}`` record.

    ``confidence`` is ``exp(logprob)``. A missing logprob (``None``) is
    recorded as confidence ``0.0`` with a ``-inf`` logprob sentinel.
    """
    if logprob_val is None:
        confidence = 0.0
        # NOTE(review): -inf is not representable in strict JSON; confirm the
        # response layer can serialize it before relying on this sentinel.
        logprob_val = float('-inf')
    else:
        confidence = math.exp(logprob_val)  # convert log prob to probability

    return {
        "token": token_str,
        "logprob": round(logprob_val, 6),
        "confidence": round(confidence, 6),
    }


def parse_logprobs(response) -> List[Dict[str, Any]]:
    """
    Extract per-token confidence from a chat-completion response's logprobs.

    Two layouts of ``response.choices[0].logprobs`` are understood:

    * a ``content`` attribute holding a list of objects with ``token`` and
      ``logprob`` attributes;
    * otherwise, parallel ``tokens`` / ``token_logprobs`` attributes.

    Returns a list of ``{token, logprob, confidence}`` dicts, one per token.
    The list is empty when logprobs are absent. Parsing is best-effort: any
    exception is printed (with traceback) and whatever was collected so far
    is returned instead of raising.
    """
    token_details: List[Dict[str, Any]] = []

    try:
        logprobs_data = response.choices[0].logprobs
        if logprobs_data is None:
            return token_details

        content_logprobs = getattr(logprobs_data, 'content', None)
        if content_logprobs:
            # Per-token objects carrying `token` / `logprob` attributes.
            pairs = (
                (getattr(info, 'token', ''), getattr(info, 'logprob', None))
                for info in content_logprobs
            )
        else:
            # Parallel-list layout: `tokens` alongside `token_logprobs`.
            tokens = getattr(logprobs_data, 'tokens', None)
            token_logprobs = getattr(logprobs_data, 'token_logprobs', None)
            pairs = zip(tokens, token_logprobs) if tokens and token_logprobs else ()

        for token_str, logprob_val in pairs:
            token_details.append(_token_confidence_entry(token_str, logprob_val))

    except Exception as e:
        # Deliberate best-effort: confidence data is optional, so a parsing
        # problem must not fail the OCR request itself.
        print(f"Warning: Could not parse logprobs: {e}")
        traceback.print_exc()

    return token_details
158
+
159
+
160
# Characters tokenizers use to mark "a space precedes this piece":
# U+2581 is the sentencepiece marker, U+0120 the GPT-2 byte-level marker.
# Written as escapes so the literals survive any re-encoding of this file.
_SPACE_MARKERS = "\u2581\u0120"


def _is_separator_token(token: str) -> bool:
    """Return True when the token has no visible text (whitespace/markers only)."""
    return token.strip().lstrip(_SPACE_MARKERS + " ") == ""


def tokens_to_words(token_details: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Group tokens into words.

    A new word starts when a token begins with whitespace or with a
    tokenizer space marker. Tokens that contain only whitespace/markers act
    purely as separators: they close the current word and are not counted in
    any word, so no empty-string words are produced and separator logprobs do
    not skew the confidence of the following word.

    Returns a list of ``{word, confidence, avg_logprob, token_count, tokens}``
    dicts in reading order.

    Word confidence = geometric mean of token probabilities
                    = exp(mean of logprobs)
    """
    if not token_details:
        return []

    words: List[Dict[str, Any]] = []
    current_word_tokens: List[Dict[str, Any]] = []

    for td in token_details:
        token = td["token"]

        if _is_separator_token(token):
            # Pure separator: finish the pending word and drop the token.
            if current_word_tokens:
                words.append(_finalize_word(current_word_tokens))
                current_word_tokens = []
            continue

        # Non-empty here, so token[0] is safe.
        starts_new_word = token[0].isspace() or token[0] in _SPACE_MARKERS
        if starts_new_word and current_word_tokens:
            words.append(_finalize_word(current_word_tokens))
            current_word_tokens = []

        current_word_tokens.append(td)

    # Flush the trailing word.
    if current_word_tokens:
        words.append(_finalize_word(current_word_tokens))

    return words


def _finalize_word(tokens: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Compute word-level confidence from its constituent tokens.

    The word text is the concatenation of the token strings with surrounding
    whitespace and leading space markers removed. Tokens whose logprob is
    missing (``None``) or non-finite are left out of the average; if none
    remain the word gets confidence ``0.0`` and ``avg_logprob`` ``None``.
    """
    word_text = "".join(t["token"] for t in tokens).strip()
    # Remove sentencepiece / GPT-2 space markers left at the front.
    word_text = word_text.lstrip(_SPACE_MARKERS + " ")

    # Geometric mean of probabilities = exp(mean of logprobs)
    valid_logprobs = [
        t["logprob"] for t in tokens
        if t["logprob"] is not None and math.isfinite(t["logprob"])
    ]
    if valid_logprobs:
        avg_logprob = sum(valid_logprobs) / len(valid_logprobs)
        word_confidence = math.exp(avg_logprob)
        avg_logprob_out = round(avg_logprob, 6)
    else:
        word_confidence = 0.0
        avg_logprob_out = None

    return {
        "word": word_text,
        "confidence": round(word_confidence, 6),
        "avg_logprob": avg_logprob_out,
        "token_count": len(tokens),
        "tokens": [
            {"token": t["token"], "confidence": t["confidence"]}
            for t in tokens
        ]
    }
226
+
227
+
228
def compute_overall_confidence(token_details: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Compute overall text confidence statistics.

    Args:
        token_details: per-token dicts carrying ``confidence`` and ``logprob``.

    Returns:
        A dict with ``mean_confidence``, ``min_confidence``,
        ``max_confidence``, ``perplexity`` and ``total_tokens``. The same keys
        are present for empty input (zeros, with ``perplexity`` ``None``).
        ``perplexity`` is ``exp(-mean(logprobs))`` over tokens with a finite
        logprob (lower is more confident); it is ``None`` when no such token
        exists or when the value is too large to represent as a float.
    """
    if not token_details:
        return {
            "mean_confidence": 0.0,
            "min_confidence": 0.0,
            "max_confidence": 0.0,
            "perplexity": None,
            "total_tokens": 0
        }

    confidences = [t["confidence"] for t in token_details]
    # Skip missing logprobs (None or the -inf sentinel) so they do not
    # poison the average.
    logprobs = [
        t["logprob"] for t in token_details
        if t["logprob"] is not None and math.isfinite(t["logprob"])
    ]

    perplexity = None
    if logprobs:
        avg_logprob = sum(logprobs) / len(logprobs)
        try:
            perplexity = round(math.exp(-avg_logprob), 4)
        except OverflowError:
            # Extremely negative average logprob: perplexity exceeds float range.
            perplexity = None

    return {
        "mean_confidence": round(sum(confidences) / len(confidences), 6),
        "min_confidence": round(min(confidences), 6),
        "max_confidence": round(max(confidences), 6),
        "perplexity": perplexity,
        "total_tokens": len(token_details)
    }
254
+
255
+
256
+ # =============================================================================
257
+ # Image / File Helpers
258
  # =============================================================================
259
  TASK_PROMPTS = {
260
  "ocr": "OCR:",
 
293
 
294
 
295
  def serve_file(src_path: str, request_id: str, filename: str) -> str:
 
296
  static_subdir = os.path.join(STATIC_DIR, request_id)
297
  os.makedirs(static_subdir, exist_ok=True)
298
  dst_path = os.path.join(static_subdir, filename)
 
301
 
302
 
303
  def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
 
304
  result = {}
305
  if not os.path.exists(directory):
306
  return result
 
309
  ext = os.path.splitext(fname)[1].lower()
310
  if ext in IMAGE_EXTENSIONS:
311
  src = os.path.join(root, fname)
 
312
  rel_path = os.path.relpath(src, directory)
313
  safe_name = rel_path.replace(os.sep, "_")
314
  url = serve_file(src, request_id, safe_name)
 
316
  return result
317
 
318
 
319
+ # =============================================================================
320
+ # VLM call with confidence
321
+ # =============================================================================
322
+
323
def call_vllm_with_confidence(image_url: str, task_prompt: str) -> Tuple[str, List[Dict], List[Dict], Dict]:
    """
    Run one image + prompt chat completion with logprobs enabled.

    The request goes through the module-level ``openai_client`` using
    ``VLLM_MODEL_NAME`` (both defined elsewhere in this file), with a single
    user message containing the image URL and the task prompt, at
    temperature 0.0.

    Args:
        image_url: URL (or data URI) placed in the ``image_url`` content part.
        task_prompt: text prompt sent alongside the image.

    Returns:
        ``(result_text, token_confidences, word_confidences, overall_stats)``.
        ``result_text`` is always a string; a missing message content is
        returned as ``""``.

    Raises:
        Whatever the client call raises; nothing is caught here.
    """
    response = openai_client.chat.completions.create(
        model=VLLM_MODEL_NAME,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": task_prompt}
            ]
        }],
        temperature=0.0,
        logprobs=True,
        # NOTE(review): the alternatives requested here do not appear to be
        # read by parse_logprobs (it uses only token/logprob) - confirm
        # whether top_logprobs is still needed.
        top_logprobs=5
    )

    # The message content may be None; callers treat result_text as a str.
    result_text = response.choices[0].message.content or ""

    # Per-token confidence, then word grouping, then aggregate statistics.
    token_details = parse_logprobs(response)
    word_details = tokens_to_words(token_details)
    overall_stats = compute_overall_confidence(token_details)

    return result_text, token_details, word_details, overall_stats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
355
+
356
+ # =============================================================================
357
+ # Element-level Recognition
358
+ # =============================================================================
359
+
360
def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
    """Element-level recognition with confidence scores.

    Args:
        file_data: an ``http(s)://`` URL, a ready-made ``data:`` URI, or a
            bare base64 string. URLs and data URIs are forwarded unchanged;
            a bare base64 string is wrapped as a ``data:image/png;base64,``
            URI.
        prompt_label: key into ``TASK_PROMPTS``; unknown labels fall back to
            the ``"OCR:"`` prompt.

    Returns:
        A response dict with ``errorCode`` 0 and a single entry in
        ``result.layoutParsingResults`` holding the recognised text as one
        parsing block, the same text as markdown, and a ``confidence``
        section (overall stats, per-token and per-word details).
        ``spotting_res`` is filled via ``_parse_spotting`` only when
        ``prompt_label`` is ``"spotting"``.
    """
    if file_data.startswith(("http://", "https://", "data:")):
        # Already addressable as-is; wrapping a data URI again would corrupt it.
        image_url = file_data
    else:
        image_url = f"data:image/png;base64,{file_data}"

    task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")

    result_text, token_details, word_details, overall_stats = call_vllm_with_confidence(
        image_url, task_prompt
    )

    return {
        "errorCode": 0,
        "result": {
            "layoutParsingResults": [{
                "prunedResult": {
                    "page_count": 1,
                    "width": 0,
                    "height": 0,
                    # Single synthetic block: no layout detection ran, so the
                    # geometry fields are left empty.
                    "parsing_res_list": [{
                        "block_label": prompt_label,
                        "block_content": result_text,
                        "block_bbox": [],
                        "block_id": 0,
                        "block_order": 0,
                        "group_id": 0,
                        "global_block_id": 0,
                        "global_group_id": 0,
                        "block_polygon_points": []
                    }],
                    "layout_det_res": {"boxes": []},
                    "spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
                },
                "markdown": {"text": result_text, "images": {}},
                "outputImages": {},
                "confidence": {
                    "overall": overall_stats,
                    "tokens": token_details,
                    "words": word_details
                }
            }]
        }
    }
405
 
 
406
 
407
+ # =============================================================================
408
+ # Full Document Parsing
409
+ # =============================================================================
410
 
411
  def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
412
  use_doc_unwarping: bool = True,
413
+ use_doc_orientation_classify: bool = True,
414
+ include_confidence: bool = True) -> Dict[str, Any]:
415
+ """
416
+ Full document parsing with layout detection + VLM recognition.
417
+ When include_confidence=True, re-runs each block through vLLM with logprobs
418
+ to get per-token/word confidence scores.
419
+ """
420
  tmp_path = save_temp_image(file_data)
421
  request_id = str(uuid.uuid4())[:12]
422
 
 
455
  with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
456
  md_text = f.read()
457
 
458
+ # --- Read JSON ---
459
  json_data = {}
460
  json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
461
  if json_files:
462
  with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
463
  json_data = json.load(f)
464
 
465
+ # --- Collect and serve images ---
466
  all_images = collect_images_from_dir(output_dir, page_id)
467
 
 
468
  output_images = {}
469
  for rel_path, url in all_images.items():
470
  name = os.path.splitext(os.path.basename(rel_path))[0]
 
471
  if "layout" in name.lower() or "det" in name.lower() or "vis" in name.lower():
472
  output_images["layout_det_res"] = url
473
  else:
474
  output_images[name] = url
475
 
 
476
  md_images = {}
477
  imgs_dir = os.path.join(output_dir, "imgs")
478
  if os.path.exists(imgs_dir):
 
483
  url = serve_file(src, page_id, fname)
484
  local_ref = f"imgs/{fname}"
485
  md_images[local_ref] = url
 
486
  md_text = md_text.replace(f'src="{local_ref}"', f'src="{url}"')
487
  md_text = md_text.replace(f']({local_ref})', f']({url})')
488
 
 
489
  input_image_url = serve_file(tmp_path, page_id, f"input_img_{i}.jpg")
490
 
491
+ # --- Build prunedResult ---
492
  pruned_result = {}
493
  if json_data:
 
494
  pruned_result = {
495
  "page_count": json_data.get("page_count", 1),
496
  "width": json_data.get("width", img_width),
 
515
  json_data.get("det_res", {"boxes": []}))
516
  }
517
  else:
518
+ pruned_result = {
519
+ "page_count": 1,
520
+ "width": img_width,
521
+ "height": img_height,
522
+ "model_settings": {},
523
+ "parsing_res_list": [],
524
+ "layout_det_res": {"boxes": []}
525
+ }
526
 
 
527
  if not pruned_result.get("width"):
528
  pruned_result["width"] = img_width
529
  if not pruned_result.get("height"):
530
  pruned_result["height"] = img_height
531
 
532
+ # --- Confidence scores for each block ---
533
+ block_confidences = []
534
+ if include_confidence and pruned_result.get("parsing_res_list"):
535
+ # Use the full-page image for confidence scoring
536
+ if file_data.startswith(("http://", "https://")):
537
+ conf_image_url = file_data
538
+ else:
539
+ conf_image_url = f"data:image/png;base64,{file_data}"
540
+
541
+ # Get confidence for the entire page text
542
+ try:
543
+ _, page_tokens, page_words, page_overall = call_vllm_with_confidence(
544
+ conf_image_url, "OCR:"
545
+ )
546
+ block_confidences = {
547
+ "overall": page_overall,
548
+ "tokens": page_tokens,
549
+ "words": page_words
550
+ }
551
+ except Exception as e:
552
+ print(f"Warning: Could not get confidence scores: {e}")
553
+ block_confidences = {
554
+ "overall": {"mean_confidence": 0, "total_tokens": 0},
555
+ "tokens": [],
556
+ "words": []
557
+ }
558
+
559
  # --- Build page result ---
560
  page_result = {
561
  "prunedResult": pruned_result,
 
564
  "images": md_images
565
  },
566
  "outputImages": output_images,
567
+ "inputImage": input_image_url,
568
  }
569
 
570
+ if block_confidences:
571
+ page_result["confidence"] = block_confidences
572
+
573
  layout_parsing_results.append(page_result)
574
  preprocessed_images.append(input_image_url)
575
  data_info_pages.append({
 
582
  "result": {
583
  "layoutParsingResults": layout_parsing_results if layout_parsing_results else [{
584
  "prunedResult": {
585
+ "page_count": 0, "width": 0, "height": 0,
586
+ "parsing_res_list": [], "layout_det_res": {"boxes": []}
 
 
 
587
  },
588
  "markdown": {"text": "", "images": {}},
589
  "outputImages": {},
 
603
  os.unlink(tmp_path)
604
 
605
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  def _parse_spotting(text: str) -> dict:
607
  try:
608
  return json.loads(text)
 
619
  return {
620
  "service": "PaddleOCR-VL-1.5 Bridge API",
621
  "status": "running",
622
+ "version": "1.1.0 (with confidence scores)",
623
  "endpoints": ["/health", "/api/ocr", "/api/parse", "/api/parse/markdown", "/v1/chat/completions", "/docs"]
624
  }
625
 
 
633
  async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
634
  """
635
  Main OCR endpoint β€” compatible with the Gradio app.
636
+ Now includes per-token and per-word confidence scores.
637
 
638
  Body:
639
  {
 
642
  "promptLabel": "ocr|formula|table|chart|spotting|seal",
643
  "useChartRecognition": false,
644
  "useDocUnwarping": true,
645
+ "useDocOrientationClassify": true,
646
+ "includeConfidence": true (default: true)
647
+ }
648
+
649
+ Response includes:
650
+ {
651
+ "result": {
652
+ "layoutParsingResults": [{
653
+ ...
654
+ "confidence": {
655
+ "overall": {
656
+ "mean_confidence": 0.95,
657
+ "min_confidence": 0.42,
658
+ "max_confidence": 1.0,
659
+ "perplexity": 1.12,
660
+ "total_tokens": 85
661
+ },
662
+ "tokens": [
663
+ {"token": "Hello", "logprob": -0.02, "confidence": 0.98},
664
+ ...
665
+ ],
666
+ "words": [
667
+ {"word": "Hello", "confidence": 0.98, "avg_logprob": -0.02, "token_count": 1, "tokens": [...]},
668
+ ...
669
+ ]
670
+ }
671
+ }]
672
+ }
673
  }
674
  """
675
  verify_auth(authorization)
 
688
  use_chart = body.get("useChartRecognition", False)
689
  use_unwarp = body.get("useDocUnwarping", True)
690
  use_orient = body.get("useDocOrientationClassify", True)
691
+ include_confidence = body.get("includeConfidence", True)
692
 
693
  try:
694
  if use_layout:
695
+ return full_document_parsing(
696
+ file_data, use_chart, use_unwarp, use_orient,
697
+ include_confidence=include_confidence
698
+ )
699
  else:
700
  return element_level_recognition(file_data, prompt_label)
701
  except Exception as e:
 
708
  file: UploadFile = File(...),
709
  use_layout_detection: bool = True,
710
  prompt_label: str = "ocr",
711
+ include_confidence: bool = True,
712
  authorization: Optional[str] = Header(None)
713
  ):
714
+ """File upload endpoint with confidence scores."""
715
  verify_auth(authorization)
716
  content = await file.read()
717
  b64 = base64.b64encode(content).decode("utf-8")
718
 
719
  try:
720
  if use_layout_detection:
721
+ return full_document_parsing(b64, include_confidence=include_confidence)
722
  else:
723
  return element_level_recognition(b64, prompt_label)
724
  except Exception as e:
 
737
  b64 = base64.b64encode(content).decode("utf-8")
738
 
739
  try:
740
+ result = full_document_parsing(b64, include_confidence=False)
741
  pages = result.get("result", {}).get("layoutParsingResults", [])
742
  markdown_parts = [p.get("markdown", {}).get("text", "") for p in pages if p.get("markdown", {}).get("text")]
743
  return {
 
752
 
753
  @app.post("/v1/chat/completions")
754
  async def proxy_chat_completions(request: Request, authorization: Optional[str] = Header(None)):
755
+ """Proxy to vLLM for direct OpenAI-compatible calls (logprobs supported)."""
756
  verify_auth(authorization)
757
 
758
  import httpx
 
774
  print(f"""
775
  ╔══════════════════════════════════════════════════════════════╗
776
  β•‘ PaddleOCR-VL-1.5 Bridge Server (HF Spaces) β•‘
777
+ β•‘ v1.1.0 β€” with per-token/word confidence scores β•‘
778
  ╠══════════════════════════════════════════════════════════════╣
779
  β•‘ Bridge API: http://0.0.0.0:{BRIDGE_PORT} β•‘
780
  β•‘ vLLM backend: {VLLM_SERVER_URL:<44s}β•‘
781
  β•‘ Model: {VLLM_MODEL_NAME:<44s}β•‘
782
  β•‘ Auth: {"ENABLED" if API_KEY else "DISABLED":<44s}β•‘
 
783
  ╠══════════════════════════════════════════════════════════════╣
784
  β•‘ Endpoints: β•‘
785
  β•‘ GET /health - Health check β•‘
786
  β•‘ GET /docs - Swagger UI β•‘
787
+ β•‘ POST /api/ocr - Gradio-compatible + confidenceβ•‘
788
+ β•‘ POST /api/parse - File upload + confidence β•‘
789
  β•‘ POST /api/parse/markdown - Simple markdown output β•‘
790
  β•‘ POST /v1/chat/completions - vLLM proxy (OpenAI format) β•‘
791
  β•‘ GET /static/... - Output images β•‘