Spaces:

devappsmi
/

document_parse

Sleeping

App Files Files Community

devappsmi committed on Feb 17

Commit

334822d

verified ·

1 Parent(s): 063decf

Update app.py

Browse files

Files changed (1) hide show

app.py +340 -173

app.py CHANGED Viewed

@@ -1,13 +1,7 @@
 """
 PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
 ====================================================
-Returns full JSON response matching the official Baidu API format, including:
-- layoutParsingResults[].prunedResult (blocks, labels, bboxes, polygon points)
-- layoutParsingResults[].markdown (text + images)
-- layoutParsingResults[].outputImages (visualization URLs)
-- layoutParsingResults[].inputImage
-- preprocessedImages
-- dataInfo
 Architecture:
     Gradio App → This Bridge (port 7860) → vLLM Docker (117.54.141.62:8000)
@@ -15,12 +9,13 @@ Architecture:
 import base64
 import json
 import os
 import shutil
 import tempfile
 import traceback
 import uuid
-from typing import Any, Dict, List, Optional
 import uvicorn
 from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
@@ -74,8 +69,8 @@ def get_pipeline():
 # =============================================================================
 app = FastAPI(
     title="PaddleOCR-VL-1.5 Bridge API",
-    description="Full document parsing API matching official Baidu API format",
-    version="1.0.0"
 )
 app.add_middleware(
@@ -99,7 +94,167 @@ def verify_auth(authorization: Optional[str] = None):
 # =============================================================================
-# Helpers
 # =============================================================================
 TASK_PROMPTS = {
     "ocr": "OCR:",
@@ -138,7 +293,6 @@ def save_temp_image(file_data: str) -> str:
 def serve_file(src_path: str, request_id: str, filename: str) -> str:
-    """Copy a file to the static dir and return its public URL."""
     static_subdir = os.path.join(STATIC_DIR, request_id)
     os.makedirs(static_subdir, exist_ok=True)
     dst_path = os.path.join(static_subdir, filename)
@@ -147,7 +301,6 @@ def serve_file(src_path: str, request_id: str, filename: str) -> str:
 def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
-    """Find all images in a directory and serve them. Returns {filename: url}."""
     result = {}
     if not os.path.exists(directory):
         return result
@@ -156,7 +309,6 @@ def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
             ext = os.path.splitext(fname)[1].lower()
             if ext in IMAGE_EXTENSIONS:
                 src = os.path.join(root, fname)
-                # Preserve subdirectory structure in the filename
                 rel_path = os.path.relpath(src, directory)
                 safe_name = rel_path.replace(os.sep, "_")
                 url = serve_file(src, request_id, safe_name)
@@ -164,100 +316,107 @@ def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
     return result
-def extract_pruned_result(res_obj, page_index: int = 0) -> Dict[str, Any]:
     """
-    Extract the full prunedResult from a PaddleOCR result object,
-    matching the official Baidu API format.
     """
-    pruned = {}
-    try:
-        # Try to get the raw dict/json from the result object
-        if hasattr(res_obj, 'json'):
-            raw = res_obj.json if isinstance(res_obj.json, dict) else {}
-        elif hasattr(res_obj, '_result'):
-            raw = res_obj._result if isinstance(res_obj._result, dict) else {}
-        elif hasattr(res_obj, 'to_dict'):
-            raw = res_obj.to_dict()
-        else:
-            raw = {}
-        # Try multiple attribute paths to find the parsing results
-        parsing_res_list = []
-        layout_det_res = {"boxes": []}
-        # Check common attribute names
-        for attr in ['parsing_res_list', 'parsing_result', 'blocks']:
-            if hasattr(res_obj, attr):
-                parsing_res_list = getattr(res_obj, attr, [])
-                break
-        # Check for layout detection results
-        for attr in ['layout_det_res', 'layout_result', 'det_res']:
-            if hasattr(res_obj, attr):
-                layout_det_res = getattr(res_obj, attr, {})
-                break
-        # Get image dimensions
-        width = 0
-        height = 0
-        for attr in ['img_width', 'width']:
-            if hasattr(res_obj, attr):
-                width = getattr(res_obj, attr, 0)
-                break
-        for attr in ['img_height', 'height']:
-            if hasattr(res_obj, attr):
-                height = getattr(res_obj, attr, 0)
-                break
-        # If we got raw dict, try to extract from it
-        if raw and not parsing_res_list:
-            parsing_res_list = raw.get('parsing_res_list', raw.get('blocks', []))
-            layout_det_res = raw.get('layout_det_res', {"boxes": []})
-            width = raw.get('width', width)
-            height = raw.get('height', height)
-        pruned = {
-            "page_count": 1,
-            "width": width,
-            "height": height,
-            "model_settings": {
-                "use_doc_preprocessor": False,
-                "use_layout_detection": True,
-                "use_chart_recognition": False,
-                "use_seal_recognition": True,
-                "use_ocr_for_image_block": False,
-                "format_block_content": True,
-                "merge_layout_blocks": True,
-                "markdown_ignore_labels": [
-                    "number", "footnote", "header",
-                    "header_image", "footer", "footer_image", "aside_text"
-                ],
-                "return_layout_polygon_points": True
-            },
-            "parsing_res_list": parsing_res_list if isinstance(parsing_res_list, list) else [],
-            "layout_det_res": layout_det_res if isinstance(layout_det_res, dict) else {"boxes": []}
-        }
-    except Exception as e:
-        print(f"Warning: Could not extract prunedResult: {e}")
-        traceback.print_exc()
-        pruned = {
-            "page_count": 1,
-            "width": 0,
-            "height": 0,
-            "model_settings": {},
-            "parsing_res_list": [],
-            "layout_det_res": {"boxes": []}
         }
-    return pruned
 def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                           use_doc_unwarping: bool = True,
-                          use_doc_orientation_classify: bool = True) -> Dict[str, Any]:
-    """Full document parsing — returns response matching official Baidu API format."""
     tmp_path = save_temp_image(file_data)
     request_id = str(uuid.uuid4())[:12]
@@ -296,27 +455,24 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                 with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
                     md_text = f.read()
-            # --- Read JSON (contains prunedResult data) ---
             json_data = {}
             json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
             if json_files:
                 with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
                     json_data = json.load(f)
-            # --- Collect and serve all images ---
             all_images = collect_images_from_dir(output_dir, page_id)
-            # --- Build outputImages ---
             output_images = {}
             for rel_path, url in all_images.items():
                 name = os.path.splitext(os.path.basename(rel_path))[0]
-                # Identify layout detection visualization
                 if "layout" in name.lower() or "det" in name.lower() or "vis" in name.lower():
                     output_images["layout_det_res"] = url
                 else:
                     output_images[name] = url
-            # --- Build markdown images map ---
             md_images = {}
             imgs_dir = os.path.join(output_dir, "imgs")
             if os.path.exists(imgs_dir):
@@ -327,17 +483,14 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                         url = serve_file(src, page_id, fname)
                         local_ref = f"imgs/{fname}"
                         md_images[local_ref] = url
-                        # Replace references in markdown
                         md_text = md_text.replace(f'src="{local_ref}"', f'src="{url}"')
                         md_text = md_text.replace(f']({local_ref})', f']({url})')
-            # --- Serve input image ---
             input_image_url = serve_file(tmp_path, page_id, f"input_img_{i}.jpg")
-            # --- Build prunedResult from JSON data or result object ---
             pruned_result = {}
             if json_data:
-                # Try to use the saved JSON directly
                 pruned_result = {
                     "page_count": json_data.get("page_count", 1),
                     "width": json_data.get("width", img_width),
@@ -362,14 +515,47 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                                       json_data.get("det_res", {"boxes": []}))
                 }
             else:
-                pruned_result = extract_pruned_result(res, i)
-            # Ensure dimensions are set
             if not pruned_result.get("width"):
                 pruned_result["width"] = img_width
             if not pruned_result.get("height"):
                 pruned_result["height"] = img_height
             # --- Build page result ---
             page_result = {
                 "prunedResult": pruned_result,
@@ -378,9 +564,12 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                     "images": md_images
                 },
                 "outputImages": output_images,
-                "inputImage": input_image_url
             }
             layout_parsing_results.append(page_result)
             preprocessed_images.append(input_image_url)
             data_info_pages.append({
@@ -393,11 +582,8 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
             "result": {
                 "layoutParsingResults": layout_parsing_results if layout_parsing_results else [{
                     "prunedResult": {
-                        "page_count": 0,
-                        "width": 0,
-                        "height": 0,
-                        "parsing_res_list": [],
-                        "layout_det_res": {"boxes": []}
                     },
                     "markdown": {"text": "", "images": {}},
                     "outputImages": {},
@@ -417,58 +603,6 @@ def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
             os.unlink(tmp_path)
-def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
-    """Element-level recognition via direct vLLM call."""
-    if file_data.startswith(("http://", "https://")):
-        image_url = file_data
-    else:
-        image_url = f"data:image/png;base64,{file_data}"
-    task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
-    response = openai_client.chat.completions.create(
-        model=VLLM_MODEL_NAME,
-        messages=[{
-            "role": "user",
-            "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": task_prompt}
-            ]
-        }],
-        temperature=0.0
-    )
-    result_text = response.choices[0].message.content
-    return {
-        "errorCode": 0,
-        "result": {
-            "layoutParsingResults": [{
-                "prunedResult": {
-                    "page_count": 1,
-                    "width": 0,
-                    "height": 0,
-                    "parsing_res_list": [{
-                        "block_label": prompt_label,
-                        "block_content": result_text,
-                        "block_bbox": [],
-                        "block_id": 0,
-                        "block_order": 0,
-                        "group_id": 0,
-                        "global_block_id": 0,
-                        "global_group_id": 0,
-                        "block_polygon_points": []
-                    }],
-                    "layout_det_res": {"boxes": []}
-                },
-                "markdown": {"text": result_text, "images": {}},
-                "outputImages": {},
-                "prunedResult.spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
-            }]
-        }
-    }
 def _parse_spotting(text: str) -> dict:
     try:
         return json.loads(text)
@@ -485,6 +619,7 @@ async def root():
     return {
         "service": "PaddleOCR-VL-1.5 Bridge API",
         "status": "running",
         "endpoints": ["/health", "/api/ocr", "/api/parse", "/api/parse/markdown", "/v1/chat/completions", "/docs"]
     }
@@ -498,7 +633,7 @@ async def health():
 async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
     """
     Main OCR endpoint — compatible with the Gradio app.
-    Returns full JSON matching official Baidu API format.
     Body:
     {
@@ -507,7 +642,34 @@ async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(N
         "promptLabel": "ocr|formula|table|chart|spotting|seal",
         "useChartRecognition": false,
         "useDocUnwarping": true,
-        "useDocOrientationClassify": true
     }
     """
     verify_auth(authorization)
@@ -526,10 +688,14 @@ async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(N
     use_chart = body.get("useChartRecognition", False)
     use_unwarp = body.get("useDocUnwarping", True)
     use_orient = body.get("useDocOrientationClassify", True)
     try:
         if use_layout:
-            return full_document_parsing(file_data, use_chart, use_unwarp, use_orient)
         else:
             return element_level_recognition(file_data, prompt_label)
     except Exception as e:
@@ -542,16 +708,17 @@ async def parse_file(
     file: UploadFile = File(...),
     use_layout_detection: bool = True,
     prompt_label: str = "ocr",
     authorization: Optional[str] = Header(None)
 ):
-    """File upload endpoint."""
     verify_auth(authorization)
     content = await file.read()
     b64 = base64.b64encode(content).decode("utf-8")
     try:
         if use_layout_detection:
-            return full_document_parsing(b64)
         else:
             return element_level_recognition(b64, prompt_label)
     except Exception as e:
@@ -570,7 +737,7 @@ async def parse_to_markdown(
     b64 = base64.b64encode(content).decode("utf-8")
     try:
-        result = full_document_parsing(b64)
         pages = result.get("result", {}).get("layoutParsingResults", [])
         markdown_parts = [p.get("markdown", {}).get("text", "") for p in pages if p.get("markdown", {}).get("text")]
         return {
@@ -585,7 +752,7 @@ async def parse_to_markdown(
 @app.post("/v1/chat/completions")
 async def proxy_chat_completions(request: Request, authorization: Optional[str] = Header(None)):
-    """Proxy to vLLM for direct OpenAI-compatible calls."""
     verify_auth(authorization)
     import httpx
@@ -607,18 +774,18 @@ if __name__ == "__main__":
     print(f"""
 ╔══════════════════════════════════════════════════════════════╗
 ║     PaddleOCR-VL-1.5 Bridge Server (HF Spaces)             ║
 ╠══════════════════════════════════════════════════════════════╣
 ║  Bridge API:   http://0.0.0.0:{BRIDGE_PORT}                          ║
 ║  vLLM backend: {VLLM_SERVER_URL:<44s}║
 ║  Model:        {VLLM_MODEL_NAME:<44s}║
 ║  Auth:         {"ENABLED" if API_KEY else "DISABLED":<44s}║
-║  Static URL:   {PUBLIC_BASE_URL:<44s}║
 ╠══════════════════════════════════════════════════════════════╣
 ║  Endpoints:                                                  ║
 ║    GET  /health              - Health check                  ║
 ║    GET  /docs                - Swagger UI                    ║
-║    POST /api/ocr             - Gradio-compatible API         ║
-║    POST /api/parse           - File upload API               ║
 ║    POST /api/parse/markdown  - Simple markdown output        ║
 ║    POST /v1/chat/completions - vLLM proxy (OpenAI format)    ║
 ║    GET  /static/...          - Output images                 ║

 """
 PaddleOCR-VL-1.5 Bridge Server (HF Spaces Edition)
 ====================================================
+With per-token and per-word confidence scores via vLLM logprobs.
 Architecture:
     Gradio App → This Bridge (port 7860) → vLLM Docker (117.54.141.62:8000)
 import base64
 import json
+import math
 import os
 import shutil
 import tempfile
 import traceback
 import uuid
+from typing import Any, Dict, List, Optional, Tuple
 import uvicorn
 from fastapi import FastAPI, File, Header, HTTPException, Request, UploadFile
 # =============================================================================
 app = FastAPI(
     title="PaddleOCR-VL-1.5 Bridge API",
+    description="Full document parsing API with per-token/word confidence scores",
+    version="1.1.0"
 )
 app.add_middleware(
 # =============================================================================
+# Confidence Score Helpers
+# =============================================================================
def parse_logprobs(response) -> List[Dict[str, Any]]:
    """
    Build per-token confidence records from a chat-completion response.

    Reads ``response.choices[0].logprobs`` and understands two layouts:
      * a ``content`` list whose items expose ``token`` / ``logprob``
        attributes (labelled "OpenAI-compatible" by the original author);
      * parallel ``tokens`` / ``token_logprobs`` sequences (labelled
        "legacy vLLM" by the original author).

    Returns a list of ``{"token", "logprob", "confidence"}`` dicts where
    ``confidence`` is ``exp(logprob)``; both numbers are rounded to 6 places.
    A token whose logprob is ``None`` is kept with logprob ``-inf`` and
    confidence ``0.0``.

    Best effort: any exception is printed (with traceback) and whatever was
    collected up to that point is returned.
    """
    collected: List[Dict[str, Any]] = []

    def record(text, raw_logprob) -> None:
        # Missing logprobs stay in the output as -inf / 0.0 rather than
        # being dropped, so the token sequence remains complete.
        if raw_logprob is None:
            raw_logprob = float('-inf')
            probability = 0.0
        else:
            probability = math.exp(raw_logprob)
        collected.append({
            "token": text,
            "logprob": round(raw_logprob, 6),
            "confidence": round(probability, 6)
        })

    try:
        payload = response.choices[0].logprobs
        if payload is None:
            return collected

        entries = getattr(payload, 'content', None)
        if entries:
            # Layout 1: one object per generated token.
            for entry in entries:
                record(getattr(entry, 'token', ''), getattr(entry, 'logprob', None))
        else:
            # Layout 2: two parallel sequences.
            texts = getattr(payload, 'tokens', None)
            values = getattr(payload, 'token_logprobs', None)
            if texts and values:
                for text, value in zip(texts, values):
                    record(text, value)
    except Exception as e:
        print(f"Warning: Could not parse logprobs: {e}")
        traceback.print_exc()
    return collected
def tokens_to_words(token_details: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Group consecutive token records into word records.

    A token opens a new word when it starts with a space, with the
    "▁" or "Ġ" space marker, is one of "\\n", "\\r", "\\t", "\\r\\n", or is
    whitespace-only while a word is already being collected. Each finished
    group is handed to ``_finalize_word``, which produces the word record.

    Returns an empty list for empty input.
    """
    if not token_details:
        return []

    space_prefixes = (" ", "▁", "Ġ")  # plain space, sentencepiece, GPT-2 style
    break_tokens = ("\n", "\r", "\t", "\r\n")

    grouped = []
    pending = []
    for entry in token_details:
        text = entry["token"]
        opens_new_word = (
            text.startswith(space_prefixes)
            or text in break_tokens
            or (bool(pending) and text.strip() == "")
        )
        # Only flush when there is something collected; the very first
        # token never closes a (non-existent) previous word.
        if pending and opens_new_word:
            grouped.append(_finalize_word(pending))
            pending = []
        pending.append(entry)

    # Flush the trailing group.
    if pending:
        grouped.append(_finalize_word(pending))
    return grouped
def _finalize_word(tokens: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Collapse a group of token records into a single word record.

    The word text is the concatenated token strings, stripped of surrounding
    whitespace and of leading "▁" / "Ġ" / space characters. Word confidence
    is the geometric mean of the token probabilities, computed as
    ``exp(mean(logprob))`` over the tokens whose logprob is not ``-inf``;
    when no such token exists the confidence is 0.0 and ``avg_logprob`` is
    ``None``.
    """
    neg_inf = float('-inf')

    joined = "".join(piece["token"] for piece in tokens)
    word_text = joined.strip().lstrip("▁Ġ ")

    # Tokens recorded with a -inf logprob are left out of the mean.
    usable = [piece["logprob"] for piece in tokens if piece["logprob"] != neg_inf]
    if not usable:
        avg_logprob = neg_inf
        word_confidence = 0.0
    else:
        avg_logprob = sum(usable) / len(usable)
        word_confidence = math.exp(avg_logprob)

    per_token = [
        {"token": piece["token"], "confidence": piece["confidence"]}
        for piece in tokens
    ]
    return {
        "word": word_text,
        "confidence": round(word_confidence, 6),
        "avg_logprob": None if avg_logprob == neg_inf else round(avg_logprob, 6),
        "token_count": len(tokens),
        "tokens": per_token
    }
def compute_overall_confidence(token_details: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Compute summary confidence statistics over a list of token records.

    Args:
        token_details: records carrying ``"confidence"`` and ``"logprob"``
            keys.

    Returns:
        A dict that always has the same five keys:
        ``mean_confidence``, ``min_confidence``, ``max_confidence``
        (each rounded to 6 places), ``perplexity`` (``exp(-mean(logprob))``
        rounded to 4 places, or ``None`` when it cannot be represented) and
        ``total_tokens``. Empty input yields zeros and ``perplexity=None``.
    """
    if not token_details:
        # Same key set as the populated result so consumers can rely on one schema.
        return {
            "mean_confidence": 0.0,
            "min_confidence": 0.0,
            "max_confidence": 0.0,
            "perplexity": None,
            "total_tokens": 0
        }

    confidences = [t["confidence"] for t in token_details]
    # Tokens with a missing logprob (None or the -inf sentinel) are left out
    # of the perplexity calculation.
    logprobs = [
        t["logprob"] for t in token_details
        if t["logprob"] is not None and t["logprob"] != float('-inf')
    ]

    # Perplexity = exp(-mean(logprobs)) — lower is more confident.
    perplexity = None
    if logprobs:
        avg_logprob = sum(logprobs) / len(logprobs)
        try:
            value = math.exp(-avg_logprob)
        except OverflowError:
            # A very negative mean logprob does not fit in a float; report
            # "no perplexity" instead of failing the whole request.
            value = float('inf')
        if not math.isinf(value):
            perplexity = round(value, 4)

    return {
        "mean_confidence": round(sum(confidences) / len(confidences), 6),
        "min_confidence": round(min(confidences), 6),
        "max_confidence": round(max(confidences), 6),
        "perplexity": perplexity,
        "total_tokens": len(token_details)
    }
+# =============================================================================
+# Image / File Helpers
 # =============================================================================
 TASK_PROMPTS = {
     "ocr": "OCR:",
 def serve_file(src_path: str, request_id: str, filename: str) -> str:
     static_subdir = os.path.join(STATIC_DIR, request_id)
     os.makedirs(static_subdir, exist_ok=True)
     dst_path = os.path.join(static_subdir, filename)
 def collect_images_from_dir(directory: str, request_id: str) -> Dict[str, str]:
     result = {}
     if not os.path.exists(directory):
         return result
             ext = os.path.splitext(fname)[1].lower()
             if ext in IMAGE_EXTENSIONS:
                 src = os.path.join(root, fname)
                 rel_path = os.path.relpath(src, directory)
                 safe_name = rel_path.replace(os.sep, "_")
                 url = serve_file(src, request_id, safe_name)
     return result
+# =============================================================================
+# VLM call with confidence
+# =============================================================================
def call_vllm_with_confidence(image_url: str, task_prompt: str) -> Tuple[str, List[Dict], List[Dict], Dict]:
    """
    Send one image + prompt to the chat-completions client with logprobs on.

    The request is a single user message holding the image URL and the task
    prompt, sent to ``VLLM_MODEL_NAME`` at temperature 0.0 with
    ``logprobs=True`` and ``top_logprobs=5``.

    Returns: (result_text, token_confidences, word_confidences, overall_stats)
    where the last three come from ``parse_logprobs``, ``tokens_to_words``
    and ``compute_overall_confidence`` respectively.
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": task_prompt}
        ]
    }
    completion = openai_client.chat.completions.create(
        model=VLLM_MODEL_NAME,
        messages=[user_message],
        temperature=0.0,
        logprobs=True,
        top_logprobs=5
    )
    # Read the text first so a response without choices fails here, before
    # any logprob parsing is attempted.
    recognized_text = completion.choices[0].message.content

    per_token = parse_logprobs(completion)
    per_word = tokens_to_words(per_token)
    summary = compute_overall_confidence(per_token)
    return recognized_text, per_token, per_word, summary
+# =============================================================================
+# Element-level Recognition
+# =============================================================================
def element_level_recognition(file_data: str, prompt_label: str) -> Dict[str, Any]:
    """
    Recognize a single element and wrap the result in the page-result envelope.

    ``file_data`` is used as-is when it starts with ``http://`` or
    ``https://``; otherwise it is embedded in a ``data:image/png;base64,``
    URL. The prompt is looked up in ``TASK_PROMPTS`` by ``prompt_label``
    (falling back to ``"OCR:"``) and sent through
    ``call_vllm_with_confidence``.

    Returns a dict with ``errorCode`` 0 and one ``layoutParsingResults``
    entry: a single-block ``prunedResult`` (zero width/height, empty bbox
    and polygon), the text as markdown, empty ``outputImages`` and a
    ``confidence`` section with overall / token / word details.
    ``spotting_res`` is parsed from the text only for the ``"spotting"``
    label; otherwise it is an empty dict.
    """
    is_remote = file_data.startswith(("http://", "https://"))
    image_url = file_data if is_remote else f"data:image/png;base64,{file_data}"

    task_prompt = TASK_PROMPTS.get(prompt_label, "OCR:")
    result_text, token_details, word_details, overall_stats = call_vllm_with_confidence(
        image_url, task_prompt
    )

    # The whole response is reported as one block with no geometry.
    block = {
        "block_label": prompt_label,
        "block_content": result_text,
        "block_bbox": [],
        "block_id": 0,
        "block_order": 0,
        "group_id": 0,
        "global_block_id": 0,
        "global_group_id": 0,
        "block_polygon_points": []
    }
    pruned = {
        "page_count": 1,
        "width": 0,
        "height": 0,
        "parsing_res_list": [block],
        "layout_det_res": {"boxes": []},
        "spotting_res": _parse_spotting(result_text) if prompt_label == "spotting" else {}
    }
    page = {
        "prunedResult": pruned,
        "markdown": {"text": result_text, "images": {}},
        "outputImages": {},
        "confidence": {
            "overall": overall_stats,
            "tokens": token_details,
            "words": word_details
        }
    }
    return {
        "errorCode": 0,
        "result": {
            "layoutParsingResults": [page]
        }
    }
+# =============================================================================
+# Full Document Parsing
+# =============================================================================
 def full_document_parsing(file_data: str, use_chart_recognition: bool = False,
                           use_doc_unwarping: bool = True,
+                          use_doc_orientation_classify: bool = True,
+                          include_confidence: bool = True) -> Dict[str, Any]:
+    """
+    Full document parsing with layout detection + VLM recognition.
+    When include_confidence=True, re-runs each block through vLLM with logprobs
+    to get per-token/word confidence scores.
+    """
     tmp_path = save_temp_image(file_data)
     request_id = str(uuid.uuid4())[:12]
                 with open(os.path.join(output_dir, md_files[0]), "r", encoding="utf-8") as f:
                     md_text = f.read()
+            # --- Read JSON ---
             json_data = {}
             json_files = [f for f in os.listdir(output_dir) if f.endswith(".json")]
             if json_files:
                 with open(os.path.join(output_dir, json_files[0]), "r", encoding="utf-8") as f:
                     json_data = json.load(f)
+            # --- Collect and serve images ---
             all_images = collect_images_from_dir(output_dir, page_id)
             output_images = {}
             for rel_path, url in all_images.items():
                 name = os.path.splitext(os.path.basename(rel_path))[0]
                 if "layout" in name.lower() or "det" in name.lower() or "vis" in name.lower():
                     output_images["layout_det_res"] = url
                 else:
                     output_images[name] = url
             md_images = {}
             imgs_dir = os.path.join(output_dir, "imgs")
             if os.path.exists(imgs_dir):
                         url = serve_file(src, page_id, fname)
                         local_ref = f"imgs/{fname}"
                         md_images[local_ref] = url
                         md_text = md_text.replace(f'src="{local_ref}"', f'src="{url}"')
                         md_text = md_text.replace(f']({local_ref})', f']({url})')
             input_image_url = serve_file(tmp_path, page_id, f"input_img_{i}.jpg")
+            # --- Build prunedResult ---
             pruned_result = {}
             if json_data:
                 pruned_result = {
                     "page_count": json_data.get("page_count", 1),
                     "width": json_data.get("width", img_width),
                                       json_data.get("det_res", {"boxes": []}))
                 }
             else:
+                pruned_result = {
+                    "page_count": 1,
+                    "width": img_width,
+                    "height": img_height,
+                    "model_settings": {},
+                    "parsing_res_list": [],
+                    "layout_det_res": {"boxes": []}
+                }
             if not pruned_result.get("width"):
                 pruned_result["width"] = img_width
             if not pruned_result.get("height"):
                 pruned_result["height"] = img_height
+            # --- Confidence scores for each block ---
+            block_confidences = []
+            if include_confidence and pruned_result.get("parsing_res_list"):
+                # Use the full-page image for confidence scoring
+                if file_data.startswith(("http://", "https://")):
+                    conf_image_url = file_data
+                else:
+                    conf_image_url = f"data:image/png;base64,{file_data}"
+                # Get confidence for the entire page text
+                try:
+                    _, page_tokens, page_words, page_overall = call_vllm_with_confidence(
+                        conf_image_url, "OCR:"
+                    )
+                    block_confidences = {
+                        "overall": page_overall,
+                        "tokens": page_tokens,
+                        "words": page_words
+                    }
+                except Exception as e:
+                    print(f"Warning: Could not get confidence scores: {e}")
+                    block_confidences = {
+                        "overall": {"mean_confidence": 0, "total_tokens": 0},
+                        "tokens": [],
+                        "words": []
+                    }
             # --- Build page result ---
             page_result = {
                 "prunedResult": pruned_result,
                     "images": md_images
                 },
                 "outputImages": output_images,
+                "inputImage": input_image_url,
             }
+            if block_confidences:
+                page_result["confidence"] = block_confidences
             layout_parsing_results.append(page_result)
             preprocessed_images.append(input_image_url)
             data_info_pages.append({
             "result": {
                 "layoutParsingResults": layout_parsing_results if layout_parsing_results else [{
                     "prunedResult": {
+                        "page_count": 0, "width": 0, "height": 0,
+                        "parsing_res_list": [], "layout_det_res": {"boxes": []}
                     },
                     "markdown": {"text": "", "images": {}},
                     "outputImages": {},
             os.unlink(tmp_path)
 def _parse_spotting(text: str) -> dict:
     try:
         return json.loads(text)
     return {
         "service": "PaddleOCR-VL-1.5 Bridge API",
         "status": "running",
+        "version": "1.1.0 (with confidence scores)",
         "endpoints": ["/health", "/api/ocr", "/api/parse", "/api/parse/markdown", "/v1/chat/completions", "/docs"]
     }
 async def ocr_endpoint(request: Request, authorization: Optional[str] = Header(None)):
     """
     Main OCR endpoint — compatible with the Gradio app.
+    Now includes per-token and per-word confidence scores.
     Body:
     {
         "promptLabel": "ocr|formula|table|chart|spotting|seal",
         "useChartRecognition": false,
         "useDocUnwarping": true,
+        "useDocOrientationClassify": true,
+        "includeConfidence": true  (default: true)
+    }
+    Response includes:
+    {
+        "result": {
+            "layoutParsingResults": [{
+                ...
+                "confidence": {
+                    "overall": {
+                        "mean_confidence": 0.95,
+                        "min_confidence": 0.42,
+                        "max_confidence": 1.0,
+                        "perplexity": 1.12,
+                        "total_tokens": 85
+                    },
+                    "tokens": [
+                        {"token": "Hello", "logprob": -0.02, "confidence": 0.98},
+                        ...
+                    ],
+                    "words": [
+                        {"word": "Hello", "confidence": 0.98, "avg_logprob": -0.02, "token_count": 1, "tokens": [...]},
+                        ...
+                    ]
+                }
+            }]
+        }
     }
     """
     verify_auth(authorization)
     use_chart = body.get("useChartRecognition", False)
     use_unwarp = body.get("useDocUnwarping", True)
     use_orient = body.get("useDocOrientationClassify", True)
+    include_confidence = body.get("includeConfidence", True)
     try:
         if use_layout:
+            return full_document_parsing(
+                file_data, use_chart, use_unwarp, use_orient,
+                include_confidence=include_confidence
+            )
         else:
             return element_level_recognition(file_data, prompt_label)
     except Exception as e:
     file: UploadFile = File(...),
     use_layout_detection: bool = True,
     prompt_label: str = "ocr",
+    include_confidence: bool = True,
     authorization: Optional[str] = Header(None)
 ):
+    """File upload endpoint with confidence scores."""
     verify_auth(authorization)
     content = await file.read()
     b64 = base64.b64encode(content).decode("utf-8")
     try:
         if use_layout_detection:
+            return full_document_parsing(b64, include_confidence=include_confidence)
         else:
             return element_level_recognition(b64, prompt_label)
     except Exception as e:
     b64 = base64.b64encode(content).decode("utf-8")
     try:
+        result = full_document_parsing(b64, include_confidence=False)
         pages = result.get("result", {}).get("layoutParsingResults", [])
         markdown_parts = [p.get("markdown", {}).get("text", "") for p in pages if p.get("markdown", {}).get("text")]
         return {
 @app.post("/v1/chat/completions")
 async def proxy_chat_completions(request: Request, authorization: Optional[str] = Header(None)):
+    """Proxy to vLLM for direct OpenAI-compatible calls (logprobs supported)."""
     verify_auth(authorization)
     import httpx
     print(f"""
 ╔══════════════════════════════════════════════════════════════╗
 ║     PaddleOCR-VL-1.5 Bridge Server (HF Spaces)             ║
+║     v1.1.0 — with per-token/word confidence scores          ║
 ╠══════════════════════════════════════════════════════════════╣
 ║  Bridge API:   http://0.0.0.0:{BRIDGE_PORT}                          ║
 ║  vLLM backend: {VLLM_SERVER_URL:<44s}║
 ║  Model:        {VLLM_MODEL_NAME:<44s}║
 ║  Auth:         {"ENABLED" if API_KEY else "DISABLED":<44s}║
 ╠══════════════════════════════════════════════════════════════╣
 ║  Endpoints:                                                  ║
 ║    GET  /health              - Health check                  ║
 ║    GET  /docs                - Swagger UI                    ║
+║    POST /api/ocr             - Gradio-compatible + confidence║
+║    POST /api/parse           - File upload + confidence      ║
 ║    POST /api/parse/markdown  - Simple markdown output        ║
 ║    POST /v1/chat/completions - vLLM proxy (OpenAI format)    ║
 ║    GET  /static/...          - Output images                 ║