sidoutcome commited on
Commit
16b2195
Β·
1 Parent(s): ba23da1

feat: v5.0.0 PaddleOCR-VL-1.5 + Gemini hybrid architecture

Browse files

- Replace Qwen3-VL + Docling with PaddleOCR-VL-1.5 (0.9B params, #1 OmniDocBench 94.5%)
- Keep Gemini 3 Flash for table page enhancement only
- Split monolithic app.py into 8 focused modules
- Switch from A100 to T4 GPU (84% cost reduction)
- Native cross-page table merging via PP-DocLayoutV2
- Enhanced post-processing: footer/artifact removal, table cleanup

Files changed (11) hide show
  1. Dockerfile +41 -38
  2. app.py +51 -1556
  3. auth.py +89 -0
  4. config.py +37 -0
  5. gemini.py +132 -0
  6. models.py +40 -0
  7. pipeline.py +210 -0
  8. postprocess.py +341 -0
  9. rendering.py +112 -0
  10. requirements.txt +9 -15
  11. start.sh +6 -83
Dockerfile CHANGED
@@ -1,9 +1,13 @@
1
- # Hugging Face Spaces Dockerfile for Docling VLM Document Parser API
2
- # GPU-accelerated document parsing with Docling + Qwen3-VL-30B-A3B via vLLM
3
- # Build: v2.0.0 - Docling with VLM backend for superior accuracy
 
 
 
 
4
 
5
- # Use vLLM base image with CUDA, PyTorch, and vLLM pre-installed
6
- FROM vllm/vllm-openai:v0.14.1
7
 
8
  USER root
9
 
@@ -12,18 +16,26 @@ RUN echo "========== BUILD STARTED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') ======
12
  # Install system dependencies
13
  RUN echo "========== STEP 1: Installing system dependencies ==========" && \
14
  apt-get update && apt-get install -y --no-install-recommends \
 
 
 
 
 
15
  # Fonts for document rendering
16
  fonts-noto-core \
17
  fonts-noto-cjk \
18
  fontconfig \
19
- # Image processing
20
  libgl1 \
21
  libglib2.0-0 \
22
- # PDF utilities
23
  poppler-utils \
24
  # Health checks
25
  curl \
26
  && fc-cache -fv && \
 
 
 
27
  rm -rf /var/lib/apt/lists/* && \
28
  echo "========== System dependencies installed =========="
29
 
@@ -33,24 +45,17 @@ RUN useradd -m -u 1000 user
33
  # Set environment variables
34
  ENV PYTHONUNBUFFERED=1 \
35
  PYTHONDONTWRITEBYTECODE=1 \
36
- VLM_MODEL=Qwen/Qwen3-VL-30B-A3B-Instruct \
37
- VLM_HOST=127.0.0.1 \
38
- VLM_PORT=8000 \
39
- VLM_GPU_MEMORY_UTILIZATION=0.85 \
40
- VLM_MAX_MODEL_LEN=65536 \
41
  IMAGES_SCALE=2.0 \
42
  MAX_FILE_SIZE_MB=1024 \
43
  HF_HOME=/home/user/.cache/huggingface \
44
- TORCH_HOME=/home/user/.cache/torch \
45
  XDG_CACHE_HOME=/home/user/.cache \
46
  HOME=/home/user \
47
- PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH \
48
- LD_LIBRARY_PATH=/home/user/.local/lib/python3.12/site-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
49
 
50
  # Create cache directories with correct ownership
51
  RUN echo "========== STEP 2: Creating cache directories ==========" && \
52
  mkdir -p /home/user/.cache/huggingface \
53
- /home/user/.cache/torch \
54
  /home/user/app && \
55
  chown -R user:user /home/user && \
56
  echo "========== Cache directories created =========="
@@ -62,30 +67,29 @@ WORKDIR /home/user/app
62
  # Copy requirements first for better caching
63
  COPY --chown=user:user requirements.txt .
64
 
65
- # Install Python dependencies
66
- RUN echo "========== STEP 3: Installing Python dependencies ==========" && \
67
- pip install --user --upgrade pip && \
68
- pip install --user nvidia-cudnn-cu12 && \
69
- pip install --user -r requirements.txt && \
 
 
 
 
 
 
70
  echo "Installed packages:" && \
71
  pip list --user && \
72
  echo "========== Python dependencies installed =========="
73
 
74
- # Pre-download Qwen3-VL-30B-A3B model for vLLM (use default HF cache so vLLM resolves by repo ID)
75
- RUN echo "========== STEP 4: Pre-downloading Qwen3-VL-30B-A3B model ==========" && \
76
- python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen3-VL-30B-A3B-Instruct')" && \
77
  echo "Model cache summary:" && \
 
78
  du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
79
- echo "========== Qwen3-VL-30B-A3B model downloaded =========="
80
-
81
- # Pre-download Docling models
82
- RUN echo "========== STEP 5: Pre-downloading Docling models ==========" && \
83
- python3 -c "from docling.document_converter import DocumentConverter; print('Downloading Docling models...'); converter = DocumentConverter(); print('Done')" && \
84
- echo "Model cache summary:" && \
85
- du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
86
- du -sh /home/user/.cache/torch 2>/dev/null || echo " Torch cache: (empty)" && \
87
  du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
88
- echo "========== Docling models downloaded =========="
89
 
90
  # Copy application code
91
  COPY --chown=user:user . .
@@ -95,13 +99,12 @@ RUN echo "========== STEP 6: Finalizing build ==========" && \
95
  echo "Files in app directory:" && ls -la /home/user/app/ && \
96
  echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="
97
 
98
- # Expose the port
99
  EXPOSE 7860
100
 
101
- # Health check (longer start-period for vLLM model loading)
102
- HEALTHCHECK --interval=30s --timeout=30s --start-period=600s --retries=5 \
103
  CMD curl -f http://localhost:7860/ || exit 1
104
 
105
- # Override vLLM entrypoint and use our startup script
106
- ENTRYPOINT []
107
  CMD ["/bin/bash", "/home/user/app/start.sh"]
 
1
+ # Hugging Face Spaces Dockerfile for PaddleOCR-VL Document Parser API
2
+ # GPU-accelerated document parsing with PaddleOCR-VL-1.5 + PaddlePaddle
3
+ # Build: v5.0.0 - PaddleOCR-VL for high-quality OCR on Nvidia T4
4
+ #
5
+ # NOTE: Run with --shm-size 16g for PaddlePaddle shared memory:
6
+ # docker build -t hf-docling .
7
+ # docker run --gpus all --shm-size 16g -p 7860:7860 -e API_TOKEN=test hf-docling
8
 
9
+ # CUDA 12.6 runtime with cuDNN (required by PaddlePaddle GPU)
10
+ FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
11
 
12
  USER root
13
 
 
16
  # Install system dependencies
17
  RUN echo "========== STEP 1: Installing system dependencies ==========" && \
18
  apt-get update && apt-get install -y --no-install-recommends \
19
+ # Python 3.11
20
+ python3.11 \
21
+ python3.11-venv \
22
+ python3.11-dev \
23
+ python3-pip \
24
  # Fonts for document rendering
25
  fonts-noto-core \
26
  fonts-noto-cjk \
27
  fontconfig \
28
+ # Image processing (required by OpenCV)
29
  libgl1 \
30
  libglib2.0-0 \
31
+ # PDF utilities (required by pdf2image)
32
  poppler-utils \
33
  # Health checks
34
  curl \
35
  && fc-cache -fv && \
36
+ # Set python3.11 as default python3/python
37
+ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
38
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
39
  rm -rf /var/lib/apt/lists/* && \
40
  echo "========== System dependencies installed =========="
41
 
 
45
  # Set environment variables
46
  ENV PYTHONUNBUFFERED=1 \
47
  PYTHONDONTWRITEBYTECODE=1 \
 
 
 
 
 
48
  IMAGES_SCALE=2.0 \
49
  MAX_FILE_SIZE_MB=1024 \
50
  HF_HOME=/home/user/.cache/huggingface \
 
51
  XDG_CACHE_HOME=/home/user/.cache \
52
  HOME=/home/user \
53
+ PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
 
54
 
55
  # Create cache directories with correct ownership
56
  RUN echo "========== STEP 2: Creating cache directories ==========" && \
57
  mkdir -p /home/user/.cache/huggingface \
58
+ /home/user/.cache/paddleocr \
59
  /home/user/app && \
60
  chown -R user:user /home/user && \
61
  echo "========== Cache directories created =========="
 
67
  # Copy requirements first for better caching
68
  COPY --chown=user:user requirements.txt .
69
 
70
+ # Install PaddlePaddle GPU (must be installed before paddleocr)
71
+ RUN echo "========== STEP 3: Installing PaddlePaddle GPU ==========" && \
72
+ python -m pip install --user --upgrade pip && \
73
+ python -m pip install --user paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ && \
74
+ echo "PaddlePaddle version:" && \
75
+ python -c "import paddle; print(paddle.__version__); print('CUDA:', paddle.is_compiled_with_cuda())" && \
76
+ echo "========== PaddlePaddle GPU installed =========="
77
+
78
+ # Install Python dependencies from requirements.txt
79
+ RUN echo "========== STEP 4: Installing Python dependencies ==========" && \
80
+ python -m pip install --user -r requirements.txt && \
81
  echo "Installed packages:" && \
82
  pip list --user && \
83
  echo "========== Python dependencies installed =========="
84
 
85
+ # Pre-download PaddleOCR-VL-1.5 model at build time (avoids download on first request)
86
+ RUN echo "========== STEP 5: Pre-downloading PaddleOCR-VL-1.5 model ==========" && \
87
+ python -c "from paddleocr import PaddleOCRVL; PaddleOCRVL()" && \
88
  echo "Model cache summary:" && \
89
+ du -sh /home/user/.cache/paddleocr 2>/dev/null || echo " PaddleOCR cache: (empty)" && \
90
  du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
 
 
 
 
 
 
 
 
91
  du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
92
+ echo "========== PaddleOCR-VL-1.5 model downloaded =========="
93
 
94
  # Copy application code
95
  COPY --chown=user:user . .
 
99
  echo "Files in app directory:" && ls -la /home/user/app/ && \
100
  echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="
101
 
102
+ # Expose the port (HF Spaces standard)
103
  EXPOSE 7860
104
 
105
+ # Health check
106
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=120s --retries=5 \
107
  CMD curl -f http://localhost:7860/ || exit 1
108
 
109
+ # Single-process FastAPI app (no vLLM sidecar needed)
 
110
  CMD ["/bin/bash", "/home/user/app/start.sh"]
app.py CHANGED
@@ -1,1512 +1,54 @@
1
  """
2
- Docling VLM Parser API v4.0.0
3
-
4
- A FastAPI service using a VLM + Gemini hybrid architecture for document parsing:
5
- Pass 1 (GPU): Qwen3-VL via vLLM β€” concurrent OCR on ALL pages (fast text extraction)
6
- Detect: Identify pages with tables from VLM markdown output
7
- Pass 2 (API): Gemini 2.5 Flash on table pages ONLY (superior table extraction)
8
- Merge: VLM text for non-table pages + Gemini output for table pages
9
- Post: Cross-page artifact removal, table cleanup, deduplication
10
-
11
- v4.0.0 β€” Gemini table extraction:
12
- - Quality: Gemini 2.5 Flash replaces Docling TableFormer for table pages
13
- - Quality: Table pages use Gemini's full output (text + tables) for best quality
14
- - Speed: No more CPU-bound Docling pipeline β€” Gemini API is fast
15
- - Quality: DPI 200 for clear page images sent to Gemini
16
- - Quality: Post-processing removes cross-page artifacts, deduplicates, cleans tables
17
  """
18
 
19
  import asyncio
20
- import base64
21
- import io
22
- import ipaddress
23
- import logging
24
- import os
25
  import re
26
- import secrets
27
  import shutil
28
- import socket
29
  import tempfile
30
  import time
31
- import zipfile
32
- from concurrent.futures import ThreadPoolExecutor, as_completed
33
  from contextlib import asynccontextmanager
34
  from pathlib import Path
35
- from typing import BinaryIO, Optional, Union
36
- from urllib.parse import urlparse
37
  from uuid import uuid4
38
 
39
- import cv2
40
  import httpx
41
- import torch
42
  from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
43
- from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
44
- from pdf2image import convert_from_path
45
- from pydantic import BaseModel
46
-
47
- # Docling imports
48
- from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
49
- from docling.datamodel.base_models import InputFormat
50
- from docling.datamodel.document import PictureItem, TableItem
51
- from docling.datamodel.pipeline_options import (
52
- AcceleratorOptions,
53
- PdfPipelineOptions,
54
- RapidOcrOptions,
55
- TableFormerMode,
56
- )
57
- from docling.document_converter import DocumentConverter, PdfFormatOption
58
-
59
- # Configure logging
60
- logging.basicConfig(
61
- level=logging.INFO,
62
- format="%(asctime)s | %(levelname)-8s | %(message)s",
63
- datefmt="%Y-%m-%d %H:%M:%S",
64
- )
65
- logger = logging.getLogger("docling-parser")
66
-
67
- # Security
68
- API_TOKEN = os.getenv("API_TOKEN")
69
- security = HTTPBearer()
70
-
71
-
72
- def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
73
- """Verify the API token from Authorization header."""
74
- if not API_TOKEN:
75
- raise HTTPException(
76
- status_code=500,
77
- detail="No API token configured on server",
78
- )
79
-
80
- token = credentials.credentials
81
- if not secrets.compare_digest(token, API_TOKEN):
82
- raise HTTPException(
83
- status_code=401,
84
- detail="Invalid API token",
85
- )
86
- return token
87
-
88
-
89
- # VLM Configuration
90
- VLM_MODEL = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-30B-A3B-Instruct")
91
- VLM_HOST = os.getenv("VLM_HOST", "127.0.0.1")
92
- VLM_PORT = os.getenv("VLM_PORT", "8000")
93
- IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
94
- MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
95
- MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
96
- VLM_TIMEOUT = float(os.getenv("VLM_TIMEOUT", "300"))
97
- VLM_CONCURRENCY = int(os.getenv("VLM_CONCURRENCY", "4"))
98
- RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
99
-
100
- # Gemini API Configuration (for table page extraction)
101
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
102
- GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
103
- GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
104
- GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "4"))
105
-
106
- # Blocked hostnames for SSRF protection
107
- BLOCKED_HOSTNAMES = {
108
- "localhost",
109
- "metadata",
110
- "metadata.google.internal",
111
- "metadata.google",
112
- "169.254.169.254",
113
- "fd00:ec2::254",
114
- }
115
-
116
- # Global converter instance (initialized on startup)
117
- _converter: Optional[DocumentConverter] = None
118
-
119
-
120
- def _get_device() -> str:
121
- """Get the best available device for processing."""
122
- if torch.cuda.is_available():
123
- return "cuda"
124
- elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
125
- return "mps"
126
- return "cpu"
127
-
128
-
129
- def _validate_url(url: str) -> None:
130
- """Validate URL to prevent SSRF attacks."""
131
- try:
132
- parsed = urlparse(url)
133
- except Exception as e:
134
- raise HTTPException(
135
- status_code=400,
136
- detail=f"Invalid URL format: {str(e)}",
137
- )
138
-
139
- if parsed.scheme not in ("http", "https"):
140
- raise HTTPException(
141
- status_code=400,
142
- detail=f"Invalid URL scheme '{parsed.scheme}'. Only http and https are allowed.",
143
- )
144
-
145
- hostname = parsed.hostname
146
- if not hostname:
147
- raise HTTPException(
148
- status_code=400,
149
- detail="Invalid URL: missing hostname.",
150
- )
151
-
152
- hostname_lower = hostname.lower()
153
- if hostname_lower in BLOCKED_HOSTNAMES:
154
- raise HTTPException(
155
- status_code=400,
156
- detail="Access to internal/metadata services is not allowed.",
157
- )
158
-
159
- blocked_patterns = ["metadata", "internal", "localhost", "127.0.0.1", "::1"]
160
- for pattern in blocked_patterns:
161
- if pattern in hostname_lower:
162
- raise HTTPException(
163
- status_code=400,
164
- detail="Access to internal/metadata services is not allowed.",
165
- )
166
-
167
- try:
168
- ip_str = socket.gethostbyname(hostname)
169
- ip = ipaddress.ip_address(ip_str)
170
- except socket.gaierror:
171
- raise HTTPException(
172
- status_code=400,
173
- detail=f"Could not resolve hostname: {hostname}",
174
- )
175
- except ValueError as e:
176
- raise HTTPException(
177
- status_code=400,
178
- detail=f"Invalid IP address resolved: {str(e)}",
179
- )
180
-
181
- if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
182
- raise HTTPException(
183
- status_code=400,
184
- detail="Access to private/internal IP addresses is not allowed.",
185
- )
186
-
187
-
188
- def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
189
- """Sync helper to save uploaded file to disk."""
190
- with open(input_path, "wb") as f:
191
- shutil.copyfileobj(file_obj, f)
192
-
193
-
194
- def _save_downloaded_content(input_path: Path, content: bytes) -> None:
195
- """Sync helper to save downloaded content to disk."""
196
- with open(input_path, "wb") as f:
197
- f.write(content)
198
-
199
-
200
- # ---------------------------------------------------------------------------
201
- # Pydantic Models
202
- # ---------------------------------------------------------------------------
203
-
204
-
205
- class ParseResponse(BaseModel):
206
- """Response model for document parsing."""
207
-
208
- success: bool
209
- markdown: Optional[str] = None
210
- json_content: Optional[Union[dict, list]] = None
211
- images_zip: Optional[str] = None
212
- image_count: int = 0
213
- error: Optional[str] = None
214
- pages_processed: int = 0
215
- device_used: Optional[str] = None
216
- vlm_model: Optional[str] = None
217
-
218
-
219
- class HealthResponse(BaseModel):
220
- """Health check response."""
221
-
222
- status: str
223
- version: str
224
- device: str
225
- gpu_name: Optional[str] = None
226
- vlm_model: str = ""
227
- vlm_status: str = "unknown"
228
- images_scale: float = 2.0
229
-
230
-
231
- class URLParseRequest(BaseModel):
232
- """Request model for URL-based parsing."""
233
-
234
- url: str
235
- output_format: str = "markdown"
236
- images_scale: Optional[float] = None
237
- start_page: int = 0
238
- end_page: Optional[int] = None
239
- include_images: bool = False
240
-
241
-
242
- # ---------------------------------------------------------------------------
243
- # OpenCV Image Preprocessing (CLAHE only β€” fast)
244
- # ---------------------------------------------------------------------------
245
-
246
-
247
- def _preprocess_image_for_ocr(image_path: str) -> str:
248
- """Enhance image quality for better OCR accuracy.
249
-
250
- Applies CLAHE contrast enhancement only (fast).
251
- Denoising was removed in v3.2.1 β€” it added ~10s/page with minimal
252
- benefit for VLM-based OCR which handles noise well.
253
- """
254
- img = cv2.imread(image_path)
255
- if img is None:
256
- return image_path
257
-
258
- # CLAHE contrast enhancement on L channel
259
- lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
260
- l, a, b = cv2.split(lab)
261
- clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
262
- l = clahe.apply(l)
263
- lab = cv2.merge([l, a, b])
264
- img = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
265
-
266
- cv2.imwrite(image_path, img)
267
- return image_path
268
-
269
-
270
- # ---------------------------------------------------------------------------
271
- # VLM OCR with retry
272
- # ---------------------------------------------------------------------------
273
-
274
- # Strip Qwen3 <think>...</think> reasoning blocks
275
- _THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
276
-
277
- # Post-processing patterns for VLM output cleanup
278
- _CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
279
- _CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
280
- _HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
281
- _PAGE_N_PATTERN = re.compile(r"^\s*Page\s+\d+\s*$\n?", re.MULTILINE)
282
-
283
-
284
- def _clean_vlm_output(content: str) -> str:
285
- """Post-process VLM output to clean artifacts.
286
-
287
- Removes: code fences, HTML comments, 'Page N' artifacts,
288
- and converts any remaining LaTeX tables to markdown format.
289
- """
290
- # Strip <think> blocks
291
- content = _THINK_PATTERN.sub("", content).strip()
292
-
293
- # Strip code fence wrappers
294
- content = _CODE_FENCE_PATTERN.sub("", content)
295
- content = _CODE_FENCE_END.sub("", content)
296
-
297
- # Strip HTML comments (VLM sometimes adds coordinate annotations)
298
- content = _HTML_COMMENT_PATTERN.sub("", content)
299
-
300
- # Strip "Page N" artifacts
301
- content = _PAGE_N_PATTERN.sub("", content)
302
-
303
- # Fix escaped quotes (VLM sometimes escapes them unnecessarily)
304
- content = content.replace('\\"', '"')
305
-
306
- # Convert LaTeX tables to markdown if VLM ignores the prompt
307
- content = _convert_latex_tables_to_markdown(content)
308
-
309
- return content.strip()
310
 
311
-
312
- def _convert_latex_tables_to_markdown(text: str) -> str:
313
- """Convert LaTeX tabular environments to markdown pipe tables."""
314
- latex_pattern = re.compile(
315
- r"\\begin\{tabular\}\{[^}]*\}(.*?)\\end\{tabular\}", re.DOTALL
316
- )
317
-
318
- def _latex_to_md(match: re.Match) -> str:
319
- body = match.group(1)
320
- # Remove \hline
321
- body = re.sub(r"\\hline\s*", "", body)
322
- # Split on \\
323
- rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
324
- if not rows:
325
- return match.group(0)
326
-
327
- md_rows = []
328
- for i, row in enumerate(rows):
329
- cells = [c.strip() for c in row.split("&")]
330
- md_row = "| " + " | ".join(cells) + " |"
331
- md_rows.append(md_row)
332
- if i == 0:
333
- # Add separator after header
334
- sep = "| " + " | ".join(["---"] * len(cells)) + " |"
335
- md_rows.append(sep)
336
-
337
- return "\n".join(md_rows)
338
-
339
- return latex_pattern.sub(_latex_to_md, text)
340
-
341
-
342
- # ---------------------------------------------------------------------------
343
- # Post-Processing: Cross-page artifact removal (applied AFTER page merge)
344
- # ---------------------------------------------------------------------------
345
-
346
- # Day-of-week date lines (e.g., "Thursday, October 31, 2024")
347
- _STANDALONE_DATE = re.compile(
348
- r"^\s*(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
349
- r"(?:January|February|March|April|May|June|July|August|September|"
350
- r"October|November|December)\s+\d{1,2},\s+\d{4}\s*$",
351
- re.MULTILINE,
352
- )
353
- # Standalone time (e.g., "11:30 AM")
354
- _STANDALONE_TIME = re.compile(r"^\s*\d{1,2}:\d{2}\s*(?:AM|PM)\s*$", re.MULTILINE)
355
- # Page footer patterns: "N | address" or "N address N" (e.g., "2 | 8575 W Golf Rd, Niles, IL 60714 | 3")
356
- _PAGE_FOOTER = re.compile(
357
- r"^\s*\d{1,3}\s*\|?\s*\d{2,5}\s+\w.*(?:Rd|St|Ave|Blvd|Dr|Ln|Way|Ct)\b.*\d{5}.*$",
358
- re.MULTILINE,
359
- )
360
- # Standalone page number lines (e.g., "12" alone on a line)
361
- _STANDALONE_PAGE_NUM = re.compile(r"^\s*\d{1,3}\s*$", re.MULTILINE)
362
- # Numbered section pattern: "N. TITLE" where N is 1-99 and TITLE is mostly uppercase
363
- _NUMBERED_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][A-Z\s\-/&,]+(?:\.\s*)?)")
364
- # Table row with ALL empty cells (e.g., "| | | | |")
365
- _EMPTY_TABLE_ROW = re.compile(r"^\|(?:\s*\|)+\s*$", re.MULTILINE)
366
- # Trailing empty cells in a table row (e.g., "| data | data | | | |")
367
- _TRAILING_EMPTY_CELLS = re.compile(r"(?:\s*\|\s*){2,}\s*$")
368
- # Table separator row (e.g., "|---|---|---|")
369
- _TABLE_SEP_ROW = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$")
370
-
371
-
372
- def _post_process_merged_markdown(content: str) -> str:
373
- """Post-process merged multi-page markdown to fix cross-page artifacts.
374
-
375
- Applied after all pages are concatenated. Fixes:
376
- - Duplicate document headings (VLM re-extracts page headers)
377
- - Duplicate short metadata lines (subtitles, dates repeated per page)
378
- - Page footer/header artifacts (standalone dates, times, page numbers)
379
- - Numbered section heading normalization (consistent ## levels)
380
- - Table artifacts (empty rows, trailing empty cells)
381
- - Cross-page table continuations (merge split tables)
382
- - Excessive whitespace
383
- """
384
- content = _deduplicate_headings(content)
385
- content = _deduplicate_short_blocks(content)
386
- content = _remove_page_boundary_artifacts(content)
387
- content = _normalize_numbered_headings(content)
388
- content = _clean_table_artifacts(content)
389
- content = _merge_split_tables(content)
390
- # Normalize runs of 4+ newlines to 3
391
- content = re.sub(r"\n{4,}", "\n\n\n", content)
392
- return content.strip()
393
-
394
-
395
- def _deduplicate_headings(content: str) -> str:
396
- """Remove duplicate heading lines, keeping only the first occurrence.
397
-
398
- When VLM processes each page, it may re-extract page headers/document titles.
399
- This removes exact duplicate headings while preserving table rows and body text.
400
- """
401
- lines = content.split("\n")
402
- seen_headings: set[str] = set()
403
- result: list[str] = []
404
-
405
- for line in lines:
406
- stripped = line.strip()
407
- if stripped.startswith("#"):
408
- # Normalize heading for comparison (lowercase, strip trailing #)
409
- key = stripped.lstrip("#").strip().lower()
410
- if key and key in seen_headings:
411
- continue # Skip duplicate heading
412
- if key:
413
- seen_headings.add(key)
414
- result.append(line)
415
-
416
- return "\n".join(result)
417
-
418
-
419
- def _deduplicate_short_blocks(content: str) -> str:
420
- """Remove duplicate short text blocks that repeat across pages.
421
-
422
- When VLM processes each page, it may re-extract document subtitles,
423
- metadata lines, and other short repeating text. This removes exact
424
- duplicates of short non-table blocks (< 120 chars).
425
- """
426
- blocks = content.split("\n\n")
427
- seen: set[str] = set()
428
- result: list[str] = []
429
-
430
- for block in blocks:
431
- stripped = block.strip()
432
- if not stripped:
433
- result.append(block)
434
- continue
435
-
436
- # Only deduplicate short, non-table, non-heading blocks
437
- is_table = stripped.startswith("|") and "|" in stripped[1:]
438
- is_heading = stripped.startswith("#")
439
- if is_table or is_heading or len(stripped) > 120:
440
- result.append(block)
441
- continue
442
-
443
- key = stripped.lower()
444
- if key in seen:
445
- continue # Skip duplicate short block
446
-
447
- seen.add(key)
448
- result.append(block)
449
-
450
- return "\n\n".join(result)
451
-
452
-
453
- def _remove_page_boundary_artifacts(content: str) -> str:
454
- """Remove page footer/header artifacts like standalone dates, times, page numbers, and footers."""
455
- content = _STANDALONE_DATE.sub("", content)
456
- content = _STANDALONE_TIME.sub("", content)
457
- content = _PAGE_FOOTER.sub("", content)
458
- content = _STANDALONE_PAGE_NUM.sub("", content)
459
- return content
460
-
461
-
462
- def _normalize_numbered_headings(content: str) -> str:
463
- """Normalize numbered section headings to consistent ## level.
464
-
465
- VLM inconsistently formats numbered sections like "3. OCCUPANCY" β€”
466
- some get ## headings, some are plain text. This detects the pattern
467
- and ensures all numbered sections at the same level use ## headings.
468
- """
469
- lines = content.split("\n")
470
- result: list[str] = []
471
-
472
- # First pass: detect which numbered sections exist and their heading status
473
- sections_with_heading: set[int] = set()
474
- sections_without_heading: set[int] = set()
475
-
476
- for line in lines:
477
- stripped = line.strip()
478
- # Already a heading like "## 3. OCCUPANCY"
479
- heading_match = re.match(r"^#{1,3}\s+(\d{1,2})\.\s+[A-Z]", stripped)
480
- if heading_match:
481
- sections_with_heading.add(int(heading_match.group(1)))
482
- continue
483
- # Plain text like "3. OCCUPANCY. Tenant shall..."
484
- plain_match = _NUMBERED_SECTION.match(stripped)
485
- if plain_match:
486
- sections_without_heading.add(int(plain_match.group(1)))
487
-
488
- # If there's a mix of headed and non-headed numbered sections, normalize
489
- if sections_with_heading and sections_without_heading:
490
- for i, line in enumerate(lines):
491
- stripped = line.strip()
492
- # Check if this is a non-headed numbered section that should be a heading
493
- plain_match = _NUMBERED_SECTION.match(stripped)
494
- if plain_match:
495
- section_num = int(plain_match.group(1))
496
- if section_num in sections_without_heading:
497
- # Check that it looks like a section start (followed by text)
498
- # Split at the first sentence end to make the heading
499
- # Extract just "N. TITLE." as heading, keep body text
500
- title_end = plain_match.end()
501
- title = stripped[:title_end].rstrip(".")
502
- body = stripped[title_end:].strip()
503
- if body:
504
- result.append(f"## {title}")
505
- result.append(body)
506
- else:
507
- result.append(f"## {title}")
508
- continue
509
- result.append(line)
510
- else:
511
- result = lines
512
-
513
- return "\n".join(result)
514
-
515
-
516
- def _clean_table_artifacts(content: str) -> str:
517
- """Clean table formatting artifacts.
518
-
519
- - Removes table rows where ALL cells are empty
520
- - Strips trailing empty cells from table rows
521
- - Removes orphaned separator rows not preceded by a header
522
- """
523
- lines = content.split("\n")
524
- result: list[str] = []
525
-
526
- for i, line in enumerate(lines):
527
- stripped = line.strip()
528
-
529
- # Skip completely empty table rows (| | | | |)
530
- if _EMPTY_TABLE_ROW.match(stripped):
531
- continue
532
-
533
- # Clean trailing empty cells from table data rows
534
- if stripped.startswith("|") and "|" in stripped[1:]:
535
- # Don't touch separator rows
536
- if not _TABLE_SEP_ROW.match(stripped):
537
- # Remove trailing empty cells
538
- cleaned = _TRAILING_EMPTY_CELLS.sub(" |", stripped)
539
- result.append(cleaned)
540
- continue
541
-
542
- result.append(line)
543
-
544
- return "\n".join(result)
545
-
546
-
547
- def _is_table_line(line: str) -> bool:
548
- """Check if a line is a markdown table row or separator."""
549
- s = line.strip()
550
- return bool(s.startswith("|") and s.endswith("|") and s.count("|") >= 3)
551
-
552
-
553
- def _count_columns(line: str) -> int:
554
- """Count the number of columns in a table row."""
555
- s = line.strip()
556
- if not s.startswith("|"):
557
- return 0
558
- # Split by | and count non-boundary segments
559
- parts = s.split("|")
560
- # First and last are empty strings from leading/trailing |
561
- return max(0, len(parts) - 2)
562
-
563
-
564
- def _merge_split_tables(content: str) -> str:
565
- """Merge table continuations that were split across pages.
566
-
567
- Detects when non-table content (whitespace, duplicate metadata) separates
568
- what should be a single table, and merges the data rows.
569
- """
570
- lines = content.split("\n")
571
- result: list[str] = []
572
- i = 0
573
-
574
- while i < len(lines):
575
- result.append(lines[i])
576
- i += 1
577
-
578
- # Check if we just appended a table row and the next chunk looks like
579
- # a table continuation (another table with similar column count)
580
- if not _is_table_line(result[-1]):
581
- continue
582
-
583
- last_table_cols = _count_columns(result[-1])
584
- if last_table_cols < 2:
585
- continue
586
-
587
- # Look ahead past empty lines / short non-table lines
588
- j = i
589
- gap_lines: list[str] = []
590
- while j < len(lines):
591
- s = lines[j].strip()
592
- if s == "":
593
- gap_lines.append(lines[j])
594
- j += 1
595
- continue
596
- break
597
-
598
- if j >= len(lines):
599
- continue
600
-
601
- # Check if the next non-empty line starts a table
602
- if not _is_table_line(lines[j]):
603
- continue
604
-
605
- next_table_cols = _count_columns(lines[j])
606
-
607
- # If column counts are close (within 30%), it's likely a continuation
608
- if last_table_cols < 2 or next_table_cols < 2:
609
- continue
610
- ratio = min(last_table_cols, next_table_cols) / max(last_table_cols, next_table_cols)
611
- if ratio < 0.7:
612
- continue
613
-
614
- # Check if the new table starts with header + separator (indicating
615
- # the VLM re-extracted headers on the next page)
616
- has_new_header = False
617
- if _is_table_line(lines[j]):
618
- # Look for a separator row in the next 1-2 lines
619
- for k in range(j + 1, min(j + 3, len(lines))):
620
- if _TABLE_SEP_ROW.match(lines[k].strip()):
621
- has_new_header = True
622
- break
623
-
624
- if has_new_header:
625
- # Skip the gap, skip the duplicate header + separator, keep data rows
626
- # Find the separator row
627
- skip_to = j
628
- while skip_to < len(lines):
629
- if _TABLE_SEP_ROW.match(lines[skip_to].strip()):
630
- skip_to += 1 # Skip past separator
631
- break
632
- skip_to += 1
633
- i = skip_to
634
- else:
635
- # No header β€” just skip the gap and append the continuation rows
636
- i = j
637
-
638
- return "\n".join(result)
639
-
640
-
641
- def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
642
- """Send a page image to Qwen3-VL via vLLM for text extraction.
643
-
644
- Includes retry logic: on timeout/failure, retries once with longer timeout.
645
- Strips <think> reasoning tokens from Qwen3 output.
646
- """
647
- b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
648
-
649
- payload = {
650
- "model": VLM_MODEL,
651
- "messages": [
652
- {
653
- "role": "user",
654
- "content": [
655
- {
656
- "type": "image_url",
657
- "image_url": {"url": f"data:image/png;base64,{b64_image}"},
658
- },
659
- {
660
- "type": "text",
661
- "text": (
662
- "Convert this document page to markdown format.\n\n"
663
- "Rules:\n"
664
- "- Extract ALL text content exactly as written\n"
665
- "- Use ## headings for section titles\n"
666
- "- Preserve lists, paragraphs, and document structure\n"
667
- "- For tables:\n"
668
- " * Read EVERY column header exactly as printed β€” do NOT skip, rename, or reorder columns\n"
669
- " * Include ALL columns even if the table is very wide\n"
670
- " * Format as markdown tables with | delimiters and --- separator rows\n"
671
- " * Each data row must have the same number of cells as the header\n"
672
- " * NEVER use LaTeX (no \\begin{tabular}, no \\hline, no &)\n"
673
- "- NEVER wrap output in code fences (no ```)\n"
674
- "- NEVER add HTML comments or coordinate annotations\n"
675
- "- Do NOT include page headers, footers, page numbers, or timestamps that repeat on every page\n"
676
- "- For handwritten text, transcribe as accurately as possible\n"
677
- "- Output ONLY the extracted markdown content, nothing else"
678
- ),
679
- },
680
- ],
681
- }
682
- ],
683
- "max_tokens": 32768,
684
- "temperature": 0.1,
685
- # Disable Qwen3 thinking mode to avoid <think> tokens
686
- "chat_template_kwargs": {"enable_thinking": False},
687
- }
688
-
689
- url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
690
-
691
- # Try with primary timeout, then retry once with extended timeout
692
- for attempt, timeout in enumerate([VLM_TIMEOUT, VLM_TIMEOUT * 1.5], start=1):
693
- try:
694
- response = httpx.post(url, json=payload, timeout=timeout)
695
- if response.status_code != 200:
696
- try:
697
- err = response.json()
698
- msg = err.get("message", err.get("detail", str(err)[:300]))
699
- except Exception:
700
- msg = response.text[:300]
701
- logger.error(f"[{request_id}] vLLM error ({response.status_code}) page {page_no}: {msg}")
702
- if attempt == 1:
703
- logger.info(f"[{request_id}] Retrying page {page_no}...")
704
- continue
705
- response.raise_for_status()
706
-
707
- result = response.json()
708
- choices = result.get("choices")
709
- if not choices:
710
- raise ValueError("vLLM returned no choices")
711
- content = choices[0].get("message", {}).get("content")
712
- if content is None:
713
- raise ValueError("vLLM response missing content")
714
-
715
- # Clean VLM output (strip think blocks, code fences, HTML comments, convert LaTeX tables)
716
- content = _clean_vlm_output(content)
717
-
718
- return content
719
-
720
- except (httpx.TimeoutException, httpx.ConnectError) as e:
721
- if attempt == 1:
722
- logger.warning(
723
- f"[{request_id}] VLM attempt {attempt} failed on page {page_no}: {e}. Retrying..."
724
- )
725
- continue
726
- raise
727
-
728
- raise RuntimeError(f"VLM failed after 2 attempts on page {page_no}")
729
-
730
-
731
- def _vlm_extract_tables(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> Optional[str]:
732
- """Send a page image to VLM with a table-focused prompt for better table extraction.
733
-
734
- Used as a second pass on pages where tables were detected in the first pass.
735
- Returns extracted tables as markdown, or None on failure.
736
- """
737
- b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
738
-
739
- payload = {
740
- "model": VLM_MODEL,
741
- "messages": [
742
- {
743
- "role": "user",
744
- "content": [
745
- {
746
- "type": "image_url",
747
- "image_url": {"url": f"data:image/png;base64,{b64_image}"},
748
- },
749
- {
750
- "type": "text",
751
- "text": (
752
- "Extract ONLY the tables from this document page as markdown.\n\n"
753
- "Rules:\n"
754
- "- Read every column header EXACTLY as printed on the page\n"
755
- "- Include ALL columns β€” do NOT skip any, even if the table is very wide\n"
756
- "- Each data row must have the same number of | cells as the header row\n"
757
- "- Use | delimiters and --- separator rows\n"
758
- "- Preserve all numbers, text, and formatting exactly\n"
759
- "- Add spaces between words β€” never concatenate (e.g., 'CAP Rate' not 'CAPRate')\n"
760
- "- If multiple tables exist, separate them with a blank line\n"
761
- "- Include a short heading (## or ###) before each table if one is visible\n"
762
- "- NEVER use LaTeX table syntax\n"
763
- "- Output ONLY the markdown tables, nothing else"
764
- ),
765
- },
766
- ],
767
- }
768
- ],
769
- "max_tokens": 32768,
770
- "temperature": 0.1,
771
- "chat_template_kwargs": {"enable_thinking": False},
772
- }
773
-
774
- url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
775
-
776
- try:
777
- response = httpx.post(url, json=payload, timeout=VLM_TIMEOUT)
778
- if response.status_code != 200:
779
- logger.warning(f"[{request_id}] Table re-prompt failed for page {page_no}: {response.status_code}")
780
- return None
781
-
782
- result = response.json()
783
- choices = result.get("choices")
784
- if not choices:
785
- return None
786
- content = choices[0].get("message", {}).get("content")
787
- if content is None:
788
- return None
789
-
790
- content = _clean_vlm_output(content)
791
- return content if content.strip() else None
792
-
793
- except Exception as e:
794
- logger.warning(f"[{request_id}] Table re-prompt error for page {page_no}: {e}")
795
- return None
796
-
797
-
798
- # ---------------------------------------------------------------------------
799
- # Table Detection from VLM Output
800
- # ---------------------------------------------------------------------------
801
-
802
- # Markdown table separator: | --- | --- | or |:---:|---:|
803
- _MD_TABLE_SEPARATOR = re.compile(
804
- r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE
805
  )
806
-
807
- # LaTeX table markers (fallback if VLM ignores markdown instruction)
808
- _LATEX_TABLE_PATTERN = re.compile(r"\\begin\{tabular\}")
809
-
810
-
811
- def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
812
- """Detect pages containing tables from VLM markdown output.
813
-
814
- Checks for both markdown table separators and LaTeX tabular markers.
815
- """
816
- table_pages: set[int] = set()
817
- for page_no, text in vlm_page_texts.items():
818
- if text and (
819
- _MD_TABLE_SEPARATOR.search(text) or _LATEX_TABLE_PATTERN.search(text)
820
- ):
821
- table_pages.add(page_no)
822
- return table_pages
823
-
824
-
825
- # ---------------------------------------------------------------------------
826
- # Gemini API: Table Page Extraction
827
- # ---------------------------------------------------------------------------
828
-
829
-
830
- def _gemini_extract_page(
831
- page_image_bytes: bytes, request_id: str = "", page_no: int = 0
832
- ) -> Optional[str]:
833
- """Send a page image to Gemini 2.5 Flash for high-quality extraction.
834
-
835
- Used for table pages where VLM output is insufficient.
836
- Returns the full page markdown (text + tables), or None on failure.
837
- """
838
- if not GEMINI_API_KEY:
839
- logger.warning(f"[{request_id}] GEMINI_API_KEY not set β€” skipping Gemini extraction")
840
- return None
841
-
842
- b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
843
-
844
- payload = {
845
- "contents": [
846
- {
847
- "parts": [
848
- {
849
- "inline_data": {
850
- "mime_type": "image/png",
851
- "data": b64_image,
852
- }
853
- },
854
- {
855
- "text": (
856
- "Convert this document page to clean markdown format.\n\n"
857
- "Rules:\n"
858
- "- Extract ALL text content exactly as written\n"
859
- "- Use ## headings for section titles\n"
860
- "- Preserve lists, paragraphs, and document structure\n"
861
- "- For tables:\n"
862
- " * Read EVERY column header exactly as printed\n"
863
- " * Include ALL columns even if the table is very wide\n"
864
- " * Format as markdown tables with | delimiters and --- separator rows\n"
865
- " * Each data row must have the same number of cells as the header\n"
866
- " * Preserve multi-line cell content on separate lines within the cell\n"
867
- "- Do NOT wrap output in code fences\n"
868
- "- Do NOT add image descriptions or [Image:] tags\n"
869
- "- Do NOT include page headers, footers, or page numbers\n"
870
- "- Output ONLY the extracted markdown content"
871
- ),
872
- },
873
- ],
874
- }
875
- ],
876
- "generationConfig": {
877
- "temperature": 0.1,
878
- "maxOutputTokens": 32768,
879
- },
880
- }
881
-
882
- url = (
883
- f"https://generativelanguage.googleapis.com/v1beta/models/"
884
- f"{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
885
- )
886
-
887
- for attempt in range(1, 3):
888
- try:
889
- timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
890
- response = httpx.post(url, json=payload, timeout=timeout)
891
-
892
- if response.status_code == 429:
893
- # Rate limited β€” wait and retry
894
- logger.warning(
895
- f"[{request_id}] Gemini rate limited on page {page_no + 1}, "
896
- f"attempt {attempt}. Waiting 5s..."
897
- )
898
- time.sleep(5)
899
- continue
900
-
901
- if response.status_code != 200:
902
- try:
903
- err = response.json()
904
- msg = str(err.get("error", {}).get("message", str(err)[:300]))
905
- except Exception:
906
- msg = response.text[:300]
907
- logger.error(
908
- f"[{request_id}] Gemini error ({response.status_code}) "
909
- f"page {page_no + 1}: {msg}"
910
- )
911
- if attempt == 1:
912
- continue
913
- return None
914
-
915
- result = response.json()
916
- candidates = result.get("candidates", [])
917
- if not candidates:
918
- logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
919
- return None
920
-
921
- parts = candidates[0].get("content", {}).get("parts", [])
922
- if not parts:
923
- return None
924
-
925
- content = parts[0].get("text", "")
926
-
927
- # Clean up: strip code fences if Gemini wraps output
928
- content = _CODE_FENCE_PATTERN.sub("", content)
929
- content = _CODE_FENCE_END.sub("", content)
930
-
931
- return content.strip() if content.strip() else None
932
-
933
- except (httpx.TimeoutException, httpx.ConnectError) as e:
934
- if attempt == 1:
935
- logger.warning(
936
- f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}. Retrying..."
937
- )
938
- continue
939
- logger.error(f"[{request_id}] Gemini failed after 2 attempts on page {page_no + 1}: {e}")
940
- return None
941
-
942
- return None
943
-
944
-
945
- # ---------------------------------------------------------------------------
946
- # Mini-PDF Extraction (pypdf) β€” kept for fallback Docling path
947
- # ---------------------------------------------------------------------------
948
-
949
-
950
- def _extract_pages_to_pdf(
951
- input_path: Path, page_numbers: list[int], request_id: str
952
- ) -> tuple[Path, dict[int, int]]:
953
- """Extract specific pages from a PDF into a mini-PDF using pypdf.
954
-
955
- Args:
956
- input_path: Path to the original PDF
957
- page_numbers: 0-indexed page numbers to extract
958
- request_id: Request ID for logging
959
-
960
- Returns:
961
- (mini_pdf_path, page_map) where page_map maps Docling 1-indexed
962
- page numbers in the mini-PDF back to 0-indexed original page numbers.
963
- """
964
- from pypdf import PdfReader, PdfWriter
965
-
966
- reader = PdfReader(str(input_path))
967
- writer = PdfWriter()
968
-
969
- # page_map: {docling_page_no (1-indexed in mini-PDF) β†’ original_page_no (0-indexed)}
970
- page_map: dict[int, int] = {}
971
-
972
- for idx, orig_page in enumerate(sorted(page_numbers)):
973
- if orig_page < len(reader.pages):
974
- writer.add_page(reader.pages[orig_page])
975
- page_map[idx + 1] = orig_page # Docling uses 1-indexed pages
976
- else:
977
- logger.warning(
978
- f"[{request_id}] Page {orig_page} out of range (total: {len(reader.pages)})"
979
- )
980
-
981
- mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
982
- with open(mini_pdf_path, "wb") as f:
983
- writer.write(f)
984
-
985
- logger.info(
986
- f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original"
987
- )
988
- return mini_pdf_path, page_map
989
-
990
-
991
- # ---------------------------------------------------------------------------
992
- # Table Extraction from Docling
993
- # ---------------------------------------------------------------------------
994
-
995
-
996
- def _extract_table_markdowns(doc, page_map: dict[int, int]) -> dict[int, list[str]]:
997
- """Extract table markdown from Docling document, keyed by ORIGINAL page number.
998
-
999
- Uses page_map to translate from Docling's 1-indexed mini-PDF pages
1000
- back to the original 0-indexed page numbers.
1001
- """
1002
- tables_by_page: dict[int, list[str]] = {}
1003
- for element, _ in doc.iterate_items():
1004
- if isinstance(element, TableItem):
1005
- docling_page = element.prov[0].page_no if element.prov else -1
1006
- # Translate mini-PDF page β†’ original page
1007
- orig_page = page_map.get(docling_page, docling_page - 1)
1008
- table_md = element.export_to_markdown(doc=doc)
1009
- if orig_page not in tables_by_page:
1010
- tables_by_page[orig_page] = []
1011
- tables_by_page[orig_page].append(table_md)
1012
- return tables_by_page
1013
-
1014
-
1015
- def _extract_docling_page_markdown(doc, page_map: dict[int, int]) -> dict[int, str]:
1016
- """Extract complete per-page markdown from Docling document.
1017
-
1018
- Returns dict mapping ORIGINAL page numbers (0-indexed) to complete markdown
1019
- content including text, headings, and tables as Docling understands them.
1020
- This is used as the PRIMARY output for table pages, replacing the VLM text
1021
- entirely for better table structure.
1022
- """
1023
- pages: dict[int, list[str]] = {}
1024
-
1025
- for element, _ in doc.iterate_items():
1026
- if not element.prov:
1027
- continue
1028
- docling_page = element.prov[0].page_no
1029
- orig_page = page_map.get(docling_page, docling_page - 1)
1030
-
1031
- md = element.export_to_markdown(doc=doc)
1032
- if md and md.strip():
1033
- if orig_page not in pages:
1034
- pages[orig_page] = []
1035
- pages[orig_page].append(md)
1036
-
1037
- return {pg: "\n\n".join(parts) for pg, parts in pages.items()}
1038
-
1039
-
1040
- # ---------------------------------------------------------------------------
1041
- # Merge: VLM Text + TableFormer Tables
1042
- # ---------------------------------------------------------------------------
1043
-
1044
- # Consecutive lines with | delimiters (markdown tables)
1045
- _VLM_TABLE_BLOCK = re.compile(r"((?:^\|[^\n]+\|$\n?)+)", re.MULTILINE)
1046
-
1047
- # LaTeX table blocks
1048
- _VLM_LATEX_BLOCK = re.compile(
1049
- r"(\\begin\{tabular\}.*?\\end\{tabular\})", re.DOTALL
1050
  )
1051
 
1052
 
1053
- def _extract_table_blocks(text: str) -> list[str]:
1054
- """Extract individual table blocks from markdown text.
1055
-
1056
- Returns a list of table block strings (header + separator + data rows).
1057
- """
1058
- tables: list[str] = []
1059
- md_matches = list(_VLM_TABLE_BLOCK.finditer(text))
1060
- latex_matches = list(_VLM_LATEX_BLOCK.finditer(text))
1061
-
1062
- # Combine and deduplicate by position
1063
- all_matches = [(m.start(), m.end(), m.group(0)) for m in md_matches]
1064
- all_matches += [(m.start(), m.end(), m.group(0)) for m in latex_matches]
1065
- all_matches.sort(key=lambda x: x[0])
1066
-
1067
- last_end = -1
1068
- for start, end, content in all_matches:
1069
- if start >= last_end:
1070
- tables.append(content.strip())
1071
- last_end = end
1072
-
1073
- return tables
1074
-
1075
-
1076
- def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list[str]) -> str:
1077
- """Replace VLM's table sections with more accurate tables.
1078
-
1079
- Handles both markdown pipe tables and LaTeX tabular blocks in VLM output.
1080
- Used for both TableFormer tables (Pass 2) and re-prompted VLM tables (Pass 1.5).
1081
- """
1082
- if not table_markdowns:
1083
- return vlm_text
1084
-
1085
- # Find all table blocks (markdown first, then LaTeX)
1086
- md_tables = list(_VLM_TABLE_BLOCK.finditer(vlm_text))
1087
- latex_tables = list(_VLM_LATEX_BLOCK.finditer(vlm_text))
1088
-
1089
- # Combine and sort all table positions
1090
- all_tables = [(m.start(), m.end(), "md") for m in md_tables]
1091
- all_tables += [(m.start(), m.end(), "latex") for m in latex_tables]
1092
- all_tables.sort(key=lambda x: x[0])
1093
-
1094
- # Remove overlapping matches (prefer earlier match)
1095
- filtered: list[tuple[int, int, str]] = []
1096
- last_end = -1
1097
- for start, end, kind in all_tables:
1098
- if start >= last_end:
1099
- filtered.append((start, end, kind))
1100
- last_end = end
1101
-
1102
- vlm_table_count = len(filtered)
1103
- tf_table_count = len(table_markdowns)
1104
-
1105
- if vlm_table_count != tf_table_count:
1106
- logger.warning(
1107
- f"Table count mismatch: VLM={vlm_table_count}, TableFormer={tf_table_count}. "
1108
- f"Using positional replacement for min({vlm_table_count}, {tf_table_count}) tables."
1109
- )
1110
-
1111
- # Replace VLM tables with TableFormer tables (positional)
1112
- result_parts: list[str] = []
1113
- prev_end = 0
1114
- table_idx = 0
1115
-
1116
- for start, end, kind in filtered:
1117
- result_parts.append(vlm_text[prev_end:start])
1118
- if table_idx < tf_table_count:
1119
- result_parts.append(table_markdowns[table_idx].strip() + "\n")
1120
- table_idx += 1
1121
- else:
1122
- # More VLM tables than TableFormer β€” keep VLM version
1123
- result_parts.append(vlm_text[start:end])
1124
- prev_end = end
1125
-
1126
- result_parts.append(vlm_text[prev_end:])
1127
-
1128
- # If there are remaining TableFormer tables not matched, append them
1129
- while table_idx < tf_table_count:
1130
- result_parts.append("\n\n" + table_markdowns[table_idx].strip() + "\n")
1131
- table_idx += 1
1132
-
1133
- return "".join(result_parts)
1134
-
1135
-
1136
- # ---------------------------------------------------------------------------
1137
- # PDF to Page Images (parallel, optimized)
1138
- # ---------------------------------------------------------------------------
1139
-
1140
-
1141
- def _render_single_page(
1142
- input_path: Path, page_idx: int, dpi: int
1143
- ) -> tuple[int, Optional[bytes]]:
1144
- """Render a single PDF page to PNG bytes with CLAHE preprocessing.
1145
-
1146
- Returns (page_idx, png_bytes) or (page_idx, None) on failure.
1147
- """
1148
- try:
1149
- images = convert_from_path(
1150
- str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
1151
- )
1152
- if not images:
1153
- return page_idx, None
1154
-
1155
- img = images[0]
1156
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
1157
- tmp_path = tmp.name
1158
- img.save(tmp_path, format="PNG")
1159
-
1160
- try:
1161
- _preprocess_image_for_ocr(tmp_path)
1162
- with open(tmp_path, "rb") as f:
1163
- return page_idx, f.read()
1164
- finally:
1165
- os.unlink(tmp_path)
1166
- except Exception as e:
1167
- logger.warning(f"Failed to render page {page_idx + 1}: {e}")
1168
- return page_idx, None
1169
-
1170
-
1171
- def _pdf_to_page_images(
1172
- input_path: Path,
1173
- request_id: str,
1174
- start_page: int = 0,
1175
- end_page: Optional[int] = None,
1176
- ) -> list[tuple[int, bytes]]:
1177
- """Convert PDF pages to PNG image bytes using parallel rendering.
1178
-
1179
- Uses ThreadPoolExecutor for concurrent page rendering.
1180
- Returns list of (page_no, png_bytes) tuples, sorted by page number.
1181
- """
1182
- try:
1183
- from pdf2image.pdf2image import pdfinfo_from_path
1184
-
1185
- info = pdfinfo_from_path(str(input_path))
1186
- total_pages = info["Pages"]
1187
- last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
1188
- except Exception as e:
1189
- logger.warning(f"[{request_id}] Could not get PDF info: {e}")
1190
- return []
1191
-
1192
- page_indices = list(range(start_page, last_page))
1193
-
1194
- start_time = time.time()
1195
- page_images: list[tuple[int, bytes]] = []
1196
-
1197
- # Render pages in parallel (4 threads β€” I/O bound, not CPU bound for poppler)
1198
- with ThreadPoolExecutor(max_workers=4) as executor:
1199
- futures = {
1200
- executor.submit(_render_single_page, input_path, idx, RENDER_DPI): idx
1201
- for idx in page_indices
1202
- }
1203
- for future in as_completed(futures):
1204
- page_idx, png_bytes = future.result()
1205
- if png_bytes is not None:
1206
- page_images.append((page_idx, png_bytes))
1207
-
1208
- page_images.sort(key=lambda x: x[0])
1209
- render_time = time.time() - start_time
1210
- logger.info(
1211
- f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
1212
- f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
1213
- )
1214
- return page_images
1215
-
1216
-
1217
- # ---------------------------------------------------------------------------
1218
- # Docling Converter (for TableFormer only)
1219
- # ---------------------------------------------------------------------------
1220
-
1221
-
1222
- def _create_converter(images_scale: float = 2.0) -> DocumentConverter:
1223
- """Create a Docling converter with Standard Pipeline.
1224
-
1225
- Used ONLY for TableFormer on table pages (not for full document OCR).
1226
- """
1227
- device = _get_device()
1228
- logger.info(f"Creating converter with device: {device}")
1229
-
1230
- pipeline_options = PdfPipelineOptions()
1231
- pipeline_options.do_ocr = True
1232
- pipeline_options.do_table_structure = True
1233
- pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
1234
- pipeline_options.table_structure_options.do_cell_matching = True
1235
-
1236
- pipeline_options.ocr_options = RapidOcrOptions()
1237
- pipeline_options.ocr_options.force_full_page_ocr = True
1238
-
1239
- pipeline_options.generate_page_images = True
1240
- pipeline_options.images_scale = images_scale
1241
- pipeline_options.generate_picture_images = True
1242
-
1243
- pipeline_options.accelerator_options = AcceleratorOptions(
1244
- device=device,
1245
- num_threads=0 if device == "cuda" else 4,
1246
- )
1247
-
1248
- converter = DocumentConverter(
1249
- format_options={
1250
- InputFormat.PDF: PdfFormatOption(
1251
- pipeline_options=pipeline_options,
1252
- backend=DoclingParseV4DocumentBackend,
1253
- )
1254
- }
1255
- )
1256
- return converter
1257
-
1258
-
1259
- def _get_converter() -> DocumentConverter:
1260
- """Get or create the global converter instance."""
1261
- global _converter
1262
- if _converter is None:
1263
- _converter = _create_converter(images_scale=IMAGES_SCALE)
1264
- return _converter
1265
-
1266
-
1267
- # ---------------------------------------------------------------------------
1268
- # VLM-First Conversion (Pass 1: VLM, Pass 2: TableFormer, Merge)
1269
- # ---------------------------------------------------------------------------
1270
-
1271
-
1272
- def _convert_document(
1273
- input_path: Path,
1274
- output_dir: Path,
1275
- images_scale: float,
1276
- include_images: bool,
1277
- request_id: str,
1278
- start_page: int = 0,
1279
- end_page: Optional[int] = None,
1280
- ) -> tuple:
1281
- """
1282
- VLM-first hybrid conversion.
1283
-
1284
- Pass 1 (GPU): VLM OCR on ALL pages (fast, concurrent)
1285
- Detect: Find table pages from VLM markdown output
1286
- Pass 2 (CPU): Docling TableFormer ONLY on table pages (mini-PDF)
1287
- Merge: VLM text for all pages + TableFormer tables
1288
-
1289
- Returns: (markdown_content, json_content, pages_processed, image_count)
1290
- """
1291
- overall_start = time.time()
1292
-
1293
- # ---- RENDER ALL PAGES ----
1294
- page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
1295
-
1296
- if not page_images:
1297
- logger.warning(f"[{request_id}] No page images β€” falling back to full Docling pipeline")
1298
- return _convert_document_full_docling(
1299
- input_path, output_dir, images_scale, include_images, request_id
1300
- )
1301
-
1302
- render_time = time.time() - overall_start
1303
-
1304
- # ---- PASS 1: VLM OCR ALL PAGES (GPU, concurrent) ----
1305
- logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
1306
- logger.info(f"[{request_id}] Sending {len(page_images)} pages to VLM ({VLM_CONCURRENCY} concurrent)")
1307
-
1308
- vlm_page_texts: dict[int, Optional[str]] = {}
1309
- vlm_start = time.time()
1310
-
1311
- with ThreadPoolExecutor(max_workers=VLM_CONCURRENCY) as executor:
1312
- future_to_page = {
1313
- executor.submit(_vlm_ocr_page, page_bytes, request_id, page_no + 1): page_no
1314
- for page_no, page_bytes in page_images
1315
- }
1316
- for future in as_completed(future_to_page):
1317
- page_no = future_to_page[future]
1318
- try:
1319
- vlm_text = future.result()
1320
- vlm_page_texts[page_no] = vlm_text
1321
- logger.info(
1322
- f"[{request_id}] VLM processed page {page_no + 1} ({len(vlm_text)} chars)"
1323
- )
1324
- except Exception as e:
1325
- logger.warning(f"[{request_id}] VLM failed on page {page_no + 1}: {e}")
1326
- vlm_page_texts[page_no] = None
1327
-
1328
- vlm_time = time.time() - vlm_start
1329
- logger.info(f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)")
1330
-
1331
- # ---- DETECT TABLE PAGES ----
1332
- table_pages = _detect_table_pages(vlm_page_texts)
1333
-
1334
- if table_pages:
1335
- logger.info(
1336
- f"[{request_id}] Tables detected on {len(table_pages)} pages: "
1337
- f"{sorted(p + 1 for p in table_pages)}"
1338
- )
1339
- else:
1340
- logger.info(f"[{request_id}] No tables detected β€” skipping table re-prompting")
1341
-
1342
- # ---- PASS 2: GEMINI 2.5 FLASH ON TABLE PAGES ----
1343
- gemini_page_texts: dict[int, str] = {}
1344
- gemini_time = 0.0
1345
-
1346
- if table_pages and GEMINI_API_KEY:
1347
- logger.info(
1348
- f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(table_pages)} table pages"
1349
- )
1350
- gemini_start = time.time()
1351
-
1352
- # Build lookup: page_no β†’ image bytes
1353
- page_image_map = {pno: pbytes for pno, pbytes in page_images}
1354
-
1355
- with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
1356
- future_to_page = {
1357
- executor.submit(
1358
- _gemini_extract_page,
1359
- page_image_map[page_no],
1360
- request_id,
1361
- page_no,
1362
- ): page_no
1363
- for page_no in sorted(table_pages)
1364
- if page_no in page_image_map
1365
- }
1366
- for future in as_completed(future_to_page):
1367
- page_no = future_to_page[future]
1368
- try:
1369
- gemini_text = future.result()
1370
- if gemini_text:
1371
- gemini_page_texts[page_no] = gemini_text
1372
- logger.info(
1373
- f"[{request_id}] Gemini processed page {page_no + 1} "
1374
- f"({len(gemini_text)} chars)"
1375
- )
1376
- else:
1377
- logger.warning(
1378
- f"[{request_id}] Gemini returned empty for page {page_no + 1} "
1379
- f"β€” falling back to VLM"
1380
- )
1381
- except Exception as e:
1382
- logger.warning(
1383
- f"[{request_id}] Gemini failed on page {page_no + 1}: {e} "
1384
- f"β€” falling back to VLM"
1385
- )
1386
-
1387
- gemini_time = time.time() - gemini_start
1388
- logger.info(
1389
- f"[{request_id}] Pass 2 completed in {gemini_time:.2f}s β€” "
1390
- f"{len(gemini_page_texts)}/{len(table_pages)} table pages extracted via Gemini"
1391
- )
1392
- elif table_pages and not GEMINI_API_KEY:
1393
- logger.warning(
1394
- f"[{request_id}] GEMINI_API_KEY not set β€” table pages will use VLM output only"
1395
- )
1396
-
1397
- # ---- MERGE: VLM TEXT (non-table pages) + GEMINI (table pages) ----
1398
- md_parts: list[str] = []
1399
- image_count = 0
1400
-
1401
- for page_no in sorted(vlm_page_texts.keys()):
1402
- if md_parts:
1403
- md_parts.append("\n\n")
1404
-
1405
- if page_no in gemini_page_texts:
1406
- # Table page β€” use Gemini's superior output
1407
- md_parts.append(gemini_page_texts[page_no])
1408
- elif vlm_page_texts[page_no] is not None:
1409
- # Non-table page or Gemini fallback β€” use VLM output
1410
- md_parts.append(vlm_page_texts[page_no])
1411
- else:
1412
- md_parts.append(f"[Page {page_no + 1}: extraction failed]\n\n")
1413
-
1414
- markdown_content = "".join(md_parts)
1415
-
1416
- # Post-process: fix cross-page artifacts, deduplicate headers, clean tables
1417
- if len(vlm_page_texts) > 1:
1418
- markdown_content = _post_process_merged_markdown(markdown_content)
1419
-
1420
- pages_processed = len(vlm_page_texts)
1421
- total_time = time.time() - overall_start
1422
-
1423
- logger.info(
1424
- f"[{request_id}] VLM+Gemini conversion complete: {pages_processed} pages β€” "
1425
- f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
1426
- f"Gemini {gemini_time:.1f}s = {total_time:.2f}s total"
1427
- )
1428
- if pages_processed > 0:
1429
- logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
1430
-
1431
- return markdown_content, None, pages_processed, image_count
1432
-
1433
-
1434
- def _convert_document_full_docling(
1435
- input_path: Path,
1436
- output_dir: Path,
1437
- images_scale: float,
1438
- include_images: bool,
1439
- request_id: str,
1440
- ) -> tuple:
1441
- """Fallback: full Docling pipeline when page images are unavailable."""
1442
- logger.info(f"[{request_id}] Fallback: running full Docling pipeline")
1443
- converter = _get_converter()
1444
-
1445
- start_time = time.time()
1446
- result = converter.convert(input_path)
1447
- doc = result.document
1448
- if doc is None:
1449
- raise ValueError("Docling failed to parse document")
1450
-
1451
- elapsed = time.time() - start_time
1452
- logger.info(f"[{request_id}] Full Docling pipeline completed in {elapsed:.2f}s")
1453
-
1454
- markdown_content = doc.export_to_markdown()
1455
- pages_processed = len(
1456
- set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
1457
- )
1458
-
1459
- image_count = 0
1460
- if include_images:
1461
- image_dir = output_dir / "images"
1462
- image_dir.mkdir(parents=True, exist_ok=True)
1463
- for element, _ in doc.iterate_items():
1464
- if isinstance(element, PictureItem):
1465
- if element.image and element.image.pil_image:
1466
- pg = element.prov[0].page_no if element.prov else 0
1467
- image_id = element.self_ref.split("/")[-1]
1468
- image_name = f"page_{pg}_{image_id}.png"
1469
- image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
1470
- image_path = image_dir / image_name
1471
- try:
1472
- element.image.pil_image.save(image_path, format="PNG")
1473
- image_count += 1
1474
- except Exception:
1475
- pass
1476
-
1477
- return markdown_content, None, pages_processed, image_count
1478
-
1479
-
1480
- # ---------------------------------------------------------------------------
1481
- # Images Zip Helper
1482
- # ---------------------------------------------------------------------------
1483
-
1484
-
1485
- def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
1486
- """Create a zip file from extracted images."""
1487
- image_dir = output_dir / "images"
1488
- if not image_dir.exists():
1489
- return None, 0
1490
-
1491
- image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
1492
- zip_buffer = io.BytesIO()
1493
- image_count = 0
1494
-
1495
- with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
1496
- for img_path in image_dir.glob("*"):
1497
- if img_path.is_file() and img_path.suffix.lower() in image_extensions:
1498
- try:
1499
- zf.write(img_path, f"images/{img_path.name}")
1500
- image_count += 1
1501
- except Exception as e:
1502
- logger.warning(f"Failed to add image {img_path} to zip: {e}")
1503
-
1504
- if image_count == 0:
1505
- return None, 0
1506
-
1507
- return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
1508
-
1509
-
1510
  # ---------------------------------------------------------------------------
1511
  # Application Lifespan
1512
  # ---------------------------------------------------------------------------
@@ -1514,23 +56,13 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
1514
 
1515
  @asynccontextmanager
1516
  async def lifespan(app: FastAPI):
1517
- """Startup: initialize Docling converter and check vLLM."""
1518
  logger.info("=" * 60)
1519
- logger.info("Starting Docling VLM Parser API v4.0.0...")
1520
-
1521
- device = _get_device()
1522
- logger.info(f"Device: {device}")
1523
-
1524
- if device == "cuda":
1525
- logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
1526
- logger.info(f"CUDA Version: {torch.version.cuda}")
1527
- logger.info(
1528
- f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB"
1529
- )
1530
 
1531
- logger.info(f"VLM Model: {VLM_MODEL}")
1532
- logger.info(f"VLM Endpoint: http://{VLM_HOST}:{VLM_PORT}")
1533
- logger.info(f"VLM Timeout: {VLM_TIMEOUT}s, Concurrency: {VLM_CONCURRENCY}")
1534
  logger.info(f"Render DPI: {RENDER_DPI}")
1535
  logger.info(f"Images scale: {IMAGES_SCALE}")
1536
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
@@ -1538,27 +70,8 @@ async def lifespan(app: FastAPI):
1538
  logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
1539
  logger.info(f"Gemini Timeout: {GEMINI_TIMEOUT}s, Concurrency: {GEMINI_CONCURRENCY}")
1540
 
1541
- # Verify vLLM is running
1542
- logger.info("Checking vLLM server...")
1543
- try:
1544
- async with httpx.AsyncClient(timeout=10) as client:
1545
- resp = await client.get(f"http://{VLM_HOST}:{VLM_PORT}/health")
1546
- resp.raise_for_status()
1547
- logger.info("vLLM server is healthy")
1548
- except Exception as e:
1549
- logger.error(f"vLLM server not available: {e}")
1550
- raise RuntimeError(f"vLLM server not available at {VLM_HOST}:{VLM_PORT}")
1551
-
1552
- # Pre-initialize Docling converter
1553
- logger.info("Pre-loading Docling models (DocLayNet + TableFormer + RapidOCR)...")
1554
- try:
1555
- _get_converter()
1556
- logger.info("Docling models loaded successfully")
1557
- except Exception as e:
1558
- logger.warning(f"Failed to pre-load Docling models: {e}")
1559
-
1560
  logger.info("=" * 60)
1561
- logger.info("Docling VLM Parser API ready (VLM + Gemini hybrid: Qwen3-VL + Gemini tables)")
1562
  logger.info("=" * 60)
1563
  yield
1564
  logger.info("Shutting down Docling VLM Parser API...")
@@ -1570,8 +83,8 @@ async def lifespan(app: FastAPI):
1570
 
1571
  app = FastAPI(
1572
  title="Docling VLM Parser API",
1573
- description="VLM + Gemini hybrid parser: Qwen3-VL text + Gemini 3 Flash tables",
1574
- version="4.0.0",
1575
  lifespan=lifespan,
1576
  )
1577
 
@@ -1584,23 +97,11 @@ app = FastAPI(
1584
  @app.get("/", response_model=HealthResponse)
1585
  async def health_check() -> HealthResponse:
1586
  """Health check endpoint."""
1587
- device = _get_device()
1588
-
1589
- vlm_status = "unknown"
1590
- try:
1591
- async with httpx.AsyncClient(timeout=5) as client:
1592
- resp = await client.get(f"http://{VLM_HOST}:{VLM_PORT}/health")
1593
- vlm_status = "healthy" if resp.status_code == 200 else "unhealthy"
1594
- except Exception:
1595
- vlm_status = "unreachable"
1596
-
1597
  return HealthResponse(
1598
  status="healthy",
1599
- version="4.0.0",
1600
- device=device,
1601
- gpu_name=None,
1602
- vlm_model=f"active (gemini: {'configured' if GEMINI_API_KEY else 'not set'})",
1603
- vlm_status=vlm_status,
1604
  images_scale=IMAGES_SCALE,
1605
  )
1606
 
@@ -1615,7 +116,7 @@ async def parse_document(
1615
  include_images: bool = Form(default=False, description="Include extracted images"),
1616
  _token: str = Depends(verify_token),
1617
  ) -> ParseResponse:
1618
- """Parse a document file using VLM-first hybrid pipeline."""
1619
  request_id = str(uuid4())[:8]
1620
  start_time = time.time()
1621
 
@@ -1654,9 +155,7 @@ async def parse_document(
1654
  detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
1655
  )
1656
 
1657
- use_images_scale = images_scale if images_scale is not None else IMAGES_SCALE
1658
-
1659
- logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1660
  logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
1661
 
1662
  temp_dir = tempfile.mkdtemp()
@@ -1672,7 +171,6 @@ async def parse_document(
1672
  _convert_document,
1673
  input_path,
1674
  output_dir,
1675
- use_images_scale,
1676
  include_images,
1677
  request_id,
1678
  start_page,
@@ -1699,8 +197,8 @@ async def parse_document(
1699
  images_zip=images_zip,
1700
  image_count=image_count,
1701
  pages_processed=pages_processed,
1702
- device_used=_get_device(),
1703
- vlm_model=VLM_MODEL,
1704
  )
1705
 
1706
  except Exception as e:
@@ -1722,7 +220,7 @@ async def parse_document_from_url(
1722
  request: URLParseRequest,
1723
  _token: str = Depends(verify_token),
1724
  ) -> ParseResponse:
1725
- """Parse a document from a URL using VLM-first hybrid pipeline."""
1726
  request_id = str(uuid4())[:8]
1727
  start_time = time.time()
1728
 
@@ -1782,9 +280,7 @@ async def parse_document_from_url(
1782
  output_dir = Path(temp_dir) / "output"
1783
  output_dir.mkdir(exist_ok=True)
1784
 
1785
- use_images_scale = request.images_scale if request.images_scale is not None else IMAGES_SCALE
1786
-
1787
- logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1788
  logger.info(
1789
  f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
1790
  )
@@ -1793,7 +289,6 @@ async def parse_document_from_url(
1793
  _convert_document,
1794
  input_path,
1795
  output_dir,
1796
- use_images_scale,
1797
  request.include_images,
1798
  request_id,
1799
  request.start_page,
@@ -1820,8 +315,8 @@ async def parse_document_from_url(
1820
  images_zip=images_zip,
1821
  image_count=image_count,
1822
  pages_processed=pages_processed,
1823
- device_used=_get_device(),
1824
- vlm_model=VLM_MODEL,
1825
  )
1826
 
1827
  except httpx.HTTPError as e:
 
1
  """
2
+ Docling VLM Parser API v5.0.0
3
+
4
+ A FastAPI service using a PaddleOCR-VL-1.5 + Gemini hybrid architecture for document parsing:
5
+ Pass 1 (GPU): PaddleOCR-VL-1.5 on full PDF (native document parsing, 0.9B params)
6
+ Pass 2 (API): Gemini 3 Flash on table pages only (highest quality tables)
7
+ Post: Cross-page artifact removal, table cleanup, deduplication, footer removal
8
+
9
+ v5.0.0 β€” PaddleOCR-VL-1.5 + Gemini hybrid:
10
+ - Core: PaddleOCR-VL-1.5 replaces Qwen3-VL + Docling entirely
11
+ - Quality: Gemini 3 Flash used ONLY for pages with tables (better table accuracy)
12
+ - Speed: PaddleOCR handles PDF natively β€” no separate image rendering for OCR
13
+ - GPU: Runs on T4 (16GB VRAM) β€” much smaller than A100 requirement
14
+ - Quality: Enhanced post-processing β€” aggressive footer/artifact removal
 
 
15
  """
16
 
17
  import asyncio
 
 
 
 
 
18
  import re
 
19
  import shutil
 
20
  import tempfile
21
  import time
 
 
22
  from contextlib import asynccontextmanager
23
  from pathlib import Path
24
+ from typing import Optional
 
25
  from uuid import uuid4
26
 
 
27
  import httpx
 
28
  from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ from auth import _validate_url, verify_token
31
+ from config import (
32
+ GEMINI_API_KEY,
33
+ GEMINI_CONCURRENCY,
34
+ GEMINI_MODEL,
35
+ GEMINI_TIMEOUT,
36
+ IMAGES_SCALE,
37
+ MAX_FILE_SIZE_BYTES,
38
+ MAX_FILE_SIZE_MB,
39
+ RENDER_DPI,
40
+ logger,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  )
42
+ from models import HealthResponse, ParseResponse, URLParseRequest
43
+ from pipeline import (
44
+ _convert_document,
45
+ _create_images_zip,
46
+ _get_pipeline,
47
+ _save_downloaded_content,
48
+ _save_uploaded_file,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  )
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # ---------------------------------------------------------------------------
53
  # Application Lifespan
54
  # ---------------------------------------------------------------------------
 
56
 
57
  @asynccontextmanager
58
  async def lifespan(app: FastAPI):
59
+ """Startup: initialize PaddleOCR-VL-1.5 pipeline."""
60
  logger.info("=" * 60)
61
+ logger.info("Starting Docling VLM Parser API v5.0.0...")
62
+ logger.info("Initializing PaddleOCR-VL-1.5 pipeline...")
63
+ _get_pipeline()
64
+ logger.info("PaddleOCR-VL-1.5 ready")
 
 
 
 
 
 
 
65
 
 
 
 
66
  logger.info(f"Render DPI: {RENDER_DPI}")
67
  logger.info(f"Images scale: {IMAGES_SCALE}")
68
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
 
70
  logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
71
  logger.info(f"Gemini Timeout: {GEMINI_TIMEOUT}s, Concurrency: {GEMINI_CONCURRENCY}")
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  logger.info("=" * 60)
74
+ logger.info("Docling VLM Parser API ready (PaddleOCR-VL-1.5 + Gemini hybrid)")
75
  logger.info("=" * 60)
76
  yield
77
  logger.info("Shutting down Docling VLM Parser API...")
 
83
 
84
  app = FastAPI(
85
  title="Docling VLM Parser API",
86
+ description="PaddleOCR-VL-1.5 + Gemini 3 Flash hybrid parser",
87
+ version="5.0.0",
88
  lifespan=lifespan,
89
  )
90
 
 
97
  @app.get("/", response_model=HealthResponse)
98
  async def health_check() -> HealthResponse:
99
  """Health check endpoint."""
 
 
 
 
 
 
 
 
 
 
100
  return HealthResponse(
101
  status="healthy",
102
+ version="5.0.0",
103
+ model="PaddleOCR-VL-1.5",
104
+ gemini_status="configured" if GEMINI_API_KEY else "not set",
 
 
105
  images_scale=IMAGES_SCALE,
106
  )
107
 
 
116
  include_images: bool = Form(default=False, description="Include extracted images"),
117
  _token: str = Depends(verify_token),
118
  ) -> ParseResponse:
119
+ """Parse a document file using PaddleOCR-VL-1.5 + Gemini hybrid pipeline."""
120
  request_id = str(uuid4())[:8]
121
  start_time = time.time()
122
 
 
155
  detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
156
  )
157
 
158
+ logger.info(f"[{request_id}] Model: PaddleOCR-VL-1.5")
 
 
159
  logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
160
 
161
  temp_dir = tempfile.mkdtemp()
 
171
  _convert_document,
172
  input_path,
173
  output_dir,
 
174
  include_images,
175
  request_id,
176
  start_page,
 
197
  images_zip=images_zip,
198
  image_count=image_count,
199
  pages_processed=pages_processed,
200
+ device_used="gpu",
201
+ vlm_model="PaddleOCR-VL-1.5",
202
  )
203
 
204
  except Exception as e:
 
220
  request: URLParseRequest,
221
  _token: str = Depends(verify_token),
222
  ) -> ParseResponse:
223
+ """Parse a document from a URL using PaddleOCR-VL-1.5 + Gemini hybrid pipeline."""
224
  request_id = str(uuid4())[:8]
225
  start_time = time.time()
226
 
 
280
  output_dir = Path(temp_dir) / "output"
281
  output_dir.mkdir(exist_ok=True)
282
 
283
+ logger.info(f"[{request_id}] Model: PaddleOCR-VL-1.5")
 
 
284
  logger.info(
285
  f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
286
  )
 
289
  _convert_document,
290
  input_path,
291
  output_dir,
 
292
  request.include_images,
293
  request_id,
294
  request.start_page,
 
315
  images_zip=images_zip,
316
  image_count=image_count,
317
  pages_processed=pages_processed,
318
+ device_used="gpu",
319
+ vlm_model="PaddleOCR-VL-1.5",
320
  )
321
 
322
  except httpx.HTTPError as e:
auth.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bearer token authentication and URL validation (SSRF protection)."""
2
+
3
+ import ipaddress
4
+ import secrets
5
+ import socket
6
+ from urllib.parse import urlparse
7
+
8
+ from fastapi import Depends, HTTPException
9
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
10
+
11
+ from config import API_TOKEN, BLOCKED_HOSTNAMES
12
+
13
+ security = HTTPBearer()
14
+
15
+
16
+ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
17
+ """Verify the API token from Authorization header."""
18
+ if not API_TOKEN:
19
+ raise HTTPException(
20
+ status_code=500,
21
+ detail="No API token configured on server",
22
+ )
23
+
24
+ token = credentials.credentials
25
+ if not secrets.compare_digest(token, API_TOKEN):
26
+ raise HTTPException(
27
+ status_code=401,
28
+ detail="Invalid API token",
29
+ )
30
+ return token
31
+
32
+
33
+ def _validate_url(url: str) -> None:
34
+ """Validate URL to prevent SSRF attacks."""
35
+ try:
36
+ parsed = urlparse(url)
37
+ except Exception as e:
38
+ raise HTTPException(
39
+ status_code=400,
40
+ detail=f"Invalid URL format: {str(e)}",
41
+ )
42
+
43
+ if parsed.scheme not in ("http", "https"):
44
+ raise HTTPException(
45
+ status_code=400,
46
+ detail=f"Invalid URL scheme '{parsed.scheme}'. Only http and https are allowed.",
47
+ )
48
+
49
+ hostname = parsed.hostname
50
+ if not hostname:
51
+ raise HTTPException(
52
+ status_code=400,
53
+ detail="Invalid URL: missing hostname.",
54
+ )
55
+
56
+ hostname_lower = hostname.lower()
57
+ if hostname_lower in BLOCKED_HOSTNAMES:
58
+ raise HTTPException(
59
+ status_code=400,
60
+ detail="Access to internal/metadata services is not allowed.",
61
+ )
62
+
63
+ blocked_patterns = ["metadata", "internal", "localhost", "127.0.0.1", "::1"]
64
+ for pattern in blocked_patterns:
65
+ if pattern in hostname_lower:
66
+ raise HTTPException(
67
+ status_code=400,
68
+ detail="Access to internal/metadata services is not allowed.",
69
+ )
70
+
71
+ try:
72
+ ip_str = socket.gethostbyname(hostname)
73
+ ip = ipaddress.ip_address(ip_str)
74
+ except socket.gaierror:
75
+ raise HTTPException(
76
+ status_code=400,
77
+ detail=f"Could not resolve hostname: {hostname}",
78
+ )
79
+ except ValueError as e:
80
+ raise HTTPException(
81
+ status_code=400,
82
+ detail=f"Invalid IP address resolved: {str(e)}",
83
+ )
84
+
85
+ if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
86
+ raise HTTPException(
87
+ status_code=400,
88
+ detail="Access to private/internal IP addresses is not allowed.",
89
+ )
config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration, environment variables, and logging setup for the Docling VLM Parser."""
2
+
3
+ import logging
4
+ import os
5
+
6
+ # Configure logging
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format="%(asctime)s | %(levelname)-8s | %(message)s",
10
+ datefmt="%Y-%m-%d %H:%M:%S",
11
+ )
12
+ logger = logging.getLogger("docling-parser")
13
+
14
+ # Security
15
+ API_TOKEN = os.getenv("API_TOKEN")
16
+
17
+ # Configuration
18
+ IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
19
+ MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
20
+ MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
21
+ RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
22
+
23
+ # Gemini API Configuration (table page enhancement)
24
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
25
+ GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
26
+ GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
27
+ GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "8"))
28
+
29
+ # Blocked hostnames for SSRF protection
30
+ BLOCKED_HOSTNAMES = {
31
+ "localhost",
32
+ "metadata",
33
+ "metadata.google.internal",
34
+ "metadata.google",
35
+ "169.254.169.254",
36
+ "fd00:ec2::254",
37
+ }
gemini.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gemini API extraction function for table page enhancement."""
2
+
3
+ import base64
4
+ import re
5
+ import time
6
+ from typing import Optional
7
+
8
+ import httpx
9
+
10
+ from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger
11
+
12
+ # Strip code fence wrappers (Gemini sometimes wraps output)
13
+ _CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
14
+ _CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
15
+
16
+
17
+ def _gemini_extract_page(
18
+ page_image_bytes: bytes, request_id: str = "", page_no: int = 0
19
+ ) -> Optional[str]:
20
+ """Send a page image to Gemini 2.5 Flash for high-quality extraction.
21
+
22
+ Used for table pages where PaddleOCR output is insufficient.
23
+ Returns the full page markdown (text + tables), or None on failure.
24
+ """
25
+ if not GEMINI_API_KEY:
26
+ logger.warning(f"[{request_id}] GEMINI_API_KEY not set β€” skipping Gemini extraction")
27
+ return None
28
+
29
+ b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
30
+
31
+ payload = {
32
+ "contents": [
33
+ {
34
+ "parts": [
35
+ {
36
+ "inline_data": {
37
+ "mime_type": "image/png",
38
+ "data": b64_image,
39
+ }
40
+ },
41
+ {
42
+ "text": (
43
+ "Convert this document page to clean markdown format.\n\n"
44
+ "Rules:\n"
45
+ "- Extract ALL text content exactly as written β€” do not paraphrase or summarize\n"
46
+ "- Use ## for main section headings and ### for subsection headings\n"
47
+ "- Preserve lists, paragraphs, bullet points, and document structure\n"
48
+ "- For tables:\n"
49
+ " * Read EVERY column header exactly as printed on the page\n"
50
+ " * Include ALL columns even if the table is very wide\n"
51
+ " * Format as markdown tables with | delimiters and --- separator rows\n"
52
+ " * Each data row MUST have the same number of | cells as the header row\n"
53
+ " * Preserve multi-line cell content β€” use <br> for line breaks within cells\n"
54
+ " * For financial/lease tables, preserve ALL numbers, dates, and terms exactly\n"
55
+ " * Add spaces between words β€” never concatenate (e.g., 'CAP Rate' not 'CAPRate')\n"
56
+ "- Do NOT wrap output in code fences (no ```)\n"
57
+ "- Do NOT add image descriptions, [Image:] tags, or describe visual elements\n"
58
+ "- Do NOT include page headers, footers, page numbers, or repeated branding\n"
59
+ "- Do NOT extract text from map images or photographs\n"
60
+ "- Output ONLY the extracted markdown content, nothing else"
61
+ ),
62
+ },
63
+ ],
64
+ }
65
+ ],
66
+ "generationConfig": {
67
+ "temperature": 0.1,
68
+ "maxOutputTokens": 32768,
69
+ },
70
+ }
71
+
72
+ url = (
73
+ f"https://generativelanguage.googleapis.com/v1beta/models/"
74
+ f"{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
75
+ )
76
+
77
+ for attempt in range(1, 3):
78
+ try:
79
+ timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
80
+ response = httpx.post(url, json=payload, timeout=timeout)
81
+
82
+ if response.status_code == 429:
83
+ # Rate limited β€” wait and retry
84
+ logger.warning(
85
+ f"[{request_id}] Gemini rate limited on page {page_no + 1}, "
86
+ f"attempt {attempt}. Waiting 5s..."
87
+ )
88
+ time.sleep(5)
89
+ continue
90
+
91
+ if response.status_code != 200:
92
+ try:
93
+ err = response.json()
94
+ msg = str(err.get("error", {}).get("message", str(err)[:300]))
95
+ except Exception:
96
+ msg = response.text[:300]
97
+ logger.error(
98
+ f"[{request_id}] Gemini error ({response.status_code}) "
99
+ f"page {page_no + 1}: {msg}"
100
+ )
101
+ if attempt == 1:
102
+ continue
103
+ return None
104
+
105
+ result = response.json()
106
+ candidates = result.get("candidates", [])
107
+ if not candidates:
108
+ logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
109
+ return None
110
+
111
+ parts = candidates[0].get("content", {}).get("parts", [])
112
+ if not parts:
113
+ return None
114
+
115
+ content = parts[0].get("text", "")
116
+
117
+ # Clean up: strip code fences if Gemini wraps output
118
+ content = _CODE_FENCE_PATTERN.sub("", content)
119
+ content = _CODE_FENCE_END.sub("", content)
120
+
121
+ return content.strip() if content.strip() else None
122
+
123
+ except (httpx.TimeoutException, httpx.ConnectError) as e:
124
+ if attempt == 1:
125
+ logger.warning(
126
+ f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}. Retrying..."
127
+ )
128
+ continue
129
+ logger.error(f"[{request_id}] Gemini failed after 2 attempts on page {page_no + 1}: {e}")
130
+ return None
131
+
132
+ return None
models.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic models for API request/response schemas."""
2
+
3
+ from typing import Optional, Union
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
+ class ParseResponse(BaseModel):
9
+ """Response model for document parsing."""
10
+
11
+ success: bool
12
+ markdown: Optional[str] = None
13
+ json_content: Optional[Union[dict, list]] = None
14
+ images_zip: Optional[str] = None
15
+ image_count: int = 0
16
+ error: Optional[str] = None
17
+ pages_processed: int = 0
18
+ device_used: Optional[str] = None
19
+ vlm_model: Optional[str] = None
20
+
21
+
22
+ class HealthResponse(BaseModel):
23
+ """Health check response."""
24
+
25
+ status: str
26
+ version: str
27
+ model: str
28
+ gemini_status: str = "unknown"
29
+ images_scale: float = 2.0
30
+
31
+
32
+ class URLParseRequest(BaseModel):
33
+ """Request model for URL-based parsing."""
34
+
35
+ url: str
36
+ output_format: str = "markdown"
37
+ images_scale: Optional[float] = None
38
+ start_page: int = 0
39
+ end_page: Optional[int] = None
40
+ include_images: bool = False
pipeline.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PaddleOCR-VL pipeline, hybrid conversion logic, and file helpers."""
2
+
3
+ import base64
4
+ import io
5
+ import re
6
+ import shutil
7
+ import time
8
+ import zipfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from pathlib import Path
11
+ from typing import BinaryIO, Optional
12
+
13
+ from paddleocr import PaddleOCRVL
14
+
15
+ from config import GEMINI_API_KEY, GEMINI_CONCURRENCY, GEMINI_MODEL, logger
16
+ from gemini import _gemini_extract_page
17
+ from postprocess import _post_process_merged_markdown
18
+ from rendering import _pdf_to_page_images
19
+
20
+ # Global PaddleOCR-VL pipeline instance
21
+ _pipeline = None
22
+
23
+
24
+ def _get_pipeline():
25
+ """Get or create the global PaddleOCR-VL-1.5 pipeline instance."""
26
+ global _pipeline
27
+ if _pipeline is None:
28
+ _pipeline = PaddleOCRVL()
29
+ return _pipeline
30
+
31
+
32
+ def _page_has_tables(result) -> bool:
33
+ """Check if PaddleOCR result contains table elements from layout analysis.
34
+
35
+ Uses layout detection labels and falls back to markdown pattern matching.
36
+ """
37
+ try:
38
+ # Try accessing layout detection results
39
+ if hasattr(result, 'json') and result.json:
40
+ json_data = result.json
41
+ if isinstance(json_data, dict):
42
+ for block in json_data.get('layout_det', []):
43
+ if block.get('label', '').lower() == 'table':
44
+ return True
45
+ # Fallback: check markdown content for table patterns
46
+ md = result.markdown
47
+ if isinstance(md, dict):
48
+ md_text = md.get('markdown_texts', '')
49
+ else:
50
+ md_text = str(md)
51
+ return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
52
+ except Exception:
53
+ return False
54
+
55
+
56
+ def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
57
+ """Sync helper to save uploaded file to disk."""
58
+ with open(input_path, "wb") as f:
59
+ shutil.copyfileobj(file_obj, f)
60
+
61
+
62
+ def _save_downloaded_content(input_path: Path, content: bytes) -> None:
63
+ """Sync helper to save downloaded content to disk."""
64
+ with open(input_path, "wb") as f:
65
+ f.write(content)
66
+
67
+
68
+ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
69
+ """Create a zip file from extracted images."""
70
+ image_dir = output_dir / "images"
71
+ if not image_dir.exists():
72
+ return None, 0
73
+
74
+ image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
75
+ zip_buffer = io.BytesIO()
76
+ image_count = 0
77
+
78
+ with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
79
+ for img_path in image_dir.glob("*"):
80
+ if img_path.is_file() and img_path.suffix.lower() in image_extensions:
81
+ try:
82
+ zf.write(img_path, f"images/{img_path.name}")
83
+ image_count += 1
84
+ except Exception as e:
85
+ logger.warning(f"Failed to add image {img_path} to zip: {e}")
86
+
87
+ if image_count == 0:
88
+ return None, 0
89
+
90
+ return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
91
+
92
+
93
+ def _convert_document(
94
+ input_path: Path,
95
+ output_dir: Path,
96
+ include_images: bool,
97
+ request_id: str,
98
+ start_page: int = 0,
99
+ end_page: Optional[int] = None,
100
+ ) -> tuple:
101
+ """
102
+ PaddleOCR-VL-1.5 + Gemini hybrid conversion.
103
+
104
+ Pass 1 (GPU): PaddleOCR-VL-1.5 on full PDF (native document parsing)
105
+ Detect: Find table pages from layout analysis
106
+ Pass 2 (API): Gemini 3 Flash ONLY on table pages (high-quality tables)
107
+ Merge: Gemini for table pages, PaddleOCR for everything else
108
+
109
+ Returns: (markdown_content, json_content, pages_processed, image_count)
110
+ """
111
+ overall_start = time.time()
112
+
113
+ # ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
114
+ pipeline = _get_pipeline()
115
+ paddle_start = time.time()
116
+ output = pipeline.predict(str(input_path))
117
+ paddle_time = time.time() - paddle_start
118
+
119
+ # Collect per-page markdown and detect table pages
120
+ page_markdowns = []
121
+ table_pages = set()
122
+ for i, res in enumerate(output):
123
+ md_data = res.markdown
124
+ page_markdowns.append(md_data)
125
+ # Check if this page has tables from layout analysis
126
+ if _page_has_tables(res):
127
+ table_pages.add(i)
128
+
129
+ logger.info(
130
+ f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
131
+ f"in {paddle_time:.2f}s β€” {len(table_pages)} table pages detected"
132
+ )
133
+
134
+ # ---- PASS 2: Gemini on table pages only ----
135
+ gemini_page_texts: dict[int, str] = {}
136
+ gemini_time = 0.0
137
+
138
+ if table_pages and GEMINI_API_KEY:
139
+ logger.info(
140
+ f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(table_pages)} table pages "
141
+ f"({GEMINI_CONCURRENCY} concurrent)"
142
+ )
143
+
144
+ # Render table page images for Gemini
145
+ page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
146
+ page_image_map = {pno: pbytes for pno, pbytes in page_images}
147
+
148
+ gemini_start = time.time()
149
+ with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
150
+ futures = {
151
+ executor.submit(
152
+ _gemini_extract_page, page_image_map[pno], request_id, pno
153
+ ): pno
154
+ for pno in table_pages
155
+ if pno in page_image_map
156
+ }
157
+ for future in as_completed(futures):
158
+ pno = futures[future]
159
+ try:
160
+ text = future.result()
161
+ if text:
162
+ gemini_page_texts[pno] = text
163
+ logger.info(
164
+ f"[{request_id}] Gemini processed table page {pno + 1} "
165
+ f"({len(text)} chars)"
166
+ )
167
+ except Exception as e:
168
+ logger.warning(f"[{request_id}] Gemini failed page {pno + 1}: {e}")
169
+
170
+ gemini_time = time.time() - gemini_start
171
+ logger.info(
172
+ f"[{request_id}] Pass 2 completed in {gemini_time:.2f}s β€” "
173
+ f"{len(gemini_page_texts)}/{len(table_pages)} table pages enhanced via Gemini"
174
+ )
175
+ elif table_pages and not GEMINI_API_KEY:
176
+ logger.warning(
177
+ f"[{request_id}] {len(table_pages)} table pages detected but GEMINI_API_KEY not set β€” "
178
+ f"using PaddleOCR output for tables"
179
+ )
180
+
181
+ # ---- MERGE: Gemini for table pages, PaddleOCR for others ----
182
+ md_parts: list[str] = []
183
+ for i, md_data in enumerate(page_markdowns):
184
+ if i in gemini_page_texts:
185
+ md_parts.append(gemini_page_texts[i])
186
+ else:
187
+ # Extract markdown text from PaddleOCR result
188
+ if isinstance(md_data, dict):
189
+ md_text = md_data.get("markdown_texts", "")
190
+ else:
191
+ md_text = str(md_data)
192
+ md_parts.append(md_text)
193
+
194
+ markdown_content = "\n\n".join(md_parts)
195
+
196
+ # Post-process: fix cross-page artifacts, deduplicate headers, clean tables
197
+ pages_processed = len(page_markdowns)
198
+ if pages_processed > 1:
199
+ markdown_content = _post_process_merged_markdown(markdown_content)
200
+
201
+ total_time = time.time() - overall_start
202
+
203
+ logger.info(
204
+ f"[{request_id}] v5.0.0 conversion complete: {pages_processed} pages β€” "
205
+ f"PaddleOCR {paddle_time:.1f}s + Gemini {gemini_time:.1f}s = {total_time:.2f}s total"
206
+ )
207
+ if pages_processed > 0:
208
+ logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
209
+
210
+ return markdown_content, None, pages_processed, 0
postprocess.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Post-processing functions and regex patterns for markdown cleanup."""
2
+
3
+ import re
4
+
5
+ # ---------------------------------------------------------------------------
6
+ # Post-processing regex patterns
7
+ # ---------------------------------------------------------------------------
8
+
9
+ # Day-of-week date lines (e.g., "Thursday, October 31, 2024")
10
+ _STANDALONE_DATE = re.compile(
11
+ r"^\s*(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
12
+ r"(?:January|February|March|April|May|June|July|August|September|"
13
+ r"October|November|December)\s+\d{1,2},\s+\d{4}\s*$",
14
+ re.MULTILINE,
15
+ )
16
+ # Standalone time (e.g., "11:30 AM")
17
+ _STANDALONE_TIME = re.compile(r"^\s*\d{1,2}:\d{2}\s*(?:AM|PM)\s*$", re.MULTILINE)
18
+ # Page footer patterns: "N | address" or "N address N" (e.g., "2 | 8575 W Golf Rd, Niles, IL 60714 | 3")
19
+ _PAGE_FOOTER = re.compile(
20
+ r"^\s*\d{1,3}\s*\|?\s*\d{2,5}\s+\w.*(?:Rd|St|Ave|Blvd|Dr|Ln|Way|Ct)\b.*\d{5}.*$",
21
+ re.MULTILINE,
22
+ )
23
+ # Standalone page number lines (e.g., "12" alone on a line)
24
+ _STANDALONE_PAGE_NUM = re.compile(r"^\s*\d{1,3}\s*$", re.MULTILINE)
25
+ # Branding footer lines: "Text | Text | N" or "Text | Text - Text N" pattern
26
+ # Matches lines with 2+ pipe-separated segments ending in a page number,
27
+ # where total line length > 30 chars (to avoid matching short legitimate text)
28
+ _BRANDING_FOOTER = re.compile(
29
+ r"^\s*[A-Za-z][^|]{5,}\|[^|]+\|?\s*\d{1,3}\s*$",
30
+ re.MULTILINE,
31
+ )
32
+ # Short repeated location lines that appear as page artifacts (e.g., "Niles, IL" alone)
33
+ _SHORT_LOCATION_LINE = re.compile(
34
+ r"^\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s*$", re.MULTILINE
35
+ )
36
+ # Numbered section pattern: "N. TITLE" where N is 1-99 and TITLE is mostly uppercase
37
+ _NUMBERED_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][A-Z\s\-/&,]+(?:\.\s*)?)")
38
+
39
+ # Table row with ALL empty cells (e.g., "| | | | |")
40
+ _EMPTY_TABLE_ROW = re.compile(r"^\|(?:\s*\|)+\s*$", re.MULTILINE)
41
+ # Trailing empty cells in a table row (e.g., "| data | data | | | |")
42
+ _TRAILING_EMPTY_CELLS = re.compile(r"(?:\s*\|\s*){2,}\s*$")
43
+ # Table separator row (e.g., "|---|---|---|")
44
+ _TABLE_SEP_ROW = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$")
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Post-Processing Functions
49
+ # ---------------------------------------------------------------------------
50
+
51
+
52
def _post_process_merged_markdown(content: str) -> str:
    """Post-process merged multi-page markdown to fix cross-page artifacts.

    Applied after all pages are concatenated. Fixes:
    - Duplicate document headings (re-extracted page headers)
    - Duplicate short metadata lines (subtitles, dates repeated per page)
    - Page footer/header artifacts (standalone dates, times, page numbers)
    - Numbered section heading normalization (consistent ## levels)
    - Table artifacts (empty rows, trailing empty cells)
    - Cross-page table continuations (merge split tables)
    - Excessive whitespace
    """
    # Cleanup steps run in a fixed order: dedup first, then artifact removal,
    # then structural normalization, table cleanup, and table merging.
    cleanup_steps = (
        _deduplicate_headings,
        _deduplicate_short_blocks,
        _remove_page_boundary_artifacts,
        _normalize_numbered_headings,
        _clean_table_artifacts,
        _merge_split_tables,
    )
    for step in cleanup_steps:
        content = step(content)

    # Collapse runs of 4+ newlines (left behind by removed lines) down to 3.
    content = re.sub(r"\n{4,}", "\n\n\n", content)
    return content.strip()
73
+
74
+
75
+ def _deduplicate_headings(content: str) -> str:
76
+ """Remove duplicate heading lines, keeping only the first occurrence.
77
+
78
+ When processing each page, page headers/document titles may be re-extracted.
79
+ This removes exact duplicate headings while preserving table rows and body text.
80
+ """
81
+ lines = content.split("\n")
82
+ seen_headings: set[str] = set()
83
+ result: list[str] = []
84
+
85
+ for line in lines:
86
+ stripped = line.strip()
87
+ if stripped.startswith("#"):
88
+ # Normalize heading for comparison (lowercase, strip trailing #)
89
+ key = stripped.lstrip("#").strip().lower()
90
+ if key and key in seen_headings:
91
+ continue # Skip duplicate heading
92
+ if key:
93
+ seen_headings.add(key)
94
+ result.append(line)
95
+
96
+ return "\n".join(result)
97
+
98
+
99
+ def _deduplicate_short_blocks(content: str) -> str:
100
+ """Remove duplicate short text blocks that repeat across pages.
101
+
102
+ When processing each page, document subtitles, metadata lines, and other
103
+ short repeating text may be re-extracted. This removes exact duplicates
104
+ of short non-table blocks (< 120 chars).
105
+ """
106
+ blocks = content.split("\n\n")
107
+ seen: set[str] = set()
108
+ result: list[str] = []
109
+
110
+ for block in blocks:
111
+ stripped = block.strip()
112
+ if not stripped:
113
+ result.append(block)
114
+ continue
115
+
116
+ # Only deduplicate short, non-table, non-heading blocks
117
+ is_table = stripped.startswith("|") and "|" in stripped[1:]
118
+ is_heading = stripped.startswith("#")
119
+ if is_table or is_heading or len(stripped) > 120:
120
+ result.append(block)
121
+ continue
122
+
123
+ key = stripped.lower()
124
+ if key in seen:
125
+ continue # Skip duplicate short block
126
+
127
+ seen.add(key)
128
+ result.append(block)
129
+
130
+ return "\n\n".join(result)
131
+
132
+
133
def _remove_page_boundary_artifacts(content: str) -> str:
    """Remove page footer/header artifacts like standalone dates, times, page numbers, and footers.

    Unconditional removals (every occurrence): day-of-week date lines,
    standalone times, "N | address"-style footers, and bare page-number
    lines.  Conditional removals (only when the same line appears 3+
    times, so a single legitimate occurrence survives): branding footers
    and short "City, ST" location lines.  Substituting with "" leaves
    blank lines behind; the caller collapses excess newlines afterwards.
    """
    content = _STANDALONE_DATE.sub("", content)
    content = _STANDALONE_TIME.sub("", content)
    content = _PAGE_FOOTER.sub("", content)
    content = _STANDALONE_PAGE_NUM.sub("", content)
    # Remove repeated patterns (only removed when they appear 3+ times)
    content = _remove_repeated_lines(content, _BRANDING_FOOTER, min_repeats=3)
    content = _remove_repeated_lines(content, _SHORT_LOCATION_LINE, min_repeats=3)
    return content
143
+
144
+
145
+ def _remove_repeated_lines(content: str, pattern: re.Pattern, min_repeats: int = 3) -> str:
146
+ """Remove lines matching a pattern that appear min_repeats+ times (clearly artifacts)."""
147
+ counts: dict[str, int] = {}
148
+ for m in pattern.finditer(content):
149
+ key = m.group(0).strip().lower()
150
+ counts[key] = counts.get(key, 0) + 1
151
+
152
+ repeated = {k for k, v in counts.items() if v >= min_repeats}
153
+ if not repeated:
154
+ return content
155
+
156
+ lines = content.split("\n")
157
+ result: list[str] = []
158
+ for line in lines:
159
+ if line.strip().lower() in repeated:
160
+ continue
161
+ result.append(line)
162
+ return "\n".join(result)
163
+
164
+
165
def _normalize_numbered_headings(content: str) -> str:
    """Normalize numbered section headings to consistent ## level.

    Inconsistently formatted numbered sections like "3. OCCUPANCY" —
    some get ## headings, some are plain text. This detects the pattern
    and, only when BOTH forms are present, promotes the plain-text form
    to a ## heading so the document outline is uniform. When the document
    is already consistent, content is returned unchanged.
    """
    lines = content.split("\n")

    # Matches lines that are already headings, e.g. "## 3. OCCUPANCY".
    # Hoisted out of the loop so it is compiled once.
    headed_re = re.compile(r"^#{1,3}\s+(\d{1,2})\.\s+[A-Z]")

    # First pass: detect which numbered sections exist and their heading status
    sections_with_heading: set[int] = set()
    sections_without_heading: set[int] = set()

    for line in lines:
        stripped = line.strip()
        heading_match = headed_re.match(stripped)
        if heading_match:
            sections_with_heading.add(int(heading_match.group(1)))
            continue
        # Plain text like "3. OCCUPANCY. Tenant shall..."
        plain_match = _NUMBERED_SECTION.match(stripped)
        if plain_match:
            sections_without_heading.add(int(plain_match.group(1)))

    # Only rewrite when headed and non-headed numbered sections are mixed.
    if not (sections_with_heading and sections_without_heading):
        return content

    result: list[str] = []
    for line in lines:
        stripped = line.strip()
        plain_match = _NUMBERED_SECTION.match(stripped)
        if plain_match and int(plain_match.group(1)) in sections_without_heading:
            # Split "N. TITLE. body..." into a heading plus its body text.
            title_end = plain_match.end()
            # BUGFIX: the match can end with ". " (period + whitespace from
            # the optional (?:\.\s*)? tail); a bare rstrip(".") left that
            # intact and produced headings like "## 3. OCCUPANCY. ".
            # Strip whitespace first, then trailing periods.
            title = stripped[:title_end].rstrip().rstrip(".")
            body = stripped[title_end:].strip()
            # NOTE(review): when the title is NOT period-terminated, the
            # regex can absorb the body's first capital letter into the
            # title (e.g. "3. NOTICE Tenant…") — confirm upstream output
            # always period-terminates section titles.
            result.append(f"## {title}")
            if body:
                result.append(body)
            continue
        result.append(line)

    return "\n".join(result)
217
+
218
+
219
def _clean_table_artifacts(content: str) -> str:
    """Clean table formatting artifacts.

    - Removes table rows where ALL cells are empty (e.g. "| | | |")
    - Strips trailing runs of 2+ empty cells from table data rows
    - Leaves separator rows ("|---|---|") untouched

    Data rows are emitted stripped of surrounding whitespace; all other
    lines are passed through verbatim.
    """
    cleaned: list[str] = []

    # Plain iteration — the previous enumerate() index was never used.
    for line in content.split("\n"):
        stripped = line.strip()

        # Skip completely empty table rows (| | | | |)
        if _EMPTY_TABLE_ROW.match(stripped):
            continue

        # Clean trailing empty cells from table data rows
        if stripped.startswith("|") and "|" in stripped[1:]:
            # Don't touch separator rows — their cells are meaningful dashes.
            if not _TABLE_SEP_ROW.match(stripped):
                cleaned.append(_TRAILING_EMPTY_CELLS.sub(" |", stripped))
                continue

        cleaned.append(line)

    return "\n".join(cleaned)
248
+
249
+
250
+ def _is_table_line(line: str) -> bool:
251
+ """Check if a line is a markdown table row or separator."""
252
+ s = line.strip()
253
+ return bool(s.startswith("|") and s.endswith("|") and s.count("|") >= 3)
254
+
255
+
256
+ def _count_columns(line: str) -> int:
257
+ """Count the number of columns in a table row."""
258
+ s = line.strip()
259
+ if not s.startswith("|"):
260
+ return 0
261
+ # Split by | and count non-boundary segments
262
+ parts = s.split("|")
263
+ # First and last are empty strings from leading/trailing |
264
+ return max(0, len(parts) - 2)
265
+
266
+
267
def _merge_split_tables(content: str) -> str:
    """Merge table continuations that were split across pages.

    Detects when a run of blank lines separates what should be a single
    table, and splices the continuation's data rows onto the first table.
    If the continuation re-states its header (a header row followed by a
    "|---|" separator within the next two lines), the duplicate header and
    separator are dropped; otherwise only the blank gap is removed.
    Tables are only merged when their column counts are within 30% of
    each other.  Non-blank text between two tables prevents merging.
    """
    lines = content.split("\n")
    result: list[str] = []
    i = 0

    while i < len(lines):
        result.append(lines[i])
        i += 1

        # Only look for a continuation immediately after a table row.
        if not _is_table_line(result[-1]):
            continue

        last_table_cols = _count_columns(result[-1])
        if last_table_cols < 2:
            continue

        # Scan ahead over blank lines only; any non-blank line ends the gap.
        # (gap_lines is collected but never re-emitted — dropping the gap is
        # what physically merges the tables.)
        j = i
        gap_lines: list[str] = []
        while j < len(lines):
            s = lines[j].strip()
            if s == "":
                gap_lines.append(lines[j])
                j += 1
                continue
            break

        if j >= len(lines):
            continue

        # The next non-blank line must itself be a table row to merge.
        if not _is_table_line(lines[j]):
            continue

        next_table_cols = _count_columns(lines[j])

        # (last_table_cols < 2 was already ruled out above; kept for safety.)
        if last_table_cols < 2 or next_table_cols < 2:
            continue
        ratio = min(last_table_cols, next_table_cols) / max(last_table_cols, next_table_cols)
        if ratio < 0.7:
            continue

        # A separator row within the next 1-2 lines means the continuation
        # re-extracted its header on the new page.
        has_new_header = False
        if _is_table_line(lines[j]):
            # Look for a separator row in the next 1-2 lines
            for k in range(j + 1, min(j + 3, len(lines))):
                if _TABLE_SEP_ROW.match(lines[k].strip()):
                    has_new_header = True
                    break

        if has_new_header:
            # Skip the gap, the duplicate header row(s), and the separator;
            # resume at the first data row of the continuation.
            skip_to = j
            while skip_to < len(lines):
                if _TABLE_SEP_ROW.match(lines[skip_to].strip()):
                    skip_to += 1  # Skip past separator
                    break
                skip_to += 1
            i = skip_to
        else:
            # No header — just skip the blank gap; the continuation rows are
            # appended by the main loop on the next iterations.
            i = j

    return "\n".join(result)
rendering.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF-to-page-images rendering and image preprocessing (CLAHE)."""
2
+
3
+ import os
4
+ import tempfile
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import cv2
11
+ from pdf2image import convert_from_path
12
+
13
+ from config import RENDER_DPI, logger
14
+
15
+
16
def _preprocess_image_for_ocr(image_path: str) -> str:
    """Enhance image quality for better OCR accuracy (modifies the file in place).

    Applies CLAHE contrast enhancement only (fast).
    Denoising was removed in v3.2.1 — it added ~10s/page with minimal
    benefit for VLM-based OCR which handles noise well.

    Args:
        image_path: Path to an image readable by OpenCV; the file is
            overwritten with the enhanced version.

    Returns:
        The same image_path.  If OpenCV cannot decode the file, it is
        left untouched and the path is returned unchanged.
    """
    img = cv2.imread(image_path)
    if img is None:
        # Unreadable/corrupt image — skip enhancement, keep the original.
        return image_path

    # CLAHE contrast enhancement on the L (lightness) channel only, so
    # local contrast is boosted without shifting colors.
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l = clahe.apply(l)
    lab = cv2.merge([l, a, b])
    img = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

    # Overwrite the source file with the enhanced image.
    cv2.imwrite(image_path, img)
    return image_path
37
+
38
+
39
def _render_single_page(
    input_path: Path, page_idx: int, dpi: int
) -> tuple[int, Optional[bytes]]:
    """Render a single PDF page to PNG bytes with CLAHE preprocessing.

    Args:
        input_path: Path to the source PDF.
        page_idx: 0-based page index (pdf2image's page numbers are 1-based).
        dpi: Render resolution passed to poppler.

    Returns:
        (page_idx, png_bytes) on success, or (page_idx, None) on failure
        (failures are logged, never raised).
    """
    try:
        # pdf2image pages are 1-based and the range is inclusive.
        images = convert_from_path(
            str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
        )
        if not images:
            return page_idx, None

        img = images[0]
        # Reserve a temp path, then do save + preprocess + read-back inside
        # one try/finally.  BUGFIX: previously img.save() ran before the
        # try block, so a failed save leaked the temp file.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            img.save(tmp_path, format="PNG")
            _preprocess_image_for_ocr(tmp_path)  # in-place CLAHE enhancement
            with open(tmp_path, "rb") as f:
                return page_idx, f.read()
        finally:
            os.unlink(tmp_path)
    except Exception as e:
        logger.warning(f"Failed to render page {page_idx + 1}: {e}")
        return page_idx, None
67
+
68
+
69
def _pdf_to_page_images(
    input_path: Path,
    request_id: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
) -> list[tuple[int, bytes]]:
    """Convert PDF pages to PNG image bytes using parallel rendering.

    Uses ThreadPoolExecutor for concurrent page rendering.

    Args:
        input_path: Path to the PDF file.
        request_id: Correlation id used only in log messages.
        start_page: 0-based index of the first page to render.
        end_page: 0-based index of the last page to render (inclusive);
            None renders through the final page.

    Returns:
        List of (page_no, png_bytes) tuples, sorted by page number.
        Returns [] if PDF metadata cannot be read; individual pages that
        fail to render are omitted (logged by _render_single_page).
    """
    try:
        from pdf2image.pdf2image import pdfinfo_from_path

        info = pdfinfo_from_path(str(input_path))
        total_pages = info["Pages"]
        # end_page is inclusive while range() is exclusive — hence +1,
        # clamped to the document's actual page count.
        last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
    except Exception as e:
        logger.warning(f"[{request_id}] Could not get PDF info: {e}")
        return []

    page_indices = list(range(start_page, last_page))

    start_time = time.time()
    page_images: list[tuple[int, bytes]] = []

    # Render pages in parallel (4 threads — I/O bound, not CPU bound for poppler)
    # NOTE(review): poppler rasterization runs in separate subprocesses, so
    # threads mostly wait on them; confirm max_workers=4 against core count.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(_render_single_page, input_path, idx, RENDER_DPI): idx
            for idx in page_indices
        }
        for future in as_completed(futures):
            page_idx, png_bytes = future.result()
            if png_bytes is not None:
                page_images.append((page_idx, png_bytes))

    # as_completed yields in finish order — restore document order.
    page_images.sort(key=lambda x: x[0])
    render_time = time.time() - start_time
    logger.info(
        f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
        f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
    )
    return page_images
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- # Docling VLM Parser API Dependencies
2
- # Optimized for HuggingFace Spaces with vLLM + Qwen3-VL-30B-A3B
3
 
4
- # Docling - IBM's document parsing library (VLM pipeline support)
5
- docling>=2.15.0
6
 
7
  # Web framework
8
  fastapi>=0.115.0
@@ -11,23 +11,17 @@ uvicorn[standard]>=0.32.0
11
  # File upload handling
12
  python-multipart>=0.0.9
13
 
14
- # HTTP client for URL parsing and vLLM health checks
15
  httpx>=0.27.0
16
 
17
- # Type checking
18
  pydantic>=2.0.0
19
 
20
- # Image preprocessing for degraded documents
21
  opencv-python-headless>=4.10.0
22
 
23
- # ONNX Runtime for Docling's RapidOCR text detection
24
- onnxruntime>=1.19.0
25
-
26
- # PDF to image conversion for VLM OCR pass
27
  pdf2image>=1.17.0
28
 
29
- # PDF page extraction (for creating mini-PDFs with only table pages)
30
- pypdf>=4.0.0
31
-
32
- # HuggingFace Hub for model downloads
33
  huggingface-hub>=0.25.0
 
1
+ # PaddleOCR-VL-1.5 + Gemini Hybrid Parser API Dependencies
2
+ # PaddlePaddle GPU is installed separately in the Dockerfile (requires special index URL)
3
 
4
+ # PaddleOCR with document parsing support (PaddleOCR-VL-1.5)
5
+ paddleocr[doc-parser]
6
 
7
  # Web framework
8
  fastapi>=0.115.0
 
11
  # File upload handling
12
  python-multipart>=0.0.9
13
 
14
+ # HTTP client for Gemini API calls and URL fetching
15
  httpx>=0.27.0
16
 
17
+ # Request/response models
18
  pydantic>=2.0.0
19
 
20
+ # Image preprocessing (CLAHE contrast enhancement)
21
  opencv-python-headless>=4.10.0
22
 
23
+ # PDF page rendering for Gemini page images
 
 
 
24
  pdf2image>=1.17.0
25
 
26
+ # Model utilities
 
 
 
27
  huggingface-hub>=0.25.0
start.sh CHANGED
@@ -1,84 +1,7 @@
1
- #!/usr/bin/env bash
2
- set -e
 
 
3
 
4
- # Force all output to be visible
5
- exec 2>&1
6
-
7
- echo "[startup] ====== Docling VLM Parser starting ======"
8
- echo "[startup] Date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
9
- echo "[startup] GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>&1 || echo 'NO GPU')"
10
- echo "[startup] HF cache: $(du -sh /home/user/.cache/huggingface 2>/dev/null || echo 'empty')"
11
-
12
- # ── Configuration ────────────────────────────────────────────────────────────
13
- VLLM_MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
14
- VLLM_HOST="127.0.0.1"
15
- VLLM_PORT="8000"
16
- HEALTH_URL="http://${VLLM_HOST}:${VLLM_PORT}/health"
17
- POLL_INTERVAL=5
18
- MAX_WAIT=600
19
-
20
- # ── Start vLLM server in background ─────────────────────────────────────────
21
- echo "[startup] Starting vLLM server with model: ${VLLM_MODEL}"
22
-
23
- python3 -m vllm.entrypoints.openai.api_server \
24
- --model "${VLLM_MODEL}" \
25
- --host "${VLLM_HOST}" \
26
- --port "${VLLM_PORT}" \
27
- --max-num-seqs 16 \
28
- --max-model-len 65536 \
29
- --gpu-memory-utilization 0.85 \
30
- --dtype auto \
31
- --trust-remote-code \
32
- --limit-mm-per-prompt '{"image": 1}' \
33
- 2>&1 &
34
-
35
- VLLM_PID=$!
36
- echo "[startup] vLLM server started with PID ${VLLM_PID}"
37
-
38
- # ── Poll vLLM health endpoint until ready ────────────────────────────────────
39
- echo "[startup] Waiting for vLLM to become healthy (polling every ${POLL_INTERVAL}s, timeout ${MAX_WAIT}s)..."
40
-
41
- elapsed=0
42
- while [ "${elapsed}" -lt "${MAX_WAIT}" ]; do
43
- # Check if vLLM process is still alive
44
- if ! kill -0 "${VLLM_PID}" 2>/dev/null; then
45
- echo "[startup] ERROR: vLLM process (PID ${VLLM_PID}) died during startup"
46
- exit 1
47
- fi
48
-
49
- if curl -sf "${HEALTH_URL}" > /dev/null 2>&1; then
50
- echo "[startup] vLLM is healthy after ${elapsed}s"
51
- break
52
- fi
53
-
54
- # Heartbeat every 30s
55
- if [ $((elapsed % 30)) -eq 0 ] && [ "${elapsed}" -gt 0 ]; then
56
- echo "[startup] Still waiting for vLLM... ${elapsed}s elapsed"
57
- fi
58
-
59
- sleep "${POLL_INTERVAL}"
60
- elapsed=$((elapsed + POLL_INTERVAL))
61
- done
62
-
63
- if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then
64
- echo "[startup] ERROR: vLLM did not become healthy within ${MAX_WAIT}s"
65
- echo "[startup] Killing vLLM process (PID ${VLLM_PID})"
66
- kill "${VLLM_PID}" 2>/dev/null || true
67
- exit 1
68
- fi
69
-
70
- # ── Start FastAPI with vLLM cleanup on exit ──────────────────────────────────
71
- _cleanup() {
72
- echo "[startup] Shutting down vLLM (PID ${VLLM_PID})"
73
- kill "${VLLM_PID}" 2>/dev/null
74
- wait "${VLLM_PID}" 2>/dev/null
75
- }
76
- trap _cleanup EXIT TERM INT
77
-
78
- echo "[startup] Starting FastAPI server on 0.0.0.0:7860"
79
-
80
- python3 -m uvicorn app:app \
81
- --host 0.0.0.0 \
82
- --port 7860 \
83
- --workers 1 \
84
- --timeout-keep-alive 300
 
1
#!/bin/bash
# Start the PaddleOCR-VL + Gemini hybrid parser API.
# Single process: FastAPI with PaddleOCR-VL-1.5 loaded in-process.
# Note: Dockerfile should ensure this script is executable (chmod +x).
set -euo pipefail

# exec replaces the shell so uvicorn receives container stop signals
# (SIGTERM) directly instead of them stopping at a wrapper shell.
# --timeout-keep-alive 300: document parses are long-running; keep idle
# keep-alive connections open longer than uvicorn's 5s default (restores
# the pre-v5 setting that was dropped in the rewrite).
exec uvicorn app:app \
  --host 0.0.0.0 \
  --port 7860 \
  --workers 1 \
  --timeout-keep-alive 300