Spaces:
Running on T4
Running on T4
Commit Β·
16b2195
1
Parent(s): ba23da1
feat: v5.0.0 PaddleOCR-VL-1.5 + Gemini hybrid architecture
Browse files- Replace Qwen3-VL + Docling with PaddleOCR-VL-1.5 (0.9B params, #1 OmniDocBench 94.5%)
- Keep Gemini 3 Flash for table page enhancement only
- Split monolithic app.py into 8 focused modules
- Switch from A100 to T4 GPU (84% cost reduction)
- Native cross-page table merging via PP-DocLayoutV2
- Enhanced post-processing: footer/artifact removal, table cleanup
- Dockerfile +41 -38
- app.py +51 -1556
- auth.py +89 -0
- config.py +37 -0
- gemini.py +132 -0
- models.py +40 -0
- pipeline.py +210 -0
- postprocess.py +341 -0
- rendering.py +112 -0
- requirements.txt +9 -15
- start.sh +6 -83
Dockerfile
CHANGED
|
@@ -1,9 +1,13 @@
|
|
| 1 |
-
# Hugging Face Spaces Dockerfile for
|
| 2 |
-
# GPU-accelerated document parsing with
|
| 3 |
-
# Build:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
#
|
| 6 |
-
FROM
|
| 7 |
|
| 8 |
USER root
|
| 9 |
|
|
@@ -12,18 +16,26 @@ RUN echo "========== BUILD STARTED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') ======
|
|
| 12 |
# Install system dependencies
|
| 13 |
RUN echo "========== STEP 1: Installing system dependencies ==========" && \
|
| 14 |
apt-get update && apt-get install -y --no-install-recommends \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
# Fonts for document rendering
|
| 16 |
fonts-noto-core \
|
| 17 |
fonts-noto-cjk \
|
| 18 |
fontconfig \
|
| 19 |
-
# Image processing
|
| 20 |
libgl1 \
|
| 21 |
libglib2.0-0 \
|
| 22 |
-
# PDF utilities
|
| 23 |
poppler-utils \
|
| 24 |
# Health checks
|
| 25 |
curl \
|
| 26 |
&& fc-cache -fv && \
|
|
|
|
|
|
|
|
|
|
| 27 |
rm -rf /var/lib/apt/lists/* && \
|
| 28 |
echo "========== System dependencies installed =========="
|
| 29 |
|
|
@@ -33,24 +45,17 @@ RUN useradd -m -u 1000 user
|
|
| 33 |
# Set environment variables
|
| 34 |
ENV PYTHONUNBUFFERED=1 \
|
| 35 |
PYTHONDONTWRITEBYTECODE=1 \
|
| 36 |
-
VLM_MODEL=Qwen/Qwen3-VL-30B-A3B-Instruct \
|
| 37 |
-
VLM_HOST=127.0.0.1 \
|
| 38 |
-
VLM_PORT=8000 \
|
| 39 |
-
VLM_GPU_MEMORY_UTILIZATION=0.85 \
|
| 40 |
-
VLM_MAX_MODEL_LEN=65536 \
|
| 41 |
IMAGES_SCALE=2.0 \
|
| 42 |
MAX_FILE_SIZE_MB=1024 \
|
| 43 |
HF_HOME=/home/user/.cache/huggingface \
|
| 44 |
-
TORCH_HOME=/home/user/.cache/torch \
|
| 45 |
XDG_CACHE_HOME=/home/user/.cache \
|
| 46 |
HOME=/home/user \
|
| 47 |
-
PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
|
| 48 |
-
LD_LIBRARY_PATH=/home/user/.local/lib/python3.12/site-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
|
| 49 |
|
| 50 |
# Create cache directories with correct ownership
|
| 51 |
RUN echo "========== STEP 2: Creating cache directories ==========" && \
|
| 52 |
mkdir -p /home/user/.cache/huggingface \
|
| 53 |
-
/home/user/.cache/
|
| 54 |
/home/user/app && \
|
| 55 |
chown -R user:user /home/user && \
|
| 56 |
echo "========== Cache directories created =========="
|
|
@@ -62,30 +67,29 @@ WORKDIR /home/user/app
|
|
| 62 |
# Copy requirements first for better caching
|
| 63 |
COPY --chown=user:user requirements.txt .
|
| 64 |
|
| 65 |
-
# Install
|
| 66 |
-
RUN echo "========== STEP 3: Installing
|
| 67 |
-
pip install --user --upgrade pip && \
|
| 68 |
-
pip install --user
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
echo "Installed packages:" && \
|
| 71 |
pip list --user && \
|
| 72 |
echo "========== Python dependencies installed =========="
|
| 73 |
|
| 74 |
-
# Pre-download
|
| 75 |
-
RUN echo "========== STEP
|
| 76 |
-
|
| 77 |
echo "Model cache summary:" && \
|
|
|
|
| 78 |
du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
|
| 79 |
-
echo "========== Qwen3-VL-30B-A3B model downloaded =========="
|
| 80 |
-
|
| 81 |
-
# Pre-download Docling models
|
| 82 |
-
RUN echo "========== STEP 5: Pre-downloading Docling models ==========" && \
|
| 83 |
-
python3 -c "from docling.document_converter import DocumentConverter; print('Downloading Docling models...'); converter = DocumentConverter(); print('Done')" && \
|
| 84 |
-
echo "Model cache summary:" && \
|
| 85 |
-
du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
|
| 86 |
-
du -sh /home/user/.cache/torch 2>/dev/null || echo " Torch cache: (empty)" && \
|
| 87 |
du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
|
| 88 |
-
echo "==========
|
| 89 |
|
| 90 |
# Copy application code
|
| 91 |
COPY --chown=user:user . .
|
|
@@ -95,13 +99,12 @@ RUN echo "========== STEP 6: Finalizing build ==========" && \
|
|
| 95 |
echo "Files in app directory:" && ls -la /home/user/app/ && \
|
| 96 |
echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="
|
| 97 |
|
| 98 |
-
# Expose the port
|
| 99 |
EXPOSE 7860
|
| 100 |
|
| 101 |
-
# Health check
|
| 102 |
-
HEALTHCHECK --interval=30s --timeout=30s --start-period=
|
| 103 |
CMD curl -f http://localhost:7860/ || exit 1
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
ENTRYPOINT []
|
| 107 |
CMD ["/bin/bash", "/home/user/app/start.sh"]
|
|
|
|
| 1 |
+
# Hugging Face Spaces Dockerfile for PaddleOCR-VL Document Parser API
|
| 2 |
+
# GPU-accelerated document parsing with PaddleOCR-VL-1.5 + PaddlePaddle
|
| 3 |
+
# Build: v5.0.0 - PaddleOCR-VL for high-quality OCR on Nvidia T4
|
| 4 |
+
#
|
| 5 |
+
# NOTE: Run with --shm-size 16g for PaddlePaddle shared memory:
|
| 6 |
+
# docker build -t hf-docling .
|
| 7 |
+
# docker run --gpus all --shm-size 16g -p 7860:7860 -e API_TOKEN=test hf-docling
|
| 8 |
|
| 9 |
+
# CUDA 12.6 runtime with cuDNN (required by PaddlePaddle GPU)
|
| 10 |
+
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
|
| 11 |
|
| 12 |
USER root
|
| 13 |
|
|
|
|
| 16 |
# Install system dependencies
|
| 17 |
RUN echo "========== STEP 1: Installing system dependencies ==========" && \
|
| 18 |
apt-get update && apt-get install -y --no-install-recommends \
|
| 19 |
+
# Python 3.11
|
| 20 |
+
python3.11 \
|
| 21 |
+
python3.11-venv \
|
| 22 |
+
python3.11-dev \
|
| 23 |
+
python3-pip \
|
| 24 |
# Fonts for document rendering
|
| 25 |
fonts-noto-core \
|
| 26 |
fonts-noto-cjk \
|
| 27 |
fontconfig \
|
| 28 |
+
# Image processing (required by OpenCV)
|
| 29 |
libgl1 \
|
| 30 |
libglib2.0-0 \
|
| 31 |
+
# PDF utilities (required by pdf2image)
|
| 32 |
poppler-utils \
|
| 33 |
# Health checks
|
| 34 |
curl \
|
| 35 |
&& fc-cache -fv && \
|
| 36 |
+
# Set python3.11 as default python3/python
|
| 37 |
+
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
|
| 38 |
+
update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
|
| 39 |
rm -rf /var/lib/apt/lists/* && \
|
| 40 |
echo "========== System dependencies installed =========="
|
| 41 |
|
|
|
|
| 45 |
# Set environment variables
|
| 46 |
ENV PYTHONUNBUFFERED=1 \
|
| 47 |
PYTHONDONTWRITEBYTECODE=1 \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
IMAGES_SCALE=2.0 \
|
| 49 |
MAX_FILE_SIZE_MB=1024 \
|
| 50 |
HF_HOME=/home/user/.cache/huggingface \
|
|
|
|
| 51 |
XDG_CACHE_HOME=/home/user/.cache \
|
| 52 |
HOME=/home/user \
|
| 53 |
+
PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
|
|
|
|
| 54 |
|
| 55 |
# Create cache directories with correct ownership
|
| 56 |
RUN echo "========== STEP 2: Creating cache directories ==========" && \
|
| 57 |
mkdir -p /home/user/.cache/huggingface \
|
| 58 |
+
/home/user/.cache/paddleocr \
|
| 59 |
/home/user/app && \
|
| 60 |
chown -R user:user /home/user && \
|
| 61 |
echo "========== Cache directories created =========="
|
|
|
|
| 67 |
# Copy requirements first for better caching
|
| 68 |
COPY --chown=user:user requirements.txt .
|
| 69 |
|
| 70 |
+
# Install PaddlePaddle GPU (must be installed before paddleocr)
|
| 71 |
+
RUN echo "========== STEP 3: Installing PaddlePaddle GPU ==========" && \
|
| 72 |
+
python -m pip install --user --upgrade pip && \
|
| 73 |
+
python -m pip install --user paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ && \
|
| 74 |
+
echo "PaddlePaddle version:" && \
|
| 75 |
+
python -c "import paddle; print(paddle.__version__); print('CUDA:', paddle.is_compiled_with_cuda())" && \
|
| 76 |
+
echo "========== PaddlePaddle GPU installed =========="
|
| 77 |
+
|
| 78 |
+
# Install Python dependencies from requirements.txt
|
| 79 |
+
RUN echo "========== STEP 4: Installing Python dependencies ==========" && \
|
| 80 |
+
python -m pip install --user -r requirements.txt && \
|
| 81 |
echo "Installed packages:" && \
|
| 82 |
pip list --user && \
|
| 83 |
echo "========== Python dependencies installed =========="
|
| 84 |
|
| 85 |
+
# Pre-download PaddleOCR-VL-1.5 model at build time (avoids download on first request)
|
| 86 |
+
RUN echo "========== STEP 5: Pre-downloading PaddleOCR-VL-1.5 model ==========" && \
|
| 87 |
+
python -c "from paddleocr import PaddleOCRVL; PaddleOCRVL()" && \
|
| 88 |
echo "Model cache summary:" && \
|
| 89 |
+
du -sh /home/user/.cache/paddleocr 2>/dev/null || echo " PaddleOCR cache: (empty)" && \
|
| 90 |
du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
|
| 92 |
+
echo "========== PaddleOCR-VL-1.5 model downloaded =========="
|
| 93 |
|
| 94 |
# Copy application code
|
| 95 |
COPY --chown=user:user . .
|
|
|
|
| 99 |
echo "Files in app directory:" && ls -la /home/user/app/ && \
|
| 100 |
echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="
|
| 101 |
|
| 102 |
+
# Expose the port (HF Spaces standard)
|
| 103 |
EXPOSE 7860
|
| 104 |
|
| 105 |
+
# Health check
|
| 106 |
+
HEALTHCHECK --interval=30s --timeout=30s --start-period=120s --retries=5 \
|
| 107 |
CMD curl -f http://localhost:7860/ || exit 1
|
| 108 |
|
| 109 |
+
# Single-process FastAPI app (no vLLM sidecar needed)
|
|
|
|
| 110 |
CMD ["/bin/bash", "/home/user/app/start.sh"]
|
app.py
CHANGED
|
@@ -1,1512 +1,54 @@
|
|
| 1 |
"""
|
| 2 |
-
Docling VLM Parser API
|
| 3 |
-
|
| 4 |
-
A FastAPI service using a
|
| 5 |
-
Pass 1 (GPU):
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
-
|
| 13 |
-
-
|
| 14 |
-
-
|
| 15 |
-
- Quality: DPI 200 for clear page images sent to Gemini
|
| 16 |
-
- Quality: Post-processing removes cross-page artifacts, deduplicates, cleans tables
|
| 17 |
"""
|
| 18 |
|
| 19 |
import asyncio
|
| 20 |
-
import base64
|
| 21 |
-
import io
|
| 22 |
-
import ipaddress
|
| 23 |
-
import logging
|
| 24 |
-
import os
|
| 25 |
import re
|
| 26 |
-
import secrets
|
| 27 |
import shutil
|
| 28 |
-
import socket
|
| 29 |
import tempfile
|
| 30 |
import time
|
| 31 |
-
import zipfile
|
| 32 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 33 |
from contextlib import asynccontextmanager
|
| 34 |
from pathlib import Path
|
| 35 |
-
from typing import
|
| 36 |
-
from urllib.parse import urlparse
|
| 37 |
from uuid import uuid4
|
| 38 |
|
| 39 |
-
import cv2
|
| 40 |
import httpx
|
| 41 |
-
import torch
|
| 42 |
from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
|
| 43 |
-
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
| 44 |
-
from pdf2image import convert_from_path
|
| 45 |
-
from pydantic import BaseModel
|
| 46 |
-
|
| 47 |
-
# Docling imports
|
| 48 |
-
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
| 49 |
-
from docling.datamodel.base_models import InputFormat
|
| 50 |
-
from docling.datamodel.document import PictureItem, TableItem
|
| 51 |
-
from docling.datamodel.pipeline_options import (
|
| 52 |
-
AcceleratorOptions,
|
| 53 |
-
PdfPipelineOptions,
|
| 54 |
-
RapidOcrOptions,
|
| 55 |
-
TableFormerMode,
|
| 56 |
-
)
|
| 57 |
-
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 58 |
-
|
| 59 |
-
# Configure logging
|
| 60 |
-
logging.basicConfig(
|
| 61 |
-
level=logging.INFO,
|
| 62 |
-
format="%(asctime)s | %(levelname)-8s | %(message)s",
|
| 63 |
-
datefmt="%Y-%m-%d %H:%M:%S",
|
| 64 |
-
)
|
| 65 |
-
logger = logging.getLogger("docling-parser")
|
| 66 |
-
|
| 67 |
-
# Security
|
| 68 |
-
API_TOKEN = os.getenv("API_TOKEN")
|
| 69 |
-
security = HTTPBearer()
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
|
| 73 |
-
"""Verify the API token from Authorization header."""
|
| 74 |
-
if not API_TOKEN:
|
| 75 |
-
raise HTTPException(
|
| 76 |
-
status_code=500,
|
| 77 |
-
detail="No API token configured on server",
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
token = credentials.credentials
|
| 81 |
-
if not secrets.compare_digest(token, API_TOKEN):
|
| 82 |
-
raise HTTPException(
|
| 83 |
-
status_code=401,
|
| 84 |
-
detail="Invalid API token",
|
| 85 |
-
)
|
| 86 |
-
return token
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
# VLM Configuration
|
| 90 |
-
VLM_MODEL = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-30B-A3B-Instruct")
|
| 91 |
-
VLM_HOST = os.getenv("VLM_HOST", "127.0.0.1")
|
| 92 |
-
VLM_PORT = os.getenv("VLM_PORT", "8000")
|
| 93 |
-
IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
|
| 94 |
-
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
|
| 95 |
-
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
|
| 96 |
-
VLM_TIMEOUT = float(os.getenv("VLM_TIMEOUT", "300"))
|
| 97 |
-
VLM_CONCURRENCY = int(os.getenv("VLM_CONCURRENCY", "4"))
|
| 98 |
-
RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
|
| 99 |
-
|
| 100 |
-
# Gemini API Configuration (for table page extraction)
|
| 101 |
-
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
|
| 102 |
-
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
|
| 103 |
-
GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
|
| 104 |
-
GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "4"))
|
| 105 |
-
|
| 106 |
-
# Blocked hostnames for SSRF protection
|
| 107 |
-
BLOCKED_HOSTNAMES = {
|
| 108 |
-
"localhost",
|
| 109 |
-
"metadata",
|
| 110 |
-
"metadata.google.internal",
|
| 111 |
-
"metadata.google",
|
| 112 |
-
"169.254.169.254",
|
| 113 |
-
"fd00:ec2::254",
|
| 114 |
-
}
|
| 115 |
-
|
| 116 |
-
# Global converter instance (initialized on startup)
|
| 117 |
-
_converter: Optional[DocumentConverter] = None
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
def _get_device() -> str:
|
| 121 |
-
"""Get the best available device for processing."""
|
| 122 |
-
if torch.cuda.is_available():
|
| 123 |
-
return "cuda"
|
| 124 |
-
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
| 125 |
-
return "mps"
|
| 126 |
-
return "cpu"
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
def _validate_url(url: str) -> None:
|
| 130 |
-
"""Validate URL to prevent SSRF attacks."""
|
| 131 |
-
try:
|
| 132 |
-
parsed = urlparse(url)
|
| 133 |
-
except Exception as e:
|
| 134 |
-
raise HTTPException(
|
| 135 |
-
status_code=400,
|
| 136 |
-
detail=f"Invalid URL format: {str(e)}",
|
| 137 |
-
)
|
| 138 |
-
|
| 139 |
-
if parsed.scheme not in ("http", "https"):
|
| 140 |
-
raise HTTPException(
|
| 141 |
-
status_code=400,
|
| 142 |
-
detail=f"Invalid URL scheme '{parsed.scheme}'. Only http and https are allowed.",
|
| 143 |
-
)
|
| 144 |
-
|
| 145 |
-
hostname = parsed.hostname
|
| 146 |
-
if not hostname:
|
| 147 |
-
raise HTTPException(
|
| 148 |
-
status_code=400,
|
| 149 |
-
detail="Invalid URL: missing hostname.",
|
| 150 |
-
)
|
| 151 |
-
|
| 152 |
-
hostname_lower = hostname.lower()
|
| 153 |
-
if hostname_lower in BLOCKED_HOSTNAMES:
|
| 154 |
-
raise HTTPException(
|
| 155 |
-
status_code=400,
|
| 156 |
-
detail="Access to internal/metadata services is not allowed.",
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
blocked_patterns = ["metadata", "internal", "localhost", "127.0.0.1", "::1"]
|
| 160 |
-
for pattern in blocked_patterns:
|
| 161 |
-
if pattern in hostname_lower:
|
| 162 |
-
raise HTTPException(
|
| 163 |
-
status_code=400,
|
| 164 |
-
detail="Access to internal/metadata services is not allowed.",
|
| 165 |
-
)
|
| 166 |
-
|
| 167 |
-
try:
|
| 168 |
-
ip_str = socket.gethostbyname(hostname)
|
| 169 |
-
ip = ipaddress.ip_address(ip_str)
|
| 170 |
-
except socket.gaierror:
|
| 171 |
-
raise HTTPException(
|
| 172 |
-
status_code=400,
|
| 173 |
-
detail=f"Could not resolve hostname: {hostname}",
|
| 174 |
-
)
|
| 175 |
-
except ValueError as e:
|
| 176 |
-
raise HTTPException(
|
| 177 |
-
status_code=400,
|
| 178 |
-
detail=f"Invalid IP address resolved: {str(e)}",
|
| 179 |
-
)
|
| 180 |
-
|
| 181 |
-
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
|
| 182 |
-
raise HTTPException(
|
| 183 |
-
status_code=400,
|
| 184 |
-
detail="Access to private/internal IP addresses is not allowed.",
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
|
| 189 |
-
"""Sync helper to save uploaded file to disk."""
|
| 190 |
-
with open(input_path, "wb") as f:
|
| 191 |
-
shutil.copyfileobj(file_obj, f)
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
def _save_downloaded_content(input_path: Path, content: bytes) -> None:
|
| 195 |
-
"""Sync helper to save downloaded content to disk."""
|
| 196 |
-
with open(input_path, "wb") as f:
|
| 197 |
-
f.write(content)
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
# ---------------------------------------------------------------------------
|
| 201 |
-
# Pydantic Models
|
| 202 |
-
# ---------------------------------------------------------------------------
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
class ParseResponse(BaseModel):
|
| 206 |
-
"""Response model for document parsing."""
|
| 207 |
-
|
| 208 |
-
success: bool
|
| 209 |
-
markdown: Optional[str] = None
|
| 210 |
-
json_content: Optional[Union[dict, list]] = None
|
| 211 |
-
images_zip: Optional[str] = None
|
| 212 |
-
image_count: int = 0
|
| 213 |
-
error: Optional[str] = None
|
| 214 |
-
pages_processed: int = 0
|
| 215 |
-
device_used: Optional[str] = None
|
| 216 |
-
vlm_model: Optional[str] = None
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
class HealthResponse(BaseModel):
|
| 220 |
-
"""Health check response."""
|
| 221 |
-
|
| 222 |
-
status: str
|
| 223 |
-
version: str
|
| 224 |
-
device: str
|
| 225 |
-
gpu_name: Optional[str] = None
|
| 226 |
-
vlm_model: str = ""
|
| 227 |
-
vlm_status: str = "unknown"
|
| 228 |
-
images_scale: float = 2.0
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
class URLParseRequest(BaseModel):
|
| 232 |
-
"""Request model for URL-based parsing."""
|
| 233 |
-
|
| 234 |
-
url: str
|
| 235 |
-
output_format: str = "markdown"
|
| 236 |
-
images_scale: Optional[float] = None
|
| 237 |
-
start_page: int = 0
|
| 238 |
-
end_page: Optional[int] = None
|
| 239 |
-
include_images: bool = False
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
# ---------------------------------------------------------------------------
|
| 243 |
-
# OpenCV Image Preprocessing (CLAHE only β fast)
|
| 244 |
-
# ---------------------------------------------------------------------------
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
def _preprocess_image_for_ocr(image_path: str) -> str:
|
| 248 |
-
"""Enhance image quality for better OCR accuracy.
|
| 249 |
-
|
| 250 |
-
Applies CLAHE contrast enhancement only (fast).
|
| 251 |
-
Denoising was removed in v3.2.1 β it added ~10s/page with minimal
|
| 252 |
-
benefit for VLM-based OCR which handles noise well.
|
| 253 |
-
"""
|
| 254 |
-
img = cv2.imread(image_path)
|
| 255 |
-
if img is None:
|
| 256 |
-
return image_path
|
| 257 |
-
|
| 258 |
-
# CLAHE contrast enhancement on L channel
|
| 259 |
-
lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
|
| 260 |
-
l, a, b = cv2.split(lab)
|
| 261 |
-
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 262 |
-
l = clahe.apply(l)
|
| 263 |
-
lab = cv2.merge([l, a, b])
|
| 264 |
-
img = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
|
| 265 |
-
|
| 266 |
-
cv2.imwrite(image_path, img)
|
| 267 |
-
return image_path
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
# ---------------------------------------------------------------------------
|
| 271 |
-
# VLM OCR with retry
|
| 272 |
-
# ---------------------------------------------------------------------------
|
| 273 |
-
|
| 274 |
-
# Strip Qwen3 <think>...</think> reasoning blocks
|
| 275 |
-
_THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
| 276 |
-
|
| 277 |
-
# Post-processing patterns for VLM output cleanup
|
| 278 |
-
_CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
|
| 279 |
-
_CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
|
| 280 |
-
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
|
| 281 |
-
_PAGE_N_PATTERN = re.compile(r"^\s*Page\s+\d+\s*$\n?", re.MULTILINE)
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
def _clean_vlm_output(content: str) -> str:
|
| 285 |
-
"""Post-process VLM output to clean artifacts.
|
| 286 |
-
|
| 287 |
-
Removes: code fences, HTML comments, 'Page N' artifacts,
|
| 288 |
-
and converts any remaining LaTeX tables to markdown format.
|
| 289 |
-
"""
|
| 290 |
-
# Strip <think> blocks
|
| 291 |
-
content = _THINK_PATTERN.sub("", content).strip()
|
| 292 |
-
|
| 293 |
-
# Strip code fence wrappers
|
| 294 |
-
content = _CODE_FENCE_PATTERN.sub("", content)
|
| 295 |
-
content = _CODE_FENCE_END.sub("", content)
|
| 296 |
-
|
| 297 |
-
# Strip HTML comments (VLM sometimes adds coordinate annotations)
|
| 298 |
-
content = _HTML_COMMENT_PATTERN.sub("", content)
|
| 299 |
-
|
| 300 |
-
# Strip "Page N" artifacts
|
| 301 |
-
content = _PAGE_N_PATTERN.sub("", content)
|
| 302 |
-
|
| 303 |
-
# Fix escaped quotes (VLM sometimes escapes them unnecessarily)
|
| 304 |
-
content = content.replace('\\"', '"')
|
| 305 |
-
|
| 306 |
-
# Convert LaTeX tables to markdown if VLM ignores the prompt
|
| 307 |
-
content = _convert_latex_tables_to_markdown(content)
|
| 308 |
-
|
| 309 |
-
return content.strip()
|
| 310 |
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
# Split on \\
|
| 323 |
-
rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
|
| 324 |
-
if not rows:
|
| 325 |
-
return match.group(0)
|
| 326 |
-
|
| 327 |
-
md_rows = []
|
| 328 |
-
for i, row in enumerate(rows):
|
| 329 |
-
cells = [c.strip() for c in row.split("&")]
|
| 330 |
-
md_row = "| " + " | ".join(cells) + " |"
|
| 331 |
-
md_rows.append(md_row)
|
| 332 |
-
if i == 0:
|
| 333 |
-
# Add separator after header
|
| 334 |
-
sep = "| " + " | ".join(["---"] * len(cells)) + " |"
|
| 335 |
-
md_rows.append(sep)
|
| 336 |
-
|
| 337 |
-
return "\n".join(md_rows)
|
| 338 |
-
|
| 339 |
-
return latex_pattern.sub(_latex_to_md, text)
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
# ---------------------------------------------------------------------------
|
| 343 |
-
# Post-Processing: Cross-page artifact removal (applied AFTER page merge)
|
| 344 |
-
# ---------------------------------------------------------------------------
|
| 345 |
-
|
| 346 |
-
# Day-of-week date lines (e.g., "Thursday, October 31, 2024")
|
| 347 |
-
_STANDALONE_DATE = re.compile(
|
| 348 |
-
r"^\s*(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
|
| 349 |
-
r"(?:January|February|March|April|May|June|July|August|September|"
|
| 350 |
-
r"October|November|December)\s+\d{1,2},\s+\d{4}\s*$",
|
| 351 |
-
re.MULTILINE,
|
| 352 |
-
)
|
| 353 |
-
# Standalone time (e.g., "11:30 AM")
|
| 354 |
-
_STANDALONE_TIME = re.compile(r"^\s*\d{1,2}:\d{2}\s*(?:AM|PM)\s*$", re.MULTILINE)
|
| 355 |
-
# Page footer patterns: "N | address" or "N address N" (e.g., "2 | 8575 W Golf Rd, Niles, IL 60714 | 3")
|
| 356 |
-
_PAGE_FOOTER = re.compile(
|
| 357 |
-
r"^\s*\d{1,3}\s*\|?\s*\d{2,5}\s+\w.*(?:Rd|St|Ave|Blvd|Dr|Ln|Way|Ct)\b.*\d{5}.*$",
|
| 358 |
-
re.MULTILINE,
|
| 359 |
-
)
|
| 360 |
-
# Standalone page number lines (e.g., "12" alone on a line)
|
| 361 |
-
_STANDALONE_PAGE_NUM = re.compile(r"^\s*\d{1,3}\s*$", re.MULTILINE)
|
| 362 |
-
# Numbered section pattern: "N. TITLE" where N is 1-99 and TITLE is mostly uppercase
|
| 363 |
-
_NUMBERED_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][A-Z\s\-/&,]+(?:\.\s*)?)")
|
| 364 |
-
# Table row with ALL empty cells (e.g., "| | | | |")
|
| 365 |
-
_EMPTY_TABLE_ROW = re.compile(r"^\|(?:\s*\|)+\s*$", re.MULTILINE)
|
| 366 |
-
# Trailing empty cells in a table row (e.g., "| data | data | | | |")
|
| 367 |
-
_TRAILING_EMPTY_CELLS = re.compile(r"(?:\s*\|\s*){2,}\s*$")
|
| 368 |
-
# Table separator row (e.g., "|---|---|---|")
|
| 369 |
-
_TABLE_SEP_ROW = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$")
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
def _post_process_merged_markdown(content: str) -> str:
|
| 373 |
-
"""Post-process merged multi-page markdown to fix cross-page artifacts.
|
| 374 |
-
|
| 375 |
-
Applied after all pages are concatenated. Fixes:
|
| 376 |
-
- Duplicate document headings (VLM re-extracts page headers)
|
| 377 |
-
- Duplicate short metadata lines (subtitles, dates repeated per page)
|
| 378 |
-
- Page footer/header artifacts (standalone dates, times, page numbers)
|
| 379 |
-
- Numbered section heading normalization (consistent ## levels)
|
| 380 |
-
- Table artifacts (empty rows, trailing empty cells)
|
| 381 |
-
- Cross-page table continuations (merge split tables)
|
| 382 |
-
- Excessive whitespace
|
| 383 |
-
"""
|
| 384 |
-
content = _deduplicate_headings(content)
|
| 385 |
-
content = _deduplicate_short_blocks(content)
|
| 386 |
-
content = _remove_page_boundary_artifacts(content)
|
| 387 |
-
content = _normalize_numbered_headings(content)
|
| 388 |
-
content = _clean_table_artifacts(content)
|
| 389 |
-
content = _merge_split_tables(content)
|
| 390 |
-
# Normalize runs of 4+ newlines to 3
|
| 391 |
-
content = re.sub(r"\n{4,}", "\n\n\n", content)
|
| 392 |
-
return content.strip()
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
def _deduplicate_headings(content: str) -> str:
|
| 396 |
-
"""Remove duplicate heading lines, keeping only the first occurrence.
|
| 397 |
-
|
| 398 |
-
When VLM processes each page, it may re-extract page headers/document titles.
|
| 399 |
-
This removes exact duplicate headings while preserving table rows and body text.
|
| 400 |
-
"""
|
| 401 |
-
lines = content.split("\n")
|
| 402 |
-
seen_headings: set[str] = set()
|
| 403 |
-
result: list[str] = []
|
| 404 |
-
|
| 405 |
-
for line in lines:
|
| 406 |
-
stripped = line.strip()
|
| 407 |
-
if stripped.startswith("#"):
|
| 408 |
-
# Normalize heading for comparison (lowercase, strip trailing #)
|
| 409 |
-
key = stripped.lstrip("#").strip().lower()
|
| 410 |
-
if key and key in seen_headings:
|
| 411 |
-
continue # Skip duplicate heading
|
| 412 |
-
if key:
|
| 413 |
-
seen_headings.add(key)
|
| 414 |
-
result.append(line)
|
| 415 |
-
|
| 416 |
-
return "\n".join(result)
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
def _deduplicate_short_blocks(content: str) -> str:
|
| 420 |
-
"""Remove duplicate short text blocks that repeat across pages.
|
| 421 |
-
|
| 422 |
-
When VLM processes each page, it may re-extract document subtitles,
|
| 423 |
-
metadata lines, and other short repeating text. This removes exact
|
| 424 |
-
duplicates of short non-table blocks (< 120 chars).
|
| 425 |
-
"""
|
| 426 |
-
blocks = content.split("\n\n")
|
| 427 |
-
seen: set[str] = set()
|
| 428 |
-
result: list[str] = []
|
| 429 |
-
|
| 430 |
-
for block in blocks:
|
| 431 |
-
stripped = block.strip()
|
| 432 |
-
if not stripped:
|
| 433 |
-
result.append(block)
|
| 434 |
-
continue
|
| 435 |
-
|
| 436 |
-
# Only deduplicate short, non-table, non-heading blocks
|
| 437 |
-
is_table = stripped.startswith("|") and "|" in stripped[1:]
|
| 438 |
-
is_heading = stripped.startswith("#")
|
| 439 |
-
if is_table or is_heading or len(stripped) > 120:
|
| 440 |
-
result.append(block)
|
| 441 |
-
continue
|
| 442 |
-
|
| 443 |
-
key = stripped.lower()
|
| 444 |
-
if key in seen:
|
| 445 |
-
continue # Skip duplicate short block
|
| 446 |
-
|
| 447 |
-
seen.add(key)
|
| 448 |
-
result.append(block)
|
| 449 |
-
|
| 450 |
-
return "\n\n".join(result)
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
def _remove_page_boundary_artifacts(content: str) -> str:
|
| 454 |
-
"""Remove page footer/header artifacts like standalone dates, times, page numbers, and footers."""
|
| 455 |
-
content = _STANDALONE_DATE.sub("", content)
|
| 456 |
-
content = _STANDALONE_TIME.sub("", content)
|
| 457 |
-
content = _PAGE_FOOTER.sub("", content)
|
| 458 |
-
content = _STANDALONE_PAGE_NUM.sub("", content)
|
| 459 |
-
return content
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
def _normalize_numbered_headings(content: str) -> str:
|
| 463 |
-
"""Normalize numbered section headings to consistent ## level.
|
| 464 |
-
|
| 465 |
-
VLM inconsistently formats numbered sections like "3. OCCUPANCY" β
|
| 466 |
-
some get ## headings, some are plain text. This detects the pattern
|
| 467 |
-
and ensures all numbered sections at the same level use ## headings.
|
| 468 |
-
"""
|
| 469 |
-
lines = content.split("\n")
|
| 470 |
-
result: list[str] = []
|
| 471 |
-
|
| 472 |
-
# First pass: detect which numbered sections exist and their heading status
|
| 473 |
-
sections_with_heading: set[int] = set()
|
| 474 |
-
sections_without_heading: set[int] = set()
|
| 475 |
-
|
| 476 |
-
for line in lines:
|
| 477 |
-
stripped = line.strip()
|
| 478 |
-
# Already a heading like "## 3. OCCUPANCY"
|
| 479 |
-
heading_match = re.match(r"^#{1,3}\s+(\d{1,2})\.\s+[A-Z]", stripped)
|
| 480 |
-
if heading_match:
|
| 481 |
-
sections_with_heading.add(int(heading_match.group(1)))
|
| 482 |
-
continue
|
| 483 |
-
# Plain text like "3. OCCUPANCY. Tenant shall..."
|
| 484 |
-
plain_match = _NUMBERED_SECTION.match(stripped)
|
| 485 |
-
if plain_match:
|
| 486 |
-
sections_without_heading.add(int(plain_match.group(1)))
|
| 487 |
-
|
| 488 |
-
# If there's a mix of headed and non-headed numbered sections, normalize
|
| 489 |
-
if sections_with_heading and sections_without_heading:
|
| 490 |
-
for i, line in enumerate(lines):
|
| 491 |
-
stripped = line.strip()
|
| 492 |
-
# Check if this is a non-headed numbered section that should be a heading
|
| 493 |
-
plain_match = _NUMBERED_SECTION.match(stripped)
|
| 494 |
-
if plain_match:
|
| 495 |
-
section_num = int(plain_match.group(1))
|
| 496 |
-
if section_num in sections_without_heading:
|
| 497 |
-
# Check that it looks like a section start (followed by text)
|
| 498 |
-
# Split at the first sentence end to make the heading
|
| 499 |
-
# Extract just "N. TITLE." as heading, keep body text
|
| 500 |
-
title_end = plain_match.end()
|
| 501 |
-
title = stripped[:title_end].rstrip(".")
|
| 502 |
-
body = stripped[title_end:].strip()
|
| 503 |
-
if body:
|
| 504 |
-
result.append(f"## {title}")
|
| 505 |
-
result.append(body)
|
| 506 |
-
else:
|
| 507 |
-
result.append(f"## {title}")
|
| 508 |
-
continue
|
| 509 |
-
result.append(line)
|
| 510 |
-
else:
|
| 511 |
-
result = lines
|
| 512 |
-
|
| 513 |
-
return "\n".join(result)
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
def _clean_table_artifacts(content: str) -> str:
|
| 517 |
-
"""Clean table formatting artifacts.
|
| 518 |
-
|
| 519 |
-
- Removes table rows where ALL cells are empty
|
| 520 |
-
- Strips trailing empty cells from table rows
|
| 521 |
-
- Removes orphaned separator rows not preceded by a header
|
| 522 |
-
"""
|
| 523 |
-
lines = content.split("\n")
|
| 524 |
-
result: list[str] = []
|
| 525 |
-
|
| 526 |
-
for i, line in enumerate(lines):
|
| 527 |
-
stripped = line.strip()
|
| 528 |
-
|
| 529 |
-
# Skip completely empty table rows (| | | | |)
|
| 530 |
-
if _EMPTY_TABLE_ROW.match(stripped):
|
| 531 |
-
continue
|
| 532 |
-
|
| 533 |
-
# Clean trailing empty cells from table data rows
|
| 534 |
-
if stripped.startswith("|") and "|" in stripped[1:]:
|
| 535 |
-
# Don't touch separator rows
|
| 536 |
-
if not _TABLE_SEP_ROW.match(stripped):
|
| 537 |
-
# Remove trailing empty cells
|
| 538 |
-
cleaned = _TRAILING_EMPTY_CELLS.sub(" |", stripped)
|
| 539 |
-
result.append(cleaned)
|
| 540 |
-
continue
|
| 541 |
-
|
| 542 |
-
result.append(line)
|
| 543 |
-
|
| 544 |
-
return "\n".join(result)
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
def _is_table_line(line: str) -> bool:
|
| 548 |
-
"""Check if a line is a markdown table row or separator."""
|
| 549 |
-
s = line.strip()
|
| 550 |
-
return bool(s.startswith("|") and s.endswith("|") and s.count("|") >= 3)
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
def _count_columns(line: str) -> int:
|
| 554 |
-
"""Count the number of columns in a table row."""
|
| 555 |
-
s = line.strip()
|
| 556 |
-
if not s.startswith("|"):
|
| 557 |
-
return 0
|
| 558 |
-
# Split by | and count non-boundary segments
|
| 559 |
-
parts = s.split("|")
|
| 560 |
-
# First and last are empty strings from leading/trailing |
|
| 561 |
-
return max(0, len(parts) - 2)
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
def _merge_split_tables(content: str) -> str:
    """Merge table continuations that were split across pages.

    When a markdown table row is followed only by blank lines and then another
    table whose column count is similar (within 30%), the second table is
    treated as a page-break continuation of the first: the blank gap is
    dropped, and if the continuation re-states a header row + separator, that
    duplicate header is dropped too, keeping only the data rows.

    Args:
        content: Full markdown document text.

    Returns:
        The markdown with split tables merged.
    """
    lines = content.split("\n")
    result: list[str] = []
    i = 0

    while i < len(lines):
        result.append(lines[i])
        i += 1

        # Only consider merging right after a table row with >= 2 columns.
        if not _is_table_line(result[-1]):
            continue
        last_table_cols = _count_columns(result[-1])
        if last_table_cols < 2:
            continue

        # Look ahead past blank lines to the next non-empty line.
        # (Fix vs. original: the gap lines were collected into an unused
        # accumulator; they are simply skipped here.)
        j = i
        while j < len(lines) and lines[j].strip() == "":
            j += 1
        if j >= len(lines):
            continue

        # The next non-empty line must itself start a table...
        if not _is_table_line(lines[j]):
            continue
        next_table_cols = _count_columns(lines[j])
        if next_table_cols < 2:
            continue

        # ...with a similar column count (within 30%) to be a continuation.
        ratio = min(last_table_cols, next_table_cols) / max(last_table_cols, next_table_cols)
        if ratio < 0.7:
            continue

        # Detect a re-extracted header: a separator row in the next 1-2 lines
        # (indicating the VLM repeated the header on the new page).
        has_new_header = any(
            _TABLE_SEP_ROW.match(lines[k].strip())
            for k in range(j + 1, min(j + 3, len(lines)))
        )

        if has_new_header:
            # Skip the gap plus the duplicate header + separator; resume at
            # the continuation's first data row.
            skip_to = j
            while skip_to < len(lines):
                if _TABLE_SEP_ROW.match(lines[skip_to].strip()):
                    skip_to += 1  # Skip past the separator itself
                    break
                skip_to += 1
            i = skip_to
        else:
            # No duplicate header — just drop the blank gap.
            i = j

    return "\n".join(result)
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
    """Send a page image to Qwen3-VL via vLLM for text extraction.

    Includes retry logic: on timeout/failure, retries once with longer timeout.
    Strips <think> reasoning tokens from Qwen3 output.

    Args:
        page_image_bytes: Rendered page image (PNG bytes); base64-encoded into
            a data URL for the OpenAI-compatible chat API.
        request_id: Correlation ID, used only in log messages.
        page_no: Page number, used only in log messages.

    Returns:
        The page's markdown as cleaned by _clean_vlm_output().

    Raises:
        httpx.TimeoutException / httpx.ConnectError: if both attempts fail to
            reach the vLLM server.
        httpx.HTTPStatusError: if the second attempt returns a non-200 status.
        ValueError: if the response has no choices or no message content.
        RuntimeError: if both attempts complete without producing content.
    """
    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")

    payload = {
        "model": VLM_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{b64_image}"},
                    },
                    {
                        "type": "text",
                        "text": (
                            "Convert this document page to markdown format.\n\n"
                            "Rules:\n"
                            "- Extract ALL text content exactly as written\n"
                            "- Use ## headings for section titles\n"
                            "- Preserve lists, paragraphs, and document structure\n"
                            "- For tables:\n"
                            " * Read EVERY column header exactly as printed — do NOT skip, rename, or reorder columns\n"
                            " * Include ALL columns even if the table is very wide\n"
                            " * Format as markdown tables with | delimiters and --- separator rows\n"
                            " * Each data row must have the same number of cells as the header\n"
                            " * NEVER use LaTeX (no \\begin{tabular}, no \\hline, no &)\n"
                            "- NEVER wrap output in code fences (no ```)\n"
                            "- NEVER add HTML comments or coordinate annotations\n"
                            "- Do NOT include page headers, footers, page numbers, or timestamps that repeat on every page\n"
                            "- For handwritten text, transcribe as accurately as possible\n"
                            "- Output ONLY the extracted markdown content, nothing else"
                        ),
                    },
                ],
            }
        ],
        "max_tokens": 32768,
        "temperature": 0.1,
        # Disable Qwen3 thinking mode to avoid <think> tokens
        "chat_template_kwargs": {"enable_thinking": False},
    }

    url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"

    # Try with primary timeout, then retry once with extended timeout
    for attempt, timeout in enumerate([VLM_TIMEOUT, VLM_TIMEOUT * 1.5], start=1):
        try:
            response = httpx.post(url, json=payload, timeout=timeout)
            if response.status_code != 200:
                # Best-effort extraction of a useful error message from the body.
                try:
                    err = response.json()
                    msg = err.get("message", err.get("detail", str(err)[:300]))
                except Exception:
                    msg = response.text[:300]
                logger.error(f"[{request_id}] vLLM error ({response.status_code}) page {page_no}: {msg}")
                if attempt == 1:
                    logger.info(f"[{request_id}] Retrying page {page_no}...")
                    continue
                # Second failed attempt: surface the HTTP error to the caller.
                response.raise_for_status()

            result = response.json()
            choices = result.get("choices")
            if not choices:
                raise ValueError("vLLM returned no choices")
            content = choices[0].get("message", {}).get("content")
            if content is None:
                raise ValueError("vLLM response missing content")

            # Clean VLM output (strip think blocks, code fences, HTML comments, convert LaTeX tables)
            content = _clean_vlm_output(content)

            return content

        except (httpx.TimeoutException, httpx.ConnectError) as e:
            if attempt == 1:
                logger.warning(
                    f"[{request_id}] VLM attempt {attempt} failed on page {page_no}: {e}. Retrying..."
                )
                continue
            raise

    # Both attempts ended via `continue` without returning content.
    raise RuntimeError(f"VLM failed after 2 attempts on page {page_no}")
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
def _vlm_extract_tables(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> Optional[str]:
    """Send a page image to VLM with a table-focused prompt for better table extraction.

    Used as a second pass on pages where tables were detected in the first pass.
    Unlike _vlm_ocr_page, this makes a single attempt (no retry) and never
    raises: any HTTP error, missing content, or exception yields None.

    Args:
        page_image_bytes: Rendered page image (PNG bytes).
        request_id: Correlation ID, used only in log messages.
        page_no: Page number, used only in log messages.

    Returns:
        Extracted tables as cleaned markdown, or None on any failure or if
        the cleaned output is empty.
    """
    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")

    payload = {
        "model": VLM_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{b64_image}"},
                    },
                    {
                        "type": "text",
                        "text": (
                            "Extract ONLY the tables from this document page as markdown.\n\n"
                            "Rules:\n"
                            "- Read every column header EXACTLY as printed on the page\n"
                            "- Include ALL columns — do NOT skip any, even if the table is very wide\n"
                            "- Each data row must have the same number of | cells as the header row\n"
                            "- Use | delimiters and --- separator rows\n"
                            "- Preserve all numbers, text, and formatting exactly\n"
                            "- Add spaces between words — never concatenate (e.g., 'CAP Rate' not 'CAPRate')\n"
                            "- If multiple tables exist, separate them with a blank line\n"
                            "- Include a short heading (## or ###) before each table if one is visible\n"
                            "- NEVER use LaTeX table syntax\n"
                            "- Output ONLY the markdown tables, nothing else"
                        ),
                    },
                ],
            }
        ],
        "max_tokens": 32768,
        "temperature": 0.1,
        # Disable Qwen3 thinking mode to avoid <think> tokens
        "chat_template_kwargs": {"enable_thinking": False},
    }

    url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"

    try:
        response = httpx.post(url, json=payload, timeout=VLM_TIMEOUT)
        if response.status_code != 200:
            logger.warning(f"[{request_id}] Table re-prompt failed for page {page_no}: {response.status_code}")
            return None

        result = response.json()
        choices = result.get("choices")
        if not choices:
            return None
        content = choices[0].get("message", {}).get("content")
        if content is None:
            return None

        # Strip think blocks, code fences, HTML comments; convert LaTeX tables.
        content = _clean_vlm_output(content)
        return content if content.strip() else None

    except Exception as e:
        # Deliberately broad: this is a best-effort enhancement pass, so any
        # failure simply falls back to the first-pass output.
        logger.warning(f"[{request_id}] Table re-prompt error for page {page_no}: {e}")
        return None
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
# ---------------------------------------------------------------------------
|
| 799 |
-
# Table Detection from VLM Output
|
| 800 |
-
# ---------------------------------------------------------------------------
|
| 801 |
-
|
| 802 |
-
# Markdown table separator row: | --- | --- | or alignment forms such as
# |:---:| / |---:|. Compiled with MULTILINE so ^/$ anchor per line; a match
# anywhere in a page's VLM output marks that page as containing a table.
_MD_TABLE_SEPARATOR = re.compile(
    r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE
)
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
Checks for both markdown table separators and LaTeX tabular markers.
|
| 815 |
-
"""
|
| 816 |
-
table_pages: set[int] = set()
|
| 817 |
-
for page_no, text in vlm_page_texts.items():
|
| 818 |
-
if text and (
|
| 819 |
-
_MD_TABLE_SEPARATOR.search(text) or _LATEX_TABLE_PATTERN.search(text)
|
| 820 |
-
):
|
| 821 |
-
table_pages.add(page_no)
|
| 822 |
-
return table_pages
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
# ---------------------------------------------------------------------------
|
| 826 |
-
# Gemini API: Table Page Extraction
|
| 827 |
-
# ---------------------------------------------------------------------------
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
def _gemini_extract_page(
    page_image_bytes: bytes, request_id: str = "", page_no: int = 0
) -> Optional[str]:
    """Send a page image to the configured Gemini model (GEMINI_MODEL) for
    high-quality extraction.

    Used for table pages where VLM output is insufficient.
    Makes up to two attempts; the second attempt uses a 1.5x timeout.
    A 429 (rate limit) waits 5 seconds before retrying.

    Args:
        page_image_bytes: Rendered page image (PNG bytes).
        request_id: Correlation ID, used only in log messages.
        page_no: 0-indexed page number; logged as page_no + 1.

    Returns:
        The full page markdown (text + tables) with code fences stripped,
        or None on any failure (missing API key, HTTP error, no candidates,
        empty content, or both attempts timing out).
    """
    if not GEMINI_API_KEY:
        logger.warning(f"[{request_id}] GEMINI_API_KEY not set — skipping Gemini extraction")
        return None

    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")

    # REST payload for the generateContent endpoint: one user turn with the
    # inline PNG followed by the extraction instructions.
    payload = {
        "contents": [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": "image/png",
                            "data": b64_image,
                        }
                    },
                    {
                        "text": (
                            "Convert this document page to clean markdown format.\n\n"
                            "Rules:\n"
                            "- Extract ALL text content exactly as written\n"
                            "- Use ## headings for section titles\n"
                            "- Preserve lists, paragraphs, and document structure\n"
                            "- For tables:\n"
                            " * Read EVERY column header exactly as printed\n"
                            " * Include ALL columns even if the table is very wide\n"
                            " * Format as markdown tables with | delimiters and --- separator rows\n"
                            " * Each data row must have the same number of cells as the header\n"
                            " * Preserve multi-line cell content on separate lines within the cell\n"
                            "- Do NOT wrap output in code fences\n"
                            "- Do NOT add image descriptions or [Image:] tags\n"
                            "- Do NOT include page headers, footers, or page numbers\n"
                            "- Output ONLY the extracted markdown content"
                        ),
                    },
                ],
            }
        ],
        "generationConfig": {
            "temperature": 0.1,
            "maxOutputTokens": 32768,
        },
    }

    url = (
        f"https://generativelanguage.googleapis.com/v1beta/models/"
        f"{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
    )

    for attempt in range(1, 3):
        try:
            # Second attempt gets 50% more time.
            timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
            response = httpx.post(url, json=payload, timeout=timeout)

            if response.status_code == 429:
                # Rate limited — wait and retry
                logger.warning(
                    f"[{request_id}] Gemini rate limited on page {page_no + 1}, "
                    f"attempt {attempt}. Waiting 5s..."
                )
                time.sleep(5)
                continue

            if response.status_code != 200:
                # Best-effort extraction of the API error message.
                try:
                    err = response.json()
                    msg = str(err.get("error", {}).get("message", str(err)[:300]))
                except Exception:
                    msg = response.text[:300]
                logger.error(
                    f"[{request_id}] Gemini error ({response.status_code}) "
                    f"page {page_no + 1}: {msg}"
                )
                if attempt == 1:
                    continue
                return None

            result = response.json()
            candidates = result.get("candidates", [])
            if not candidates:
                logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
                return None

            parts = candidates[0].get("content", {}).get("parts", [])
            if not parts:
                return None

            content = parts[0].get("text", "")

            # Clean up: strip code fences if Gemini wraps output
            content = _CODE_FENCE_PATTERN.sub("", content)
            content = _CODE_FENCE_END.sub("", content)

            return content.strip() if content.strip() else None

        except (httpx.TimeoutException, httpx.ConnectError) as e:
            if attempt == 1:
                logger.warning(
                    f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}. Retrying..."
                )
                continue
            logger.error(f"[{request_id}] Gemini failed after 2 attempts on page {page_no + 1}: {e}")
            return None

    # Both attempts consumed by rate-limit retries.
    return None
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
# ---------------------------------------------------------------------------
|
| 946 |
-
# Mini-PDF Extraction (pypdf) β kept for fallback Docling path
|
| 947 |
-
# ---------------------------------------------------------------------------
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
def _extract_pages_to_pdf(
    input_path: Path, page_numbers: list[int], request_id: str
) -> tuple[Path, dict[int, int]]:
    """Build a mini-PDF containing only the requested pages, via pypdf.

    Pages are added in ascending order; out-of-range page numbers are logged
    and skipped (they never misalign the map because sorting puts them last).

    Args:
        input_path: Path to the original PDF.
        page_numbers: 0-indexed page numbers to extract.
        request_id: Request ID for logging.

    Returns:
        (mini_pdf_path, page_map) where page_map maps Docling 1-indexed
        page numbers in the mini-PDF back to 0-indexed original page numbers.
    """
    from pypdf import PdfReader, PdfWriter

    source = PdfReader(str(input_path))
    builder = PdfWriter()
    total = len(source.pages)

    # {docling_page_no (1-indexed in mini-PDF) -> original_page_no (0-indexed)}
    page_map: dict[int, int] = {}

    for idx, orig_page in enumerate(sorted(page_numbers)):
        if orig_page >= total:
            logger.warning(
                f"[{request_id}] Page {orig_page} out of range (total: {len(source.pages)})"
            )
            continue
        builder.add_page(source.pages[orig_page])
        page_map[idx + 1] = orig_page  # Docling uses 1-indexed pages

    mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
    with open(mini_pdf_path, "wb") as out_file:
        builder.write(out_file)

    logger.info(
        f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original"
    )
    return mini_pdf_path, page_map
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
# ---------------------------------------------------------------------------
|
| 992 |
-
# Table Extraction from Docling
|
| 993 |
-
# ---------------------------------------------------------------------------
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
def _extract_table_markdowns(doc, page_map: dict[int, int]) -> dict[int, list[str]]:
    """Extract table markdown from Docling document, keyed by ORIGINAL page number.

    Uses page_map to translate from Docling's 1-indexed mini-PDF pages
    back to the original 0-indexed page numbers.

    Args:
        doc: Docling document (must provide iterate_items()).
        page_map: {mini-PDF 1-indexed page -> original 0-indexed page}.
            Pages missing from the map fall back to ``docling_page - 1``.

    Returns:
        {original_page_no: [table_markdown, ...]} in iteration order.
    """
    tables_by_page: dict[int, list[str]] = {}
    for element, _ in doc.iterate_items():
        if isinstance(element, TableItem):
            # Elements without provenance get sentinel page -1 (maps to -2).
            docling_page = element.prov[0].page_no if element.prov else -1
            # Translate mini-PDF page -> original page
            orig_page = page_map.get(docling_page, docling_page - 1)
            table_md = element.export_to_markdown(doc=doc)
            # Idiom fix: setdefault replaces the manual membership check.
            tables_by_page.setdefault(orig_page, []).append(table_md)
    return tables_by_page
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
def _extract_docling_page_markdown(doc, page_map: dict[int, int]) -> dict[int, str]:
|
| 1016 |
-
"""Extract complete per-page markdown from Docling document.
|
| 1017 |
-
|
| 1018 |
-
Returns dict mapping ORIGINAL page numbers (0-indexed) to complete markdown
|
| 1019 |
-
content including text, headings, and tables as Docling understands them.
|
| 1020 |
-
This is used as the PRIMARY output for table pages, replacing the VLM text
|
| 1021 |
-
entirely for better table structure.
|
| 1022 |
-
"""
|
| 1023 |
-
pages: dict[int, list[str]] = {}
|
| 1024 |
-
|
| 1025 |
-
for element, _ in doc.iterate_items():
|
| 1026 |
-
if not element.prov:
|
| 1027 |
-
continue
|
| 1028 |
-
docling_page = element.prov[0].page_no
|
| 1029 |
-
orig_page = page_map.get(docling_page, docling_page - 1)
|
| 1030 |
-
|
| 1031 |
-
md = element.export_to_markdown(doc=doc)
|
| 1032 |
-
if md and md.strip():
|
| 1033 |
-
if orig_page not in pages:
|
| 1034 |
-
pages[orig_page] = []
|
| 1035 |
-
pages[orig_page].append(md)
|
| 1036 |
-
|
| 1037 |
-
return {pg: "\n\n".join(parts) for pg, parts in pages.items()}
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
# ---------------------------------------------------------------------------
|
| 1041 |
-
# Merge: VLM Text + TableFormer Tables
|
| 1042 |
-
# ---------------------------------------------------------------------------
|
| 1043 |
-
|
| 1044 |
-
# One or more consecutive lines that start and end with "|" — a markdown
# pipe-table block (header + separator + data rows). MULTILINE makes ^/$
# per-line anchors so the group spans the whole run of table rows.
_VLM_TABLE_BLOCK = re.compile(r"((?:^\|[^\n]+\|$\n?)+)", re.MULTILINE)

# A LaTeX tabular environment the VLM may emit despite the prompt rules;
# DOTALL lets ".*?" span newlines between \begin{tabular} and \end{tabular}.
_VLM_LATEX_BLOCK = re.compile(
    r"(\\begin\{tabular\}.*?\\end\{tabular\})", re.DOTALL
)
|
| 1051 |
|
| 1052 |
|
| 1053 |
-
def _extract_table_blocks(text: str) -> list[str]:
    """Collect every table block — markdown pipe tables and LaTeX tabular
    environments — found in the text, in document order.

    Overlapping matches are dropped: a match is kept only when it starts at
    or after the end of the previously kept match.

    Returns:
        List of stripped table-block strings (header + separator + data rows).
    """
    spans: list[tuple[int, int, str]] = []
    for pattern in (_VLM_TABLE_BLOCK, _VLM_LATEX_BLOCK):
        spans.extend(
            (m.start(), m.end(), m.group(0)) for m in pattern.finditer(text)
        )
    # Stable sort keeps markdown matches ahead of LaTeX at equal offsets.
    spans.sort(key=lambda span: span[0])

    blocks: list[str] = []
    cursor = -1
    for start, end, chunk in spans:
        if start < cursor:
            continue  # overlaps the previously accepted block
        blocks.append(chunk.strip())
        cursor = end

    return blocks
|
| 1074 |
-
|
| 1075 |
-
|
| 1076 |
-
def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list[str]) -> str:
    """Replace VLM's table sections with more accurate tables.

    Handles both markdown pipe tables and LaTeX tabular blocks in VLM output.
    Used for both TableFormer tables (Pass 2) and re-prompted VLM tables (Pass 1.5).

    Matching is positional: the i-th table block detected in vlm_text is
    replaced by table_markdowns[i]. Extra VLM tables (beyond the replacements)
    are kept as-is; extra replacement tables are appended at the end.

    Args:
        vlm_text: Page markdown produced by the VLM.
        table_markdowns: Replacement tables, in page order.

    Returns:
        vlm_text with table blocks substituted, or vlm_text unchanged when
        table_markdowns is empty.
    """
    if not table_markdowns:
        return vlm_text

    # Find all table blocks (markdown first, then LaTeX)
    md_tables = list(_VLM_TABLE_BLOCK.finditer(vlm_text))
    latex_tables = list(_VLM_LATEX_BLOCK.finditer(vlm_text))

    # Combine and sort all table positions
    all_tables = [(m.start(), m.end(), "md") for m in md_tables]
    all_tables += [(m.start(), m.end(), "latex") for m in latex_tables]
    all_tables.sort(key=lambda x: x[0])

    # Remove overlapping matches (prefer earlier match)
    filtered: list[tuple[int, int, str]] = []
    last_end = -1
    for start, end, kind in all_tables:
        if start >= last_end:
            filtered.append((start, end, kind))
            last_end = end

    vlm_table_count = len(filtered)
    tf_table_count = len(table_markdowns)

    if vlm_table_count != tf_table_count:
        # Count mismatch is tolerated: replacement proceeds positionally for
        # however many pairs line up.
        logger.warning(
            f"Table count mismatch: VLM={vlm_table_count}, TableFormer={tf_table_count}. "
            f"Using positional replacement for min({vlm_table_count}, {tf_table_count}) tables."
        )

    # Replace VLM tables with TableFormer tables (positional)
    result_parts: list[str] = []
    prev_end = 0
    table_idx = 0

    for start, end, kind in filtered:
        # Keep the non-table text preceding this table block.
        result_parts.append(vlm_text[prev_end:start])
        if table_idx < tf_table_count:
            result_parts.append(table_markdowns[table_idx].strip() + "\n")
            table_idx += 1
        else:
            # More VLM tables than TableFormer — keep VLM version
            result_parts.append(vlm_text[start:end])
        prev_end = end

    # Trailing text after the last table block.
    result_parts.append(vlm_text[prev_end:])

    # If there are remaining TableFormer tables not matched, append them
    while table_idx < tf_table_count:
        result_parts.append("\n\n" + table_markdowns[table_idx].strip() + "\n")
        table_idx += 1

    return "".join(result_parts)
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
# ---------------------------------------------------------------------------
|
| 1137 |
-
# PDF to Page Images (parallel, optimized)
|
| 1138 |
-
# ---------------------------------------------------------------------------
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
def _render_single_page(
    input_path: Path, page_idx: int, dpi: int
) -> tuple[int, Optional[bytes]]:
    """Render a single PDF page to PNG bytes with CLAHE preprocessing.

    The page is rendered via pdf2image (poppler), written to a temp file,
    preprocessed in place by _preprocess_image_for_ocr, read back, and the
    temp file is always removed.

    Args:
        input_path: Path to the source PDF.
        page_idx: 0-indexed page to render (pdf2image uses 1-indexed pages).
        dpi: Render resolution.

    Returns:
        (page_idx, png_bytes) or (page_idx, None) on failure.
    """
    try:
        images = convert_from_path(
            str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
        )
        if not images:
            return page_idx, None

        img = images[0]
        # delete=False so the file survives the `with` and can be reopened;
        # cleanup is handled by the finally below.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name
            img.save(tmp_path, format="PNG")

        try:
            # Preprocess in place (CLAHE), then read the final bytes back.
            _preprocess_image_for_ocr(tmp_path)
            with open(tmp_path, "rb") as f:
                return page_idx, f.read()
        finally:
            os.unlink(tmp_path)
    except Exception as e:
        # Best-effort: a failed page is reported as None, never raised.
        logger.warning(f"Failed to render page {page_idx + 1}: {e}")
        return page_idx, None
|
| 1169 |
-
|
| 1170 |
-
|
| 1171 |
-
def _pdf_to_page_images(
    input_path: Path,
    request_id: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
) -> list[tuple[int, bytes]]:
    """Convert PDF pages to PNG image bytes using parallel rendering.

    Uses ThreadPoolExecutor for concurrent page rendering.

    Args:
        input_path: Path to the source PDF.
        request_id: Correlation ID for logging.
        start_page: First 0-indexed page to render (inclusive).
        end_page: Last 0-indexed page to render (inclusive); None = to the end.

    Returns:
        List of (page_no, png_bytes) tuples, sorted by page number. Pages
        that failed to render are omitted; returns [] if the PDF metadata
        cannot be read at all.
    """
    try:
        from pdf2image.pdf2image import pdfinfo_from_path

        info = pdfinfo_from_path(str(input_path))
        total_pages = info["Pages"]
        # end_page is inclusive, so +1; clamp to the document length.
        last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
    except Exception as e:
        logger.warning(f"[{request_id}] Could not get PDF info: {e}")
        return []

    page_indices = list(range(start_page, last_page))

    start_time = time.time()
    page_images: list[tuple[int, bytes]] = []

    # Render pages in parallel (4 threads — I/O bound, not CPU bound for poppler)
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(_render_single_page, input_path, idx, RENDER_DPI): idx
            for idx in page_indices
        }
        for future in as_completed(futures):
            # _render_single_page never raises; failures arrive as (idx, None).
            page_idx, png_bytes = future.result()
            if png_bytes is not None:
                page_images.append((page_idx, png_bytes))

    # as_completed yields out of order; restore page order for the caller.
    page_images.sort(key=lambda x: x[0])
    render_time = time.time() - start_time
    logger.info(
        f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
        f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
    )
    return page_images
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
# ---------------------------------------------------------------------------
|
| 1218 |
-
# Docling Converter (for TableFormer only)
|
| 1219 |
-
# ---------------------------------------------------------------------------
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
def _create_converter(images_scale: float = 2.0) -> DocumentConverter:
    """Create a Docling converter with Standard Pipeline.

    Used ONLY for TableFormer on table pages (not for full document OCR).

    Args:
        images_scale: Scale factor for generated page/picture images.

    Returns:
        A configured DocumentConverter using the DoclingParseV4 backend with
        ACCURATE TableFormer mode and forced full-page RapidOCR.
    """
    device = _get_device()
    logger.info(f"Creating converter with device: {device}")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    # ACCURATE mode + cell matching: slower but best table fidelity.
    pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
    pipeline_options.table_structure_options.do_cell_matching = True

    # Force OCR over the entire page rather than only detected text regions.
    pipeline_options.ocr_options = RapidOcrOptions()
    pipeline_options.ocr_options.force_full_page_ocr = True

    pipeline_options.generate_page_images = True
    pipeline_options.images_scale = images_scale
    pipeline_options.generate_picture_images = True

    # num_threads=0 on CUDA lets the accelerator pick; 4 threads on CPU.
    pipeline_options.accelerator_options = AcceleratorOptions(
        device=device,
        num_threads=0 if device == "cuda" else 4,
    )

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=DoclingParseV4DocumentBackend,
            )
        }
    )
    return converter
|
| 1257 |
-
|
| 1258 |
-
|
| 1259 |
-
def _get_converter() -> DocumentConverter:
    """Return the process-wide converter, creating it lazily on first use."""
    global _converter
    if _converter is not None:
        return _converter
    # First call: build the singleton with the module-configured image scale.
    _converter = _create_converter(images_scale=IMAGES_SCALE)
    return _converter
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
# ---------------------------------------------------------------------------
|
| 1268 |
-
# VLM-First Conversion (Pass 1: VLM, Pass 2: TableFormer, Merge)
|
| 1269 |
-
# ---------------------------------------------------------------------------
|
| 1270 |
-
|
| 1271 |
-
|
| 1272 |
-
def _convert_document(
|
| 1273 |
-
input_path: Path,
|
| 1274 |
-
output_dir: Path,
|
| 1275 |
-
images_scale: float,
|
| 1276 |
-
include_images: bool,
|
| 1277 |
-
request_id: str,
|
| 1278 |
-
start_page: int = 0,
|
| 1279 |
-
end_page: Optional[int] = None,
|
| 1280 |
-
) -> tuple:
|
| 1281 |
-
"""
|
| 1282 |
-
VLM-first hybrid conversion.
|
| 1283 |
-
|
| 1284 |
-
Pass 1 (GPU): VLM OCR on ALL pages (fast, concurrent)
|
| 1285 |
-
Detect: Find table pages from VLM markdown output
|
| 1286 |
-
Pass 2 (CPU): Docling TableFormer ONLY on table pages (mini-PDF)
|
| 1287 |
-
Merge: VLM text for all pages + TableFormer tables
|
| 1288 |
-
|
| 1289 |
-
Returns: (markdown_content, json_content, pages_processed, image_count)
|
| 1290 |
-
"""
|
| 1291 |
-
overall_start = time.time()
|
| 1292 |
-
|
| 1293 |
-
# ---- RENDER ALL PAGES ----
|
| 1294 |
-
page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
|
| 1295 |
-
|
| 1296 |
-
if not page_images:
|
| 1297 |
-
logger.warning(f"[{request_id}] No page images β falling back to full Docling pipeline")
|
| 1298 |
-
return _convert_document_full_docling(
|
| 1299 |
-
input_path, output_dir, images_scale, include_images, request_id
|
| 1300 |
-
)
|
| 1301 |
-
|
| 1302 |
-
render_time = time.time() - overall_start
|
| 1303 |
-
|
| 1304 |
-
# ---- PASS 1: VLM OCR ALL PAGES (GPU, concurrent) ----
|
| 1305 |
-
logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
|
| 1306 |
-
logger.info(f"[{request_id}] Sending {len(page_images)} pages to VLM ({VLM_CONCURRENCY} concurrent)")
|
| 1307 |
-
|
| 1308 |
-
vlm_page_texts: dict[int, Optional[str]] = {}
|
| 1309 |
-
vlm_start = time.time()
|
| 1310 |
-
|
| 1311 |
-
with ThreadPoolExecutor(max_workers=VLM_CONCURRENCY) as executor:
|
| 1312 |
-
future_to_page = {
|
| 1313 |
-
executor.submit(_vlm_ocr_page, page_bytes, request_id, page_no + 1): page_no
|
| 1314 |
-
for page_no, page_bytes in page_images
|
| 1315 |
-
}
|
| 1316 |
-
for future in as_completed(future_to_page):
|
| 1317 |
-
page_no = future_to_page[future]
|
| 1318 |
-
try:
|
| 1319 |
-
vlm_text = future.result()
|
| 1320 |
-
vlm_page_texts[page_no] = vlm_text
|
| 1321 |
-
logger.info(
|
| 1322 |
-
f"[{request_id}] VLM processed page {page_no + 1} ({len(vlm_text)} chars)"
|
| 1323 |
-
)
|
| 1324 |
-
except Exception as e:
|
| 1325 |
-
logger.warning(f"[{request_id}] VLM failed on page {page_no + 1}: {e}")
|
| 1326 |
-
vlm_page_texts[page_no] = None
|
| 1327 |
-
|
| 1328 |
-
vlm_time = time.time() - vlm_start
|
| 1329 |
-
logger.info(f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)")
|
| 1330 |
-
|
| 1331 |
-
# ---- DETECT TABLE PAGES ----
|
| 1332 |
-
table_pages = _detect_table_pages(vlm_page_texts)
|
| 1333 |
-
|
| 1334 |
-
if table_pages:
|
| 1335 |
-
logger.info(
|
| 1336 |
-
f"[{request_id}] Tables detected on {len(table_pages)} pages: "
|
| 1337 |
-
f"{sorted(p + 1 for p in table_pages)}"
|
| 1338 |
-
)
|
| 1339 |
-
else:
|
| 1340 |
-
logger.info(f"[{request_id}] No tables detected β skipping table re-prompting")
|
| 1341 |
-
|
| 1342 |
-
# ---- PASS 2: GEMINI 2.5 FLASH ON TABLE PAGES ----
|
| 1343 |
-
gemini_page_texts: dict[int, str] = {}
|
| 1344 |
-
gemini_time = 0.0
|
| 1345 |
-
|
| 1346 |
-
if table_pages and GEMINI_API_KEY:
|
| 1347 |
-
logger.info(
|
| 1348 |
-
f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(table_pages)} table pages"
|
| 1349 |
-
)
|
| 1350 |
-
gemini_start = time.time()
|
| 1351 |
-
|
| 1352 |
-
# Build lookup: page_no β image bytes
|
| 1353 |
-
page_image_map = {pno: pbytes for pno, pbytes in page_images}
|
| 1354 |
-
|
| 1355 |
-
with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
|
| 1356 |
-
future_to_page = {
|
| 1357 |
-
executor.submit(
|
| 1358 |
-
_gemini_extract_page,
|
| 1359 |
-
page_image_map[page_no],
|
| 1360 |
-
request_id,
|
| 1361 |
-
page_no,
|
| 1362 |
-
): page_no
|
| 1363 |
-
for page_no in sorted(table_pages)
|
| 1364 |
-
if page_no in page_image_map
|
| 1365 |
-
}
|
| 1366 |
-
for future in as_completed(future_to_page):
|
| 1367 |
-
page_no = future_to_page[future]
|
| 1368 |
-
try:
|
| 1369 |
-
gemini_text = future.result()
|
| 1370 |
-
if gemini_text:
|
| 1371 |
-
gemini_page_texts[page_no] = gemini_text
|
| 1372 |
-
logger.info(
|
| 1373 |
-
f"[{request_id}] Gemini processed page {page_no + 1} "
|
| 1374 |
-
f"({len(gemini_text)} chars)"
|
| 1375 |
-
)
|
| 1376 |
-
else:
|
| 1377 |
-
logger.warning(
|
| 1378 |
-
f"[{request_id}] Gemini returned empty for page {page_no + 1} "
|
| 1379 |
-
f"β falling back to VLM"
|
| 1380 |
-
)
|
| 1381 |
-
except Exception as e:
|
| 1382 |
-
logger.warning(
|
| 1383 |
-
f"[{request_id}] Gemini failed on page {page_no + 1}: {e} "
|
| 1384 |
-
f"β falling back to VLM"
|
| 1385 |
-
)
|
| 1386 |
-
|
| 1387 |
-
gemini_time = time.time() - gemini_start
|
| 1388 |
-
logger.info(
|
| 1389 |
-
f"[{request_id}] Pass 2 completed in {gemini_time:.2f}s β "
|
| 1390 |
-
f"{len(gemini_page_texts)}/{len(table_pages)} table pages extracted via Gemini"
|
| 1391 |
-
)
|
| 1392 |
-
elif table_pages and not GEMINI_API_KEY:
|
| 1393 |
-
logger.warning(
|
| 1394 |
-
f"[{request_id}] GEMINI_API_KEY not set β table pages will use VLM output only"
|
| 1395 |
-
)
|
| 1396 |
-
|
| 1397 |
-
# ---- MERGE: VLM TEXT (non-table pages) + GEMINI (table pages) ----
|
| 1398 |
-
md_parts: list[str] = []
|
| 1399 |
-
image_count = 0
|
| 1400 |
-
|
| 1401 |
-
for page_no in sorted(vlm_page_texts.keys()):
|
| 1402 |
-
if md_parts:
|
| 1403 |
-
md_parts.append("\n\n")
|
| 1404 |
-
|
| 1405 |
-
if page_no in gemini_page_texts:
|
| 1406 |
-
# Table page β use Gemini's superior output
|
| 1407 |
-
md_parts.append(gemini_page_texts[page_no])
|
| 1408 |
-
elif vlm_page_texts[page_no] is not None:
|
| 1409 |
-
# Non-table page or Gemini fallback β use VLM output
|
| 1410 |
-
md_parts.append(vlm_page_texts[page_no])
|
| 1411 |
-
else:
|
| 1412 |
-
md_parts.append(f"[Page {page_no + 1}: extraction failed]\n\n")
|
| 1413 |
-
|
| 1414 |
-
markdown_content = "".join(md_parts)
|
| 1415 |
-
|
| 1416 |
-
# Post-process: fix cross-page artifacts, deduplicate headers, clean tables
|
| 1417 |
-
if len(vlm_page_texts) > 1:
|
| 1418 |
-
markdown_content = _post_process_merged_markdown(markdown_content)
|
| 1419 |
-
|
| 1420 |
-
pages_processed = len(vlm_page_texts)
|
| 1421 |
-
total_time = time.time() - overall_start
|
| 1422 |
-
|
| 1423 |
-
logger.info(
|
| 1424 |
-
f"[{request_id}] VLM+Gemini conversion complete: {pages_processed} pages β "
|
| 1425 |
-
f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
|
| 1426 |
-
f"Gemini {gemini_time:.1f}s = {total_time:.2f}s total"
|
| 1427 |
-
)
|
| 1428 |
-
if pages_processed > 0:
|
| 1429 |
-
logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
|
| 1430 |
-
|
| 1431 |
-
return markdown_content, None, pages_processed, image_count
|
| 1432 |
-
|
| 1433 |
-
|
| 1434 |
-
def _convert_document_full_docling(
|
| 1435 |
-
input_path: Path,
|
| 1436 |
-
output_dir: Path,
|
| 1437 |
-
images_scale: float,
|
| 1438 |
-
include_images: bool,
|
| 1439 |
-
request_id: str,
|
| 1440 |
-
) -> tuple:
|
| 1441 |
-
"""Fallback: full Docling pipeline when page images are unavailable."""
|
| 1442 |
-
logger.info(f"[{request_id}] Fallback: running full Docling pipeline")
|
| 1443 |
-
converter = _get_converter()
|
| 1444 |
-
|
| 1445 |
-
start_time = time.time()
|
| 1446 |
-
result = converter.convert(input_path)
|
| 1447 |
-
doc = result.document
|
| 1448 |
-
if doc is None:
|
| 1449 |
-
raise ValueError("Docling failed to parse document")
|
| 1450 |
-
|
| 1451 |
-
elapsed = time.time() - start_time
|
| 1452 |
-
logger.info(f"[{request_id}] Full Docling pipeline completed in {elapsed:.2f}s")
|
| 1453 |
-
|
| 1454 |
-
markdown_content = doc.export_to_markdown()
|
| 1455 |
-
pages_processed = len(
|
| 1456 |
-
set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
|
| 1457 |
-
)
|
| 1458 |
-
|
| 1459 |
-
image_count = 0
|
| 1460 |
-
if include_images:
|
| 1461 |
-
image_dir = output_dir / "images"
|
| 1462 |
-
image_dir.mkdir(parents=True, exist_ok=True)
|
| 1463 |
-
for element, _ in doc.iterate_items():
|
| 1464 |
-
if isinstance(element, PictureItem):
|
| 1465 |
-
if element.image and element.image.pil_image:
|
| 1466 |
-
pg = element.prov[0].page_no if element.prov else 0
|
| 1467 |
-
image_id = element.self_ref.split("/")[-1]
|
| 1468 |
-
image_name = f"page_{pg}_{image_id}.png"
|
| 1469 |
-
image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
|
| 1470 |
-
image_path = image_dir / image_name
|
| 1471 |
-
try:
|
| 1472 |
-
element.image.pil_image.save(image_path, format="PNG")
|
| 1473 |
-
image_count += 1
|
| 1474 |
-
except Exception:
|
| 1475 |
-
pass
|
| 1476 |
-
|
| 1477 |
-
return markdown_content, None, pages_processed, image_count
|
| 1478 |
-
|
| 1479 |
-
|
| 1480 |
-
# ---------------------------------------------------------------------------
|
| 1481 |
-
# Images Zip Helper
|
| 1482 |
-
# ---------------------------------------------------------------------------
|
| 1483 |
-
|
| 1484 |
-
|
| 1485 |
-
def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
| 1486 |
-
"""Create a zip file from extracted images."""
|
| 1487 |
-
image_dir = output_dir / "images"
|
| 1488 |
-
if not image_dir.exists():
|
| 1489 |
-
return None, 0
|
| 1490 |
-
|
| 1491 |
-
image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
|
| 1492 |
-
zip_buffer = io.BytesIO()
|
| 1493 |
-
image_count = 0
|
| 1494 |
-
|
| 1495 |
-
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
|
| 1496 |
-
for img_path in image_dir.glob("*"):
|
| 1497 |
-
if img_path.is_file() and img_path.suffix.lower() in image_extensions:
|
| 1498 |
-
try:
|
| 1499 |
-
zf.write(img_path, f"images/{img_path.name}")
|
| 1500 |
-
image_count += 1
|
| 1501 |
-
except Exception as e:
|
| 1502 |
-
logger.warning(f"Failed to add image {img_path} to zip: {e}")
|
| 1503 |
-
|
| 1504 |
-
if image_count == 0:
|
| 1505 |
-
return None, 0
|
| 1506 |
-
|
| 1507 |
-
return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
|
| 1508 |
-
|
| 1509 |
-
|
| 1510 |
# ---------------------------------------------------------------------------
|
| 1511 |
# Application Lifespan
|
| 1512 |
# ---------------------------------------------------------------------------
|
|
@@ -1514,23 +56,13 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
|
| 1514 |
|
| 1515 |
@asynccontextmanager
|
| 1516 |
async def lifespan(app: FastAPI):
|
| 1517 |
-
"""Startup: initialize
|
| 1518 |
logger.info("=" * 60)
|
| 1519 |
-
logger.info("Starting Docling VLM Parser API
|
| 1520 |
-
|
| 1521 |
-
|
| 1522 |
-
logger.info(
|
| 1523 |
-
|
| 1524 |
-
if device == "cuda":
|
| 1525 |
-
logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 1526 |
-
logger.info(f"CUDA Version: {torch.version.cuda}")
|
| 1527 |
-
logger.info(
|
| 1528 |
-
f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB"
|
| 1529 |
-
)
|
| 1530 |
|
| 1531 |
-
logger.info(f"VLM Model: {VLM_MODEL}")
|
| 1532 |
-
logger.info(f"VLM Endpoint: http://{VLM_HOST}:{VLM_PORT}")
|
| 1533 |
-
logger.info(f"VLM Timeout: {VLM_TIMEOUT}s, Concurrency: {VLM_CONCURRENCY}")
|
| 1534 |
logger.info(f"Render DPI: {RENDER_DPI}")
|
| 1535 |
logger.info(f"Images scale: {IMAGES_SCALE}")
|
| 1536 |
logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
|
|
@@ -1538,27 +70,8 @@ async def lifespan(app: FastAPI):
|
|
| 1538 |
logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
|
| 1539 |
logger.info(f"Gemini Timeout: {GEMINI_TIMEOUT}s, Concurrency: {GEMINI_CONCURRENCY}")
|
| 1540 |
|
| 1541 |
-
# Verify vLLM is running
|
| 1542 |
-
logger.info("Checking vLLM server...")
|
| 1543 |
-
try:
|
| 1544 |
-
async with httpx.AsyncClient(timeout=10) as client:
|
| 1545 |
-
resp = await client.get(f"http://{VLM_HOST}:{VLM_PORT}/health")
|
| 1546 |
-
resp.raise_for_status()
|
| 1547 |
-
logger.info("vLLM server is healthy")
|
| 1548 |
-
except Exception as e:
|
| 1549 |
-
logger.error(f"vLLM server not available: {e}")
|
| 1550 |
-
raise RuntimeError(f"vLLM server not available at {VLM_HOST}:{VLM_PORT}")
|
| 1551 |
-
|
| 1552 |
-
# Pre-initialize Docling converter
|
| 1553 |
-
logger.info("Pre-loading Docling models (DocLayNet + TableFormer + RapidOCR)...")
|
| 1554 |
-
try:
|
| 1555 |
-
_get_converter()
|
| 1556 |
-
logger.info("Docling models loaded successfully")
|
| 1557 |
-
except Exception as e:
|
| 1558 |
-
logger.warning(f"Failed to pre-load Docling models: {e}")
|
| 1559 |
-
|
| 1560 |
logger.info("=" * 60)
|
| 1561 |
-
logger.info("Docling VLM Parser API ready (
|
| 1562 |
logger.info("=" * 60)
|
| 1563 |
yield
|
| 1564 |
logger.info("Shutting down Docling VLM Parser API...")
|
|
@@ -1570,8 +83,8 @@ async def lifespan(app: FastAPI):
|
|
| 1570 |
|
| 1571 |
app = FastAPI(
|
| 1572 |
title="Docling VLM Parser API",
|
| 1573 |
-
description="
|
| 1574 |
-
version="
|
| 1575 |
lifespan=lifespan,
|
| 1576 |
)
|
| 1577 |
|
|
@@ -1584,23 +97,11 @@ app = FastAPI(
|
|
| 1584 |
@app.get("/", response_model=HealthResponse)
|
| 1585 |
async def health_check() -> HealthResponse:
|
| 1586 |
"""Health check endpoint."""
|
| 1587 |
-
device = _get_device()
|
| 1588 |
-
|
| 1589 |
-
vlm_status = "unknown"
|
| 1590 |
-
try:
|
| 1591 |
-
async with httpx.AsyncClient(timeout=5) as client:
|
| 1592 |
-
resp = await client.get(f"http://{VLM_HOST}:{VLM_PORT}/health")
|
| 1593 |
-
vlm_status = "healthy" if resp.status_code == 200 else "unhealthy"
|
| 1594 |
-
except Exception:
|
| 1595 |
-
vlm_status = "unreachable"
|
| 1596 |
-
|
| 1597 |
return HealthResponse(
|
| 1598 |
status="healthy",
|
| 1599 |
-
version="
|
| 1600 |
-
|
| 1601 |
-
|
| 1602 |
-
vlm_model=f"active (gemini: {'configured' if GEMINI_API_KEY else 'not set'})",
|
| 1603 |
-
vlm_status=vlm_status,
|
| 1604 |
images_scale=IMAGES_SCALE,
|
| 1605 |
)
|
| 1606 |
|
|
@@ -1615,7 +116,7 @@ async def parse_document(
|
|
| 1615 |
include_images: bool = Form(default=False, description="Include extracted images"),
|
| 1616 |
_token: str = Depends(verify_token),
|
| 1617 |
) -> ParseResponse:
|
| 1618 |
-
"""Parse a document file using
|
| 1619 |
request_id = str(uuid4())[:8]
|
| 1620 |
start_time = time.time()
|
| 1621 |
|
|
@@ -1654,9 +155,7 @@ async def parse_document(
|
|
| 1654 |
detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
|
| 1655 |
)
|
| 1656 |
|
| 1657 |
-
|
| 1658 |
-
|
| 1659 |
-
logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
|
| 1660 |
logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
|
| 1661 |
|
| 1662 |
temp_dir = tempfile.mkdtemp()
|
|
@@ -1672,7 +171,6 @@ async def parse_document(
|
|
| 1672 |
_convert_document,
|
| 1673 |
input_path,
|
| 1674 |
output_dir,
|
| 1675 |
-
use_images_scale,
|
| 1676 |
include_images,
|
| 1677 |
request_id,
|
| 1678 |
start_page,
|
|
@@ -1699,8 +197,8 @@ async def parse_document(
|
|
| 1699 |
images_zip=images_zip,
|
| 1700 |
image_count=image_count,
|
| 1701 |
pages_processed=pages_processed,
|
| 1702 |
-
device_used=
|
| 1703 |
-
vlm_model=
|
| 1704 |
)
|
| 1705 |
|
| 1706 |
except Exception as e:
|
|
@@ -1722,7 +220,7 @@ async def parse_document_from_url(
|
|
| 1722 |
request: URLParseRequest,
|
| 1723 |
_token: str = Depends(verify_token),
|
| 1724 |
) -> ParseResponse:
|
| 1725 |
-
"""Parse a document from a URL using
|
| 1726 |
request_id = str(uuid4())[:8]
|
| 1727 |
start_time = time.time()
|
| 1728 |
|
|
@@ -1782,9 +280,7 @@ async def parse_document_from_url(
|
|
| 1782 |
output_dir = Path(temp_dir) / "output"
|
| 1783 |
output_dir.mkdir(exist_ok=True)
|
| 1784 |
|
| 1785 |
-
|
| 1786 |
-
|
| 1787 |
-
logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
|
| 1788 |
logger.info(
|
| 1789 |
f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
|
| 1790 |
)
|
|
@@ -1793,7 +289,6 @@ async def parse_document_from_url(
|
|
| 1793 |
_convert_document,
|
| 1794 |
input_path,
|
| 1795 |
output_dir,
|
| 1796 |
-
use_images_scale,
|
| 1797 |
request.include_images,
|
| 1798 |
request_id,
|
| 1799 |
request.start_page,
|
|
@@ -1820,8 +315,8 @@ async def parse_document_from_url(
|
|
| 1820 |
images_zip=images_zip,
|
| 1821 |
image_count=image_count,
|
| 1822 |
pages_processed=pages_processed,
|
| 1823 |
-
device_used=
|
| 1824 |
-
vlm_model=
|
| 1825 |
)
|
| 1826 |
|
| 1827 |
except httpx.HTTPError as e:
|
|
|
|
| 1 |
"""
|
| 2 |
+
Docling VLM Parser API v5.0.0
|
| 3 |
+
|
| 4 |
+
A FastAPI service using a PaddleOCR-VL-1.5 + Gemini hybrid architecture for document parsing:
|
| 5 |
+
Pass 1 (GPU): PaddleOCR-VL-1.5 on full PDF (native document parsing, 0.9B params)
|
| 6 |
+
Pass 2 (API): Gemini 3 Flash on table pages only (highest quality tables)
|
| 7 |
+
Post: Cross-page artifact removal, table cleanup, deduplication, footer removal
|
| 8 |
+
|
| 9 |
+
v5.0.0 β PaddleOCR-VL-1.5 + Gemini hybrid:
|
| 10 |
+
- Core: PaddleOCR-VL-1.5 replaces Qwen3-VL + Docling entirely
|
| 11 |
+
- Quality: Gemini 3 Flash used ONLY for pages with tables (better table accuracy)
|
| 12 |
+
- Speed: PaddleOCR handles PDF natively β no separate image rendering for OCR
|
| 13 |
+
- GPU: Runs on T4 (16GB VRAM) β much smaller than A100 requirement
|
| 14 |
+
- Quality: Enhanced post-processing β aggressive footer/artifact removal
|
|
|
|
|
|
|
| 15 |
"""
|
| 16 |
|
| 17 |
import asyncio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
import re
|
|
|
|
| 19 |
import shutil
|
|
|
|
| 20 |
import tempfile
|
| 21 |
import time
|
|
|
|
|
|
|
| 22 |
from contextlib import asynccontextmanager
|
| 23 |
from pathlib import Path
|
| 24 |
+
from typing import Optional
|
|
|
|
| 25 |
from uuid import uuid4
|
| 26 |
|
|
|
|
| 27 |
import httpx
|
|
|
|
| 28 |
from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
from auth import _validate_url, verify_token
|
| 31 |
+
from config import (
|
| 32 |
+
GEMINI_API_KEY,
|
| 33 |
+
GEMINI_CONCURRENCY,
|
| 34 |
+
GEMINI_MODEL,
|
| 35 |
+
GEMINI_TIMEOUT,
|
| 36 |
+
IMAGES_SCALE,
|
| 37 |
+
MAX_FILE_SIZE_BYTES,
|
| 38 |
+
MAX_FILE_SIZE_MB,
|
| 39 |
+
RENDER_DPI,
|
| 40 |
+
logger,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
)
|
| 42 |
+
from models import HealthResponse, ParseResponse, URLParseRequest
|
| 43 |
+
from pipeline import (
|
| 44 |
+
_convert_document,
|
| 45 |
+
_create_images_zip,
|
| 46 |
+
_get_pipeline,
|
| 47 |
+
_save_downloaded_content,
|
| 48 |
+
_save_uploaded_file,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
)
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
# ---------------------------------------------------------------------------
|
| 53 |
# Application Lifespan
|
| 54 |
# ---------------------------------------------------------------------------
|
|
|
|
| 56 |
|
| 57 |
@asynccontextmanager
|
| 58 |
async def lifespan(app: FastAPI):
|
| 59 |
+
"""Startup: initialize PaddleOCR-VL-1.5 pipeline."""
|
| 60 |
logger.info("=" * 60)
|
| 61 |
+
logger.info("Starting Docling VLM Parser API v5.0.0...")
|
| 62 |
+
logger.info("Initializing PaddleOCR-VL-1.5 pipeline...")
|
| 63 |
+
_get_pipeline()
|
| 64 |
+
logger.info("PaddleOCR-VL-1.5 ready")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
|
|
|
|
|
|
|
|
|
| 66 |
logger.info(f"Render DPI: {RENDER_DPI}")
|
| 67 |
logger.info(f"Images scale: {IMAGES_SCALE}")
|
| 68 |
logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
|
|
|
|
| 70 |
logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
|
| 71 |
logger.info(f"Gemini Timeout: {GEMINI_TIMEOUT}s, Concurrency: {GEMINI_CONCURRENCY}")
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
logger.info("=" * 60)
|
| 74 |
+
logger.info("Docling VLM Parser API ready (PaddleOCR-VL-1.5 + Gemini hybrid)")
|
| 75 |
logger.info("=" * 60)
|
| 76 |
yield
|
| 77 |
logger.info("Shutting down Docling VLM Parser API...")
|
|
|
|
| 83 |
|
| 84 |
app = FastAPI(
|
| 85 |
title="Docling VLM Parser API",
|
| 86 |
+
description="PaddleOCR-VL-1.5 + Gemini 3 Flash hybrid parser",
|
| 87 |
+
version="5.0.0",
|
| 88 |
lifespan=lifespan,
|
| 89 |
)
|
| 90 |
|
|
|
|
| 97 |
@app.get("/", response_model=HealthResponse)
|
| 98 |
async def health_check() -> HealthResponse:
|
| 99 |
"""Health check endpoint."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
return HealthResponse(
|
| 101 |
status="healthy",
|
| 102 |
+
version="5.0.0",
|
| 103 |
+
model="PaddleOCR-VL-1.5",
|
| 104 |
+
gemini_status="configured" if GEMINI_API_KEY else "not set",
|
|
|
|
|
|
|
| 105 |
images_scale=IMAGES_SCALE,
|
| 106 |
)
|
| 107 |
|
|
|
|
| 116 |
include_images: bool = Form(default=False, description="Include extracted images"),
|
| 117 |
_token: str = Depends(verify_token),
|
| 118 |
) -> ParseResponse:
|
| 119 |
+
"""Parse a document file using PaddleOCR-VL-1.5 + Gemini hybrid pipeline."""
|
| 120 |
request_id = str(uuid4())[:8]
|
| 121 |
start_time = time.time()
|
| 122 |
|
|
|
|
| 155 |
detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
|
| 156 |
)
|
| 157 |
|
| 158 |
+
logger.info(f"[{request_id}] Model: PaddleOCR-VL-1.5")
|
|
|
|
|
|
|
| 159 |
logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
|
| 160 |
|
| 161 |
temp_dir = tempfile.mkdtemp()
|
|
|
|
| 171 |
_convert_document,
|
| 172 |
input_path,
|
| 173 |
output_dir,
|
|
|
|
| 174 |
include_images,
|
| 175 |
request_id,
|
| 176 |
start_page,
|
|
|
|
| 197 |
images_zip=images_zip,
|
| 198 |
image_count=image_count,
|
| 199 |
pages_processed=pages_processed,
|
| 200 |
+
device_used="gpu",
|
| 201 |
+
vlm_model="PaddleOCR-VL-1.5",
|
| 202 |
)
|
| 203 |
|
| 204 |
except Exception as e:
|
|
|
|
| 220 |
request: URLParseRequest,
|
| 221 |
_token: str = Depends(verify_token),
|
| 222 |
) -> ParseResponse:
|
| 223 |
+
"""Parse a document from a URL using PaddleOCR-VL-1.5 + Gemini hybrid pipeline."""
|
| 224 |
request_id = str(uuid4())[:8]
|
| 225 |
start_time = time.time()
|
| 226 |
|
|
|
|
| 280 |
output_dir = Path(temp_dir) / "output"
|
| 281 |
output_dir.mkdir(exist_ok=True)
|
| 282 |
|
| 283 |
+
logger.info(f"[{request_id}] Model: PaddleOCR-VL-1.5")
|
|
|
|
|
|
|
| 284 |
logger.info(
|
| 285 |
f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
|
| 286 |
)
|
|
|
|
| 289 |
_convert_document,
|
| 290 |
input_path,
|
| 291 |
output_dir,
|
|
|
|
| 292 |
request.include_images,
|
| 293 |
request_id,
|
| 294 |
request.start_page,
|
|
|
|
| 315 |
images_zip=images_zip,
|
| 316 |
image_count=image_count,
|
| 317 |
pages_processed=pages_processed,
|
| 318 |
+
device_used="gpu",
|
| 319 |
+
vlm_model="PaddleOCR-VL-1.5",
|
| 320 |
)
|
| 321 |
|
| 322 |
except httpx.HTTPError as e:
|
auth.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Bearer token authentication and URL validation (SSRF protection)."""
|
| 2 |
+
|
| 3 |
+
import ipaddress
|
| 4 |
+
import secrets
|
| 5 |
+
import socket
|
| 6 |
+
from urllib.parse import urlparse
|
| 7 |
+
|
| 8 |
+
from fastapi import Depends, HTTPException
|
| 9 |
+
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
| 10 |
+
|
| 11 |
+
from config import API_TOKEN, BLOCKED_HOSTNAMES
|
| 12 |
+
|
# Shared FastAPI HTTPBearer scheme; used as a dependency so endpoints receive
# HTTPAuthorizationCredentials parsed from the Authorization header.
security = HTTPBearer()
|
| 14 |
+
|
| 15 |
+
|
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
    """Verify the bearer token supplied in the Authorization header.

    Raises:
        HTTPException: 500 when no token is configured on the server,
            401 when the presented token does not match.

    Returns:
        The validated token string.
    """
    if not API_TOKEN:
        raise HTTPException(status_code=500, detail="No API token configured on server")

    supplied = credentials.credentials
    # Constant-time comparison to avoid timing side channels.
    if not secrets.compare_digest(supplied, API_TOKEN):
        raise HTTPException(status_code=401, detail="Invalid API token")
    return supplied
|
| 31 |
+
|
| 32 |
+
|
def _validate_url(url: str) -> None:
    """Validate a user-supplied URL to prevent SSRF attacks.

    Checks, in order: the URL parses, the scheme is http/https, a hostname is
    present, the hostname is not on the static blocklist, the hostname does
    not contain known internal/metadata markers, and EVERY resolved IP
    address is public.

    Raises:
        HTTPException: 400 for any validation failure.
    """
    try:
        parsed = urlparse(url)
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid URL format: {str(e)}",
        )

    if parsed.scheme not in ("http", "https"):
        raise HTTPException(
            status_code=400,
            detail=f"Invalid URL scheme '{parsed.scheme}'. Only http and https are allowed.",
        )

    hostname = parsed.hostname
    if not hostname:
        raise HTTPException(
            status_code=400,
            detail="Invalid URL: missing hostname.",
        )

    hostname_lower = hostname.lower()
    if hostname_lower in BLOCKED_HOSTNAMES:
        raise HTTPException(
            status_code=400,
            detail="Access to internal/metadata services is not allowed.",
        )

    # Coarse substring blocklist: cheap defense-in-depth against e.g.
    # *.metadata.* hosts; may over-block, which is the safe direction here.
    blocked_patterns = ["metadata", "internal", "localhost", "127.0.0.1", "::1"]
    for pattern in blocked_patterns:
        if pattern in hostname_lower:
            raise HTTPException(
                status_code=400,
                detail="Access to internal/metadata services is not allowed.",
            )

    # Resolve ALL addresses (A and AAAA records). socket.gethostbyname()
    # returned a single IPv4 address, so a host with a private IPv6 record
    # (or a DNS answer mixing public and private addresses) could bypass
    # the private-range check below.
    try:
        addr_infos = socket.getaddrinfo(hostname, None, proto=socket.IPPROTO_TCP)
    except socket.gaierror:
        raise HTTPException(
            status_code=400,
            detail=f"Could not resolve hostname: {hostname}",
        )

    for info in addr_infos:
        ip_str = info[4][0]
        try:
            # Strip an IPv6 zone id (e.g. "fe80::1%eth0") before parsing.
            ip = ipaddress.ip_address(ip_str.split("%")[0])
        except ValueError as e:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid IP address resolved: {str(e)}",
            )

        if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
            raise HTTPException(
                status_code=400,
                detail="Access to private/internal IP addresses is not allowed.",
            )

    # NOTE(review): DNS can still change between this check and the actual
    # fetch (DNS rebinding / TOCTOU); pinning the resolved IP at request
    # time would close that remaining gap.
|
config.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Configuration, environment variables, and logging setup for the document parser.

All values are read once at import time from environment variables; other
modules import these constants instead of reading the environment directly.
"""

import logging
import os

# Configure logging once for the whole process; every module imports `logger`
# from here so formatting stays consistent.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# NOTE(review): logger name predates the move off Docling; kept unchanged in
# case external log filters match on it.
logger = logging.getLogger("docling-parser")

# Security: bearer token that clients must present. If unset, verify_token
# rejects every request with a 500 ("No API token configured on server").
API_TOKEN = os.getenv("API_TOKEN")

# Rendering / upload limits
IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))  # image render scale factor
MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))  # upload cap, megabytes
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # same cap, bytes
RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))  # DPI for page rasterization

# Gemini API configuration (table page enhancement)
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")  # empty string disables the Gemini pass
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))  # per-request timeout, seconds
GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "8"))  # parallel page requests

# Hostnames always rejected by the SSRF check (exact, case-insensitive match
# in auth._validate_url). Includes cloud metadata endpoints.
BLOCKED_HOSTNAMES = {
    "localhost",
    "metadata",
    "metadata.google.internal",
    "metadata.google",
    "169.254.169.254",  # cloud instance metadata service (IPv4)
    "fd00:ec2::254",  # AWS instance metadata service (IPv6)
}
|
gemini.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gemini API extraction function for table page enhancement."""
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
import httpx
|
| 9 |
+
|
| 10 |
+
from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger
|
| 11 |
+
|
# Strip code fence wrappers: Gemini sometimes wraps its output in ``` fences
# despite being told not to. The opening pattern also swallows an optional
# language tag (markdown/md/text); the closing fence is removed separately.
_CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
_CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
|
| 15 |
+
|
| 16 |
+
|
def _gemini_extract_page(
    page_image_bytes: bytes, request_id: str = "", page_no: int = 0
) -> Optional[str]:
    """Send a page image to the configured Gemini model for high-quality extraction.

    Used for table pages where PaddleOCR output is insufficient.

    Args:
        page_image_bytes: PNG bytes of the rendered page.
        request_id: Correlation id used in log messages only.
        page_no: Zero-based page index (logged 1-based).

    Returns:
        The full page markdown (text + tables), or None on failure.
    """
    if not GEMINI_API_KEY:
        logger.warning(f"[{request_id}] GEMINI_API_KEY not set → skipping Gemini extraction")
        return None

    b64_image = base64.b64encode(page_image_bytes).decode("utf-8")

    payload = {
        "contents": [
            {
                "parts": [
                    {
                        "inline_data": {
                            "mime_type": "image/png",
                            "data": b64_image,
                        }
                    },
                    {
                        "text": (
                            "Convert this document page to clean markdown format.\n\n"
                            "Rules:\n"
                            "- Extract ALL text content exactly as written — do not paraphrase or summarize\n"
                            "- Use ## for main section headings and ### for subsection headings\n"
                            "- Preserve lists, paragraphs, bullet points, and document structure\n"
                            "- For tables:\n"
                            "  * Read EVERY column header exactly as printed on the page\n"
                            "  * Include ALL columns even if the table is very wide\n"
                            "  * Format as markdown tables with | delimiters and --- separator rows\n"
                            "  * Each data row MUST have the same number of | cells as the header row\n"
                            "  * Preserve multi-line cell content — use <br> for line breaks within cells\n"
                            "  * For financial/lease tables, preserve ALL numbers, dates, and terms exactly\n"
                            "  * Add spaces between words — never concatenate (e.g., 'CAP Rate' not 'CAPRate')\n"
                            "- Do NOT wrap output in code fences (no ```)\n"
                            "- Do NOT add image descriptions, [Image:] tags, or describe visual elements\n"
                            "- Do NOT include page headers, footers, page numbers, or repeated branding\n"
                            "- Do NOT extract text from map images or photographs\n"
                            "- Output ONLY the extracted markdown content, nothing else"
                        ),
                    },
                ],
            }
        ],
        "generationConfig": {
            "temperature": 0.1,  # near-deterministic transcription
            "maxOutputTokens": 32768,  # wide tables can produce long output
        },
    }

    # Send the API key via the x-goog-api-key header instead of a URL query
    # parameter so it cannot leak into access logs, proxies, or tracebacks.
    url = (
        f"https://generativelanguage.googleapis.com/v1beta/models/"
        f"{GEMINI_MODEL}:generateContent"
    )
    headers = {"x-goog-api-key": GEMINI_API_KEY}

    # Up to two attempts; the retry gets a 1.5x timeout.
    for attempt in range(1, 3):
        try:
            timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
            response = httpx.post(url, json=payload, headers=headers, timeout=timeout)

            if response.status_code == 429:
                # Rate limited — wait and retry
                logger.warning(
                    f"[{request_id}] Gemini rate limited on page {page_no + 1}, "
                    f"attempt {attempt}. Waiting 5s..."
                )
                time.sleep(5)
                continue

            if response.status_code != 200:
                try:
                    err = response.json()
                    msg = str(err.get("error", {}).get("message", str(err)[:300]))
                except Exception:
                    msg = response.text[:300]
                logger.error(
                    f"[{request_id}] Gemini error ({response.status_code}) "
                    f"page {page_no + 1}: {msg}"
                )
                if attempt == 1:
                    continue
                return None

            result = response.json()
            candidates = result.get("candidates", [])
            if not candidates:
                logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
                return None

            parts = candidates[0].get("content", {}).get("parts", [])
            if not parts:
                return None

            # Join ALL text parts — the API may split long output across
            # several parts; reading only parts[0] truncated such pages.
            content = "".join(p.get("text", "") for p in parts)

            # Clean up: strip code fences if Gemini wraps output
            content = _CODE_FENCE_PATTERN.sub("", content)
            content = _CODE_FENCE_END.sub("", content)

            return content.strip() if content.strip() else None

        except (httpx.TimeoutException, httpx.ConnectError) as e:
            if attempt == 1:
                logger.warning(
                    f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}. Retrying..."
                )
                continue
            logger.error(f"[{request_id}] Gemini failed after 2 attempts on page {page_no + 1}: {e}")
            return None

    return None
|
models.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic models for API request/response schemas."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional, Union
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
|
| 7 |
+
|
class ParseResponse(BaseModel):
    """Response model for document parsing endpoints."""

    success: bool  # True when parsing completed without a fatal error
    markdown: Optional[str] = None  # merged markdown for the whole document
    json_content: Optional[Union[dict, list]] = None  # structured output, when produced
    images_zip: Optional[str] = None  # base64-encoded zip of extracted images
    image_count: int = 0  # number of files inside images_zip
    error: Optional[str] = None  # failure description when success is False
    pages_processed: int = 0  # pages actually parsed
    device_used: Optional[str] = None  # compute device label (e.g. "gpu")
    vlm_model: Optional[str] = None  # model identifier (e.g. "PaddleOCR-VL-1.5")
|
| 20 |
+
|
| 21 |
+
|
class HealthResponse(BaseModel):
    """Health check response."""

    status: str  # overall service status
    version: str  # application version string
    model: str  # active parsing model identifier
    gemini_status: str = "unknown"  # state of the Gemini enhancement backend
    images_scale: float = 2.0  # configured image render scale (config.IMAGES_SCALE)
|
| 30 |
+
|
| 31 |
+
|
class URLParseRequest(BaseModel):
    """Request model for URL-based parsing."""

    url: str  # http(s) URL of the document to fetch (SSRF-validated server-side)
    output_format: str = "markdown"  # requested output format
    images_scale: Optional[float] = None  # per-request override of IMAGES_SCALE
    start_page: int = 0  # first page to process (0-based)
    # NOTE(review): whether end_page is inclusive or exclusive is not visible
    # here — confirm against the rendering pipeline before relying on it.
    end_page: Optional[int] = None  # last page bound; None = through the end
    include_images: bool = False  # include base64 images zip in the response
|
pipeline.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PaddleOCR-VL pipeline, hybrid conversion logic, and file helpers."""
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
import re
|
| 6 |
+
import shutil
|
| 7 |
+
import time
|
| 8 |
+
import zipfile
|
| 9 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import BinaryIO, Optional
|
| 12 |
+
|
| 13 |
+
from paddleocr import PaddleOCRVL
|
| 14 |
+
|
| 15 |
+
from config import GEMINI_API_KEY, GEMINI_CONCURRENCY, GEMINI_MODEL, logger
|
| 16 |
+
from gemini import _gemini_extract_page
|
| 17 |
+
from postprocess import _post_process_merged_markdown
|
| 18 |
+
from rendering import _pdf_to_page_images
|
| 19 |
+
|
# Global PaddleOCR-VL pipeline instance (lazily constructed; model weights
# load on first use, so the first request pays the warm-up cost).
_pipeline = None


def _get_pipeline():
    """Get or create the global PaddleOCR-VL-1.5 pipeline instance.

    Returns the shared PaddleOCRVL object, constructing it on first call.
    """
    # NOTE(review): not guarded by a lock — two concurrent first calls could
    # each construct a pipeline; confirm requests are serialized upstream.
    global _pipeline
    if _pipeline is None:
        _pipeline = PaddleOCRVL()
    return _pipeline
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _page_has_tables(result) -> bool:
|
| 33 |
+
"""Check if PaddleOCR result contains table elements from layout analysis.
|
| 34 |
+
|
| 35 |
+
Uses layout detection labels and falls back to markdown pattern matching.
|
| 36 |
+
"""
|
| 37 |
+
try:
|
| 38 |
+
# Try accessing layout detection results
|
| 39 |
+
if hasattr(result, 'json') and result.json:
|
| 40 |
+
json_data = result.json
|
| 41 |
+
if isinstance(json_data, dict):
|
| 42 |
+
for block in json_data.get('layout_det', []):
|
| 43 |
+
if block.get('label', '').lower() == 'table':
|
| 44 |
+
return True
|
| 45 |
+
# Fallback: check markdown content for table patterns
|
| 46 |
+
md = result.markdown
|
| 47 |
+
if isinstance(md, dict):
|
| 48 |
+
md_text = md.get('markdown_texts', '')
|
| 49 |
+
else:
|
| 50 |
+
md_text = str(md)
|
| 51 |
+
return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
|
| 52 |
+
except Exception:
|
| 53 |
+
return False
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
|
| 57 |
+
"""Sync helper to save uploaded file to disk."""
|
| 58 |
+
with open(input_path, "wb") as f:
|
| 59 |
+
shutil.copyfileobj(file_obj, f)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _save_downloaded_content(input_path: Path, content: bytes) -> None:
|
| 63 |
+
"""Sync helper to save downloaded content to disk."""
|
| 64 |
+
with open(input_path, "wb") as f:
|
| 65 |
+
f.write(content)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
| 69 |
+
"""Create a zip file from extracted images."""
|
| 70 |
+
image_dir = output_dir / "images"
|
| 71 |
+
if not image_dir.exists():
|
| 72 |
+
return None, 0
|
| 73 |
+
|
| 74 |
+
image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
|
| 75 |
+
zip_buffer = io.BytesIO()
|
| 76 |
+
image_count = 0
|
| 77 |
+
|
| 78 |
+
with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
|
| 79 |
+
for img_path in image_dir.glob("*"):
|
| 80 |
+
if img_path.is_file() and img_path.suffix.lower() in image_extensions:
|
| 81 |
+
try:
|
| 82 |
+
zf.write(img_path, f"images/{img_path.name}")
|
| 83 |
+
image_count += 1
|
| 84 |
+
except Exception as e:
|
| 85 |
+
logger.warning(f"Failed to add image {img_path} to zip: {e}")
|
| 86 |
+
|
| 87 |
+
if image_count == 0:
|
| 88 |
+
return None, 0
|
| 89 |
+
|
| 90 |
+
return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
|
| 91 |
+
|
| 92 |
+
|
def _convert_document(
    input_path: Path,
    output_dir: Path,
    include_images: bool,
    request_id: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
) -> tuple:
    """
    PaddleOCR-VL-1.5 + Gemini hybrid conversion.

    Pass 1 (GPU): PaddleOCR-VL-1.5 on full PDF (native document parsing)
    Detect: Find table pages from layout analysis
    Pass 2 (API): Gemini ONLY on table pages (high-quality tables)
    Merge: Gemini for table pages, PaddleOCR for everything else

    Returns: (markdown_content, json_content, pages_processed, image_count)
    """
    # NOTE(review): output_dir and include_images are accepted but unused in
    # this body; json_content is always None and image_count always 0 in the
    # returned tuple — confirm whether callers rely on them.
    # NOTE(review): start_page/end_page are forwarded only to
    # _pdf_to_page_images below — pipeline.predict() always parses the WHOLE
    # file, so table-page indices are full-document indices. Confirm
    # _pdf_to_page_images keys its results by the same absolute page numbers,
    # otherwise Gemini enhancement targets the wrong pages when start_page > 0.
    overall_start = time.time()

    # ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
    pipeline = _get_pipeline()
    paddle_start = time.time()
    output = pipeline.predict(str(input_path))
    paddle_time = time.time() - paddle_start

    # Collect per-page markdown and detect table pages
    page_markdowns = []
    table_pages = set()
    for i, res in enumerate(output):
        md_data = res.markdown
        page_markdowns.append(md_data)
        # Check if this page has tables from layout analysis
        if _page_has_tables(res):
            table_pages.add(i)

    logger.info(
        f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
        f"in {paddle_time:.2f}s → {len(table_pages)} table pages detected"
    )

    # ---- PASS 2: Gemini on table pages only ----
    gemini_page_texts: dict[int, str] = {}
    gemini_time = 0.0

    if table_pages and GEMINI_API_KEY:
        logger.info(
            f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(table_pages)} table pages "
            f"({GEMINI_CONCURRENCY} concurrent)"
        )

        # Render table page images for Gemini
        page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
        page_image_map = {pno: pbytes for pno, pbytes in page_images}

        gemini_start = time.time()
        # Fan the table pages out to Gemini in parallel; results are collected
        # as they complete, keyed by page index.
        with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
            futures = {
                executor.submit(
                    _gemini_extract_page, page_image_map[pno], request_id, pno
                ): pno
                for pno in table_pages
                if pno in page_image_map
            }
            for future in as_completed(futures):
                pno = futures[future]
                try:
                    text = future.result()
                    if text:
                        gemini_page_texts[pno] = text
                        logger.info(
                            f"[{request_id}] Gemini processed table page {pno + 1} "
                            f"({len(text)} chars)"
                        )
                except Exception as e:
                    # A failed page falls back to PaddleOCR output in the merge.
                    logger.warning(f"[{request_id}] Gemini failed page {pno + 1}: {e}")

        gemini_time = time.time() - gemini_start
        logger.info(
            f"[{request_id}] Pass 2 completed in {gemini_time:.2f}s → "
            f"{len(gemini_page_texts)}/{len(table_pages)} table pages enhanced via Gemini"
        )
    elif table_pages and not GEMINI_API_KEY:
        logger.warning(
            f"[{request_id}] {len(table_pages)} table pages detected but GEMINI_API_KEY not set → "
            f"using PaddleOCR output for tables"
        )

    # ---- MERGE: Gemini for table pages, PaddleOCR for others ----
    md_parts: list[str] = []
    for i, md_data in enumerate(page_markdowns):
        if i in gemini_page_texts:
            md_parts.append(gemini_page_texts[i])
        else:
            # Extract markdown text from PaddleOCR result (dict or plain object)
            if isinstance(md_data, dict):
                md_text = md_data.get("markdown_texts", "")
            else:
                md_text = str(md_data)
            md_parts.append(md_text)

    markdown_content = "\n\n".join(md_parts)

    # Post-process: fix cross-page artifacts, deduplicate headers, clean tables.
    # Single-page documents skip this — the cleanup targets page-boundary noise.
    pages_processed = len(page_markdowns)
    if pages_processed > 1:
        markdown_content = _post_process_merged_markdown(markdown_content)

    total_time = time.time() - overall_start

    logger.info(
        f"[{request_id}] v5.0.0 conversion complete: {pages_processed} pages → "
        f"PaddleOCR {paddle_time:.1f}s + Gemini {gemini_time:.1f}s = {total_time:.2f}s total"
    )
    if pages_processed > 0:
        logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")

    return markdown_content, None, pages_processed, 0
|
postprocess.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Post-processing functions and regex patterns for markdown cleanup."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
# ---------------------------------------------------------------------------
# Post-processing regex patterns
# ---------------------------------------------------------------------------

# Day-of-week date lines (e.g., "Thursday, October 31, 2024")
_STANDALONE_DATE = re.compile(
    r"^\s*(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
    r"(?:January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s+\d{1,2},\s+\d{4}\s*$",
    re.MULTILINE,
)
# Standalone time (e.g., "11:30 AM")
_STANDALONE_TIME = re.compile(r"^\s*\d{1,2}:\d{2}\s*(?:AM|PM)\s*$", re.MULTILINE)
# Page footer patterns: "N | address" or "N address N" (e.g., "2 | 8575 W Golf Rd, Niles, IL 60714 | 3")
_PAGE_FOOTER = re.compile(
    r"^\s*\d{1,3}\s*\|?\s*\d{2,5}\s+\w.*(?:Rd|St|Ave|Blvd|Dr|Ln|Way|Ct)\b.*\d{5}.*$",
    re.MULTILINE,
)
# Standalone page number lines (e.g., "12" alone on a line).
# NOTE(review): matches ANY line that is only 1-3 digits, so a legitimate
# lone numeric value in body text would also be removed — confirm acceptable.
_STANDALONE_PAGE_NUM = re.compile(r"^\s*\d{1,3}\s*$", re.MULTILINE)
# Branding footer lines: 2+ pipe-separated segments ending in a 1-3 digit
# page number (e.g., "Acme Realty | Market Report | 7"). The first segment
# must be at least 6 characters; no overall line-length minimum is enforced
# by the regex itself.
_BRANDING_FOOTER = re.compile(
    r"^\s*[A-Za-z][^|]{5,}\|[^|]+\|?\s*\d{1,3}\s*$",
    re.MULTILINE,
)
# Short repeated location lines that appear as page artifacts (e.g., "Niles, IL" alone)
_SHORT_LOCATION_LINE = re.compile(
    r"^\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s*$", re.MULTILINE
)
# Numbered section pattern: "N. TITLE" where N is 1-99 and TITLE is mostly uppercase
_NUMBERED_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][A-Z\s\-/&,]+(?:\.\s*)?)")

# Table row with ALL empty cells (e.g., "| | | | |")
_EMPTY_TABLE_ROW = re.compile(r"^\|(?:\s*\|)+\s*$", re.MULTILINE)
# Trailing empty cells in a table row (e.g., "| data | data | | | |")
_TRAILING_EMPTY_CELLS = re.compile(r"(?:\s*\|\s*){2,}\s*$")
# Table separator row (e.g., "|---|---|---|")
_TABLE_SEP_ROW = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Post-Processing Functions
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
|
| 51 |
+
|
def _post_process_merged_markdown(content: str) -> str:
    """Clean up concatenated multi-page markdown.

    Runs a fixed sequence of cleanup passes over the merged document:
    duplicate headings, duplicate short metadata blocks, page-boundary
    artifacts (dates/times/page numbers/footers), numbered-heading
    normalization, table artifacts, and cross-page table merging.
    Finally collapses runs of 4+ newlines down to 3 and trims the ends.
    """
    cleanup_passes = (
        _deduplicate_headings,
        _deduplicate_short_blocks,
        _remove_page_boundary_artifacts,
        _normalize_numbered_headings,
        _clean_table_artifacts,
        _merge_split_tables,
    )
    for cleanup in cleanup_passes:
        content = cleanup(content)
    content = re.sub(r"\n{4,}", "\n\n\n", content)
    return content.strip()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _deduplicate_headings(content: str) -> str:
|
| 76 |
+
"""Remove duplicate heading lines, keeping only the first occurrence.
|
| 77 |
+
|
| 78 |
+
When processing each page, page headers/document titles may be re-extracted.
|
| 79 |
+
This removes exact duplicate headings while preserving table rows and body text.
|
| 80 |
+
"""
|
| 81 |
+
lines = content.split("\n")
|
| 82 |
+
seen_headings: set[str] = set()
|
| 83 |
+
result: list[str] = []
|
| 84 |
+
|
| 85 |
+
for line in lines:
|
| 86 |
+
stripped = line.strip()
|
| 87 |
+
if stripped.startswith("#"):
|
| 88 |
+
# Normalize heading for comparison (lowercase, strip trailing #)
|
| 89 |
+
key = stripped.lstrip("#").strip().lower()
|
| 90 |
+
if key and key in seen_headings:
|
| 91 |
+
continue # Skip duplicate heading
|
| 92 |
+
if key:
|
| 93 |
+
seen_headings.add(key)
|
| 94 |
+
result.append(line)
|
| 95 |
+
|
| 96 |
+
return "\n".join(result)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _deduplicate_short_blocks(content: str) -> str:
|
| 100 |
+
"""Remove duplicate short text blocks that repeat across pages.
|
| 101 |
+
|
| 102 |
+
When processing each page, document subtitles, metadata lines, and other
|
| 103 |
+
short repeating text may be re-extracted. This removes exact duplicates
|
| 104 |
+
of short non-table blocks (< 120 chars).
|
| 105 |
+
"""
|
| 106 |
+
blocks = content.split("\n\n")
|
| 107 |
+
seen: set[str] = set()
|
| 108 |
+
result: list[str] = []
|
| 109 |
+
|
| 110 |
+
for block in blocks:
|
| 111 |
+
stripped = block.strip()
|
| 112 |
+
if not stripped:
|
| 113 |
+
result.append(block)
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
# Only deduplicate short, non-table, non-heading blocks
|
| 117 |
+
is_table = stripped.startswith("|") and "|" in stripped[1:]
|
| 118 |
+
is_heading = stripped.startswith("#")
|
| 119 |
+
if is_table or is_heading or len(stripped) > 120:
|
| 120 |
+
result.append(block)
|
| 121 |
+
continue
|
| 122 |
+
|
| 123 |
+
key = stripped.lower()
|
| 124 |
+
if key in seen:
|
| 125 |
+
continue # Skip duplicate short block
|
| 126 |
+
|
| 127 |
+
seen.add(key)
|
| 128 |
+
result.append(block)
|
| 129 |
+
|
| 130 |
+
return "\n\n".join(result)
|
| 131 |
+
|
| 132 |
+
|
def _remove_page_boundary_artifacts(content: str) -> str:
    """Strip page header/footer noise: standalone dates, times, page numbers, footers."""
    # Unconditional removals, applied in the original order.
    for pattern in (_STANDALONE_DATE, _STANDALONE_TIME, _PAGE_FOOTER, _STANDALONE_PAGE_NUM):
        content = pattern.sub("", content)
    # Conservative removals: a matching line is only dropped when it repeats
    # 3+ times, which marks it as an artifact rather than real content.
    content = _remove_repeated_lines(content, _BRANDING_FOOTER, min_repeats=3)
    content = _remove_repeated_lines(content, _SHORT_LOCATION_LINE, min_repeats=3)
    return content
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _remove_repeated_lines(content: str, pattern: re.Pattern, min_repeats: int = 3) -> str:
|
| 146 |
+
"""Remove lines matching a pattern that appear min_repeats+ times (clearly artifacts)."""
|
| 147 |
+
counts: dict[str, int] = {}
|
| 148 |
+
for m in pattern.finditer(content):
|
| 149 |
+
key = m.group(0).strip().lower()
|
| 150 |
+
counts[key] = counts.get(key, 0) + 1
|
| 151 |
+
|
| 152 |
+
repeated = {k for k, v in counts.items() if v >= min_repeats}
|
| 153 |
+
if not repeated:
|
| 154 |
+
return content
|
| 155 |
+
|
| 156 |
+
lines = content.split("\n")
|
| 157 |
+
result: list[str] = []
|
| 158 |
+
for line in lines:
|
| 159 |
+
if line.strip().lower() in repeated:
|
| 160 |
+
continue
|
| 161 |
+
result.append(line)
|
| 162 |
+
return "\n".join(result)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _normalize_numbered_headings(content: str) -> str:
    """Normalize numbered section headings to a consistent ## level.

    Documents sometimes mix styles for numbered sections like "3. OCCUPANCY":
    some arrive as "## 3. OCCUPANCY" markdown headings while others are plain
    text. When both styles coexist, every plain numbered section is promoted
    to a ## heading, with any trailing body text kept on the following line.
    Already-consistent documents pass through unchanged.
    """
    lines = content.split("\n")

    # Pass 1: classify each numbered section number as headed or plain.
    headed: set[int] = set()
    plain: set[int] = set()
    for line in lines:
        stripped = line.strip()
        # "## 3. OCCUPANCY" style — already a heading.
        m = re.match(r"^#{1,3}\s+(\d{1,2})\.\s+[A-Z]", stripped)
        if m:
            headed.add(int(m.group(1)))
        else:
            # "3. OCCUPANCY. Tenant shall..." style — plain text.
            m = _NUMBERED_SECTION.match(stripped)
            if m:
                plain.add(int(m.group(1)))

    # Only normalize when the document mixes both styles.
    if not (headed and plain):
        return content

    # Pass 2: promote plain numbered sections to ## headings.
    out: list[str] = []
    for line in lines:
        stripped = line.strip()
        m = _NUMBERED_SECTION.match(stripped)
        if m and int(m.group(1)) in plain:
            # Split "N. TITLE. body..." into a heading plus its body text.
            cut = m.end()
            heading = stripped[:cut].rstrip(".")
            remainder = stripped[cut:].strip()
            out.append(f"## {heading}")
            if remainder:
                out.append(remainder)
        else:
            out.append(line)

    return "\n".join(out)
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def _clean_table_artifacts(content: str) -> str:
    """Clean table formatting artifacts.

    - Removes table rows where ALL cells are empty (e.g. "| | | |")
    - Strips trailing empty cells from table data rows
    - Leaves separator rows (e.g. "|---|---|") untouched, since their dash
      cells define the table structure

    (Docstring fix: a previous version also claimed orphaned separator rows
    were removed, but no such logic exists here.)
    """
    result: list[str] = []

    # Fix: the original iterated with enumerate() but never used the index.
    for line in content.split("\n"):
        stripped = line.strip()

        # Skip completely empty table rows (| | | | |) — pure layout noise.
        if _EMPTY_TABLE_ROW.match(stripped):
            continue

        # Data rows: trim trailing empty cells. Separator rows fall through
        # untouched to the plain append below.
        if stripped.startswith("|") and "|" in stripped[1:]:
            if not _TABLE_SEP_ROW.match(stripped):
                result.append(_TRAILING_EMPTY_CELLS.sub(" |", stripped))
                continue

        result.append(line)

    return "\n".join(result)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def _is_table_line(line: str) -> bool:
|
| 251 |
+
"""Check if a line is a markdown table row or separator."""
|
| 252 |
+
s = line.strip()
|
| 253 |
+
return bool(s.startswith("|") and s.endswith("|") and s.count("|") >= 3)
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def _count_columns(line: str) -> int:
|
| 257 |
+
"""Count the number of columns in a table row."""
|
| 258 |
+
s = line.strip()
|
| 259 |
+
if not s.startswith("|"):
|
| 260 |
+
return 0
|
| 261 |
+
# Split by | and count non-boundary segments
|
| 262 |
+
parts = s.split("|")
|
| 263 |
+
# First and last are empty strings from leading/trailing |
|
| 264 |
+
return max(0, len(parts) - 2)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def _merge_split_tables(content: str) -> str:
    """Merge table continuations that were split across pages.

    Detects when non-table content (whitespace, duplicate metadata) separates
    what should be a single table, and merges the data rows.

    Heuristics:
    - the continuation must follow after only blank lines;
    - both fragments must have >= 2 columns, with column counts within 30%
      of each other;
    - if the continuation re-states a header row followed by a separator
      (typical when a table spills onto a new page), that duplicate
      header + separator is dropped and only the data rows are kept.
    """
    lines = content.split("\n")
    result: list[str] = []
    i = 0

    while i < len(lines):
        result.append(lines[i])
        i += 1

        # Check if we just appended a table row and the next chunk looks like
        # a table continuation (another table with similar column count)
        if not _is_table_line(result[-1]):
            continue

        last_table_cols = _count_columns(result[-1])
        if last_table_cols < 2:
            continue

        # Look ahead past empty lines to find the next non-empty line.
        # NOTE(review): gap_lines is collected but never re-emitted — when a
        # merge happens the blank gap lines are silently dropped; confirm
        # that is intentional.
        j = i
        gap_lines: list[str] = []
        while j < len(lines):
            s = lines[j].strip()
            if s == "":
                gap_lines.append(lines[j])
                j += 1
                continue
            break

        # Reached end of document while skipping blanks — nothing to merge.
        if j >= len(lines):
            continue

        # Check if the next non-empty line starts a table
        if not _is_table_line(lines[j]):
            continue

        next_table_cols = _count_columns(lines[j])

        # Only merge when column counts are close (ratio >= 0.7, i.e. within
        # 30%); otherwise treat it as an unrelated table.
        if last_table_cols < 2 or next_table_cols < 2:
            continue
        ratio = min(last_table_cols, next_table_cols) / max(last_table_cols, next_table_cols)
        if ratio < 0.7:
            continue

        # Check if the new table starts with header + separator (indicating
        # re-extracted headers on the next page)
        has_new_header = False
        if _is_table_line(lines[j]):
            # Look for a separator row in the next 1-2 lines
            for k in range(j + 1, min(j + 3, len(lines))):
                if _TABLE_SEP_ROW.match(lines[k].strip()):
                    has_new_header = True
                    break

        if has_new_header:
            # Skip the gap, skip the duplicate header + separator, keep data
            # rows. Advance i to just past the first separator row at/after j.
            skip_to = j
            while skip_to < len(lines):
                if _TABLE_SEP_ROW.match(lines[skip_to].strip()):
                    skip_to += 1  # Skip past separator
                    break
                skip_to += 1
            i = skip_to
        else:
            # No header — just skip the gap and append the continuation rows
            i = j

    return "\n".join(result)
|
rendering.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF-to-page-images rendering and image preprocessing (CLAHE)."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
import time
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
import cv2
|
| 11 |
+
from pdf2image import convert_from_path
|
| 12 |
+
|
| 13 |
+
from config import RENDER_DPI, logger
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _preprocess_image_for_ocr(image_path: str) -> str:
    """Enhance image contrast in place for better OCR accuracy.

    Applies CLAHE (adaptive histogram equalization) to the lightness channel
    only, leaving color information untouched. Denoising was removed in
    v3.2.1 — it added ~10s/page with minimal benefit for VLM-based OCR which
    handles noise well.

    Returns the same path; unreadable images are left unmodified.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        # cv2 could not decode the file — hand the path back untouched.
        return image_path

    # Work in LAB space so equalization affects lightness, not hue.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = cv2.merge([equalizer.apply(lightness), chan_a, chan_b])

    # Overwrite the original file with the contrast-enhanced version.
    cv2.imwrite(image_path, cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR))
    return image_path
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _render_single_page(
    input_path: Path, page_idx: int, dpi: int
) -> tuple[int, Optional[bytes]]:
    """Render a single PDF page to PNG bytes with CLAHE preprocessing.

    Args:
        input_path: Path to the source PDF.
        page_idx: Zero-based page index (pdf2image pages are 1-based).
        dpi: Render resolution passed to pdf2image/poppler.

    Returns (page_idx, png_bytes) or (page_idx, None) on failure.
    """
    try:
        images = convert_from_path(
            str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
        )
        if not images:
            return page_idx, None

        img = images[0]
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            # Fix: save() now runs inside the try/finally so the temp file is
            # always removed — previously a failure in img.save() leaked it.
            img.save(tmp_path, format="PNG")
            _preprocess_image_for_ocr(tmp_path)
            with open(tmp_path, "rb") as f:
                return page_idx, f.read()
        finally:
            os.unlink(tmp_path)
    except Exception as e:
        logger.warning(f"Failed to render page {page_idx + 1}: {e}")
        return page_idx, None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _pdf_to_page_images(
    input_path: Path,
    request_id: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
) -> list[tuple[int, bytes]]:
    """Convert PDF pages to PNG image bytes using parallel rendering.

    Args:
        input_path: Path to the source PDF.
        request_id: Correlation id used in log lines.
        start_page: First page to render (0-based, inclusive).
        end_page: Last page to render (0-based, inclusive); None = to the end.

    Uses ThreadPoolExecutor for concurrent page rendering. Pages that fail to
    render are silently skipped (logged by _render_single_page).
    Returns a list of (page_no, png_bytes) tuples, sorted by page number.
    """
    try:
        from pdf2image.pdf2image import pdfinfo_from_path

        info = pdfinfo_from_path(str(input_path))
        total_pages = info["Pages"]
        # end_page is inclusive, so +1 converts it to an exclusive bound.
        last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
    except Exception as e:
        logger.warning(f"[{request_id}] Could not get PDF info: {e}")
        return []

    page_indices = list(range(start_page, last_page))

    start_time = time.time()
    page_images: list[tuple[int, bytes]] = []

    # Render pages in parallel (4 threads — I/O bound, not CPU bound for poppler)
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Fix: a plain list suffices — the original mapped future -> idx but
        # never read the index; each future already returns its own page_idx.
        futures = [
            executor.submit(_render_single_page, input_path, idx, RENDER_DPI)
            for idx in page_indices
        ]
        for future in as_completed(futures):
            page_idx, png_bytes = future.result()
            if png_bytes is not None:
                page_images.append((page_idx, png_bytes))

    page_images.sort(key=lambda x: x[0])
    render_time = time.time() - start_time
    logger.info(
        f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
        f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
    )
    return page_images
|
requirements.txt
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
|
| 6 |
|
| 7 |
# Web framework
|
| 8 |
fastapi>=0.115.0
|
|
@@ -11,23 +11,17 @@ uvicorn[standard]>=0.32.0
|
|
| 11 |
# File upload handling
|
| 12 |
python-multipart>=0.0.9
|
| 13 |
|
| 14 |
-
# HTTP client for
|
| 15 |
httpx>=0.27.0
|
| 16 |
|
| 17 |
-
#
|
| 18 |
pydantic>=2.0.0
|
| 19 |
|
| 20 |
-
# Image preprocessing
|
| 21 |
opencv-python-headless>=4.10.0
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
onnxruntime>=1.19.0
|
| 25 |
-
|
| 26 |
-
# PDF to image conversion for VLM OCR pass
|
| 27 |
pdf2image>=1.17.0
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
pypdf>=4.0.0
|
| 31 |
-
|
| 32 |
-
# HuggingFace Hub for model downloads
|
| 33 |
huggingface-hub>=0.25.0
|
|
|
|
| 1 |
+
# PaddleOCR-VL-1.5 + Gemini Hybrid Parser API Dependencies
|
| 2 |
+
# PaddlePaddle GPU is installed separately in the Dockerfile (requires special index URL)
|
| 3 |
|
| 4 |
+
# PaddleOCR with document parsing support (PaddleOCR-VL-1.5)
|
| 5 |
+
paddleocr[doc-parser]
|
| 6 |
|
| 7 |
# Web framework
|
| 8 |
fastapi>=0.115.0
|
|
|
|
| 11 |
# File upload handling
|
| 12 |
python-multipart>=0.0.9
|
| 13 |
|
| 14 |
+
# HTTP client for Gemini API calls and URL fetching
|
| 15 |
httpx>=0.27.0
|
| 16 |
|
| 17 |
+
# Request/response models
|
| 18 |
pydantic>=2.0.0
|
| 19 |
|
| 20 |
+
# Image preprocessing (CLAHE contrast enhancement)
|
| 21 |
opencv-python-headless>=4.10.0
|
| 22 |
|
| 23 |
+
# PDF page rendering for Gemini page images
|
|
|
|
|
|
|
|
|
|
| 24 |
pdf2image>=1.17.0
|
| 25 |
|
| 26 |
+
# Model utilities
|
|
|
|
|
|
|
|
|
|
| 27 |
huggingface-hub>=0.25.0
|
start.sh
CHANGED
|
@@ -1,84 +1,7 @@
|
|
| 1 |
-
#!/
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
#
|
| 5 |
-
exec
|
| 6 |
-
|
| 7 |
-
echo "[startup] ====== Docling VLM Parser starting ======"
|
| 8 |
-
echo "[startup] Date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
|
| 9 |
-
echo "[startup] GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>&1 || echo 'NO GPU')"
|
| 10 |
-
echo "[startup] HF cache: $(du -sh /home/user/.cache/huggingface 2>/dev/null || echo 'empty')"
|
| 11 |
-
|
| 12 |
-
# ββ Configuration ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 13 |
-
VLLM_MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
|
| 14 |
-
VLLM_HOST="127.0.0.1"
|
| 15 |
-
VLLM_PORT="8000"
|
| 16 |
-
HEALTH_URL="http://${VLLM_HOST}:${VLLM_PORT}/health"
|
| 17 |
-
POLL_INTERVAL=5
|
| 18 |
-
MAX_WAIT=600
|
| 19 |
-
|
| 20 |
-
# ββ Start vLLM server in background βββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
-
echo "[startup] Starting vLLM server with model: ${VLLM_MODEL}"
|
| 22 |
-
|
| 23 |
-
python3 -m vllm.entrypoints.openai.api_server \
|
| 24 |
-
--model "${VLLM_MODEL}" \
|
| 25 |
-
--host "${VLLM_HOST}" \
|
| 26 |
-
--port "${VLLM_PORT}" \
|
| 27 |
-
--max-num-seqs 16 \
|
| 28 |
-
--max-model-len 65536 \
|
| 29 |
-
--gpu-memory-utilization 0.85 \
|
| 30 |
-
--dtype auto \
|
| 31 |
-
--trust-remote-code \
|
| 32 |
-
--limit-mm-per-prompt '{"image": 1}' \
|
| 33 |
-
2>&1 &
|
| 34 |
-
|
| 35 |
-
VLLM_PID=$!
|
| 36 |
-
echo "[startup] vLLM server started with PID ${VLLM_PID}"
|
| 37 |
-
|
| 38 |
-
# ββ Poll vLLM health endpoint until ready ββββββββββββββββββββββββββββββββββββ
|
| 39 |
-
echo "[startup] Waiting for vLLM to become healthy (polling every ${POLL_INTERVAL}s, timeout ${MAX_WAIT}s)..."
|
| 40 |
-
|
| 41 |
-
elapsed=0
|
| 42 |
-
while [ "${elapsed}" -lt "${MAX_WAIT}" ]; do
|
| 43 |
-
# Check if vLLM process is still alive
|
| 44 |
-
if ! kill -0 "${VLLM_PID}" 2>/dev/null; then
|
| 45 |
-
echo "[startup] ERROR: vLLM process (PID ${VLLM_PID}) died during startup"
|
| 46 |
-
exit 1
|
| 47 |
-
fi
|
| 48 |
-
|
| 49 |
-
if curl -sf "${HEALTH_URL}" > /dev/null 2>&1; then
|
| 50 |
-
echo "[startup] vLLM is healthy after ${elapsed}s"
|
| 51 |
-
break
|
| 52 |
-
fi
|
| 53 |
-
|
| 54 |
-
# Heartbeat every 30s
|
| 55 |
-
if [ $((elapsed % 30)) -eq 0 ] && [ "${elapsed}" -gt 0 ]; then
|
| 56 |
-
echo "[startup] Still waiting for vLLM... ${elapsed}s elapsed"
|
| 57 |
-
fi
|
| 58 |
-
|
| 59 |
-
sleep "${POLL_INTERVAL}"
|
| 60 |
-
elapsed=$((elapsed + POLL_INTERVAL))
|
| 61 |
-
done
|
| 62 |
-
|
| 63 |
-
if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then
|
| 64 |
-
echo "[startup] ERROR: vLLM did not become healthy within ${MAX_WAIT}s"
|
| 65 |
-
echo "[startup] Killing vLLM process (PID ${VLLM_PID})"
|
| 66 |
-
kill "${VLLM_PID}" 2>/dev/null || true
|
| 67 |
-
exit 1
|
| 68 |
-
fi
|
| 69 |
-
|
| 70 |
-
# ββ Start FastAPI with vLLM cleanup on exit ββββββββββββββββββββββββββββββββββ
|
| 71 |
-
_cleanup() {
|
| 72 |
-
echo "[startup] Shutting down vLLM (PID ${VLLM_PID})"
|
| 73 |
-
kill "${VLLM_PID}" 2>/dev/null
|
| 74 |
-
wait "${VLLM_PID}" 2>/dev/null
|
| 75 |
-
}
|
| 76 |
-
trap _cleanup EXIT TERM INT
|
| 77 |
-
|
| 78 |
-
echo "[startup] Starting FastAPI server on 0.0.0.0:7860"
|
| 79 |
-
|
| 80 |
-
python3 -m uvicorn app:app \
|
| 81 |
-
--host 0.0.0.0 \
|
| 82 |
-
--port 7860 \
|
| 83 |
-
--workers 1 \
|
| 84 |
-
--timeout-keep-alive 300
|
|
|
|
| 1 |
+
#!/bin/bash
# Start the PaddleOCR-VL + Gemini hybrid parser API
# Single process: FastAPI with PaddleOCR-VL-1.5 loaded in-process
# Note: Dockerfile should ensure this script is executable (chmod +x)

# Start FastAPI. `exec` replaces this shell so uvicorn becomes PID 1 and
# receives SIGTERM directly from the container runtime on shutdown.
exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|