sidoutcome commited on
Commit
16b2195
Β·
1 Parent(s): ba23da1

feat: v5.0.0 PaddleOCR-VL-1.5 + Gemini hybrid architecture

Browse files

- Replace Qwen3-VL + Docling with PaddleOCR-VL-1.5 (0.9B params, #1 OmniDocBench 94.5%)
- Keep Gemini 3 Flash for table page enhancement only
- Split monolithic app.py into 8 focused modules
- Switch from A100 to T4 GPU (84% cost reduction)
- Native cross-page table merging via PP-DocLayoutV2
- Enhanced post-processing: footer/artifact removal, table cleanup

Files changed (11) hide show
  1. Dockerfile +41 -38
  2. app.py +51 -1556
  3. auth.py +89 -0
  4. config.py +37 -0
  5. gemini.py +132 -0
  6. models.py +40 -0
  7. pipeline.py +210 -0
  8. postprocess.py +341 -0
  9. rendering.py +112 -0
  10. requirements.txt +9 -15
  11. start.sh +6 -83
Dockerfile CHANGED
@@ -1,9 +1,13 @@
1
- # Hugging Face Spaces Dockerfile for Docling VLM Document Parser API
2
- # GPU-accelerated document parsing with Docling + Qwen3-VL-30B-A3B via vLLM
3
- # Build: v2.0.0 - Docling with VLM backend for superior accuracy
 
 
 
 
4
 
5
- # Use vLLM base image with CUDA, PyTorch, and vLLM pre-installed
6
- FROM vllm/vllm-openai:v0.14.1
7
 
8
  USER root
9
 
@@ -12,18 +16,26 @@ RUN echo "========== BUILD STARTED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') ======
12
  # Install system dependencies
13
  RUN echo "========== STEP 1: Installing system dependencies ==========" && \
14
  apt-get update && apt-get install -y --no-install-recommends \
 
 
 
 
 
15
  # Fonts for document rendering
16
  fonts-noto-core \
17
  fonts-noto-cjk \
18
  fontconfig \
19
- # Image processing
20
  libgl1 \
21
  libglib2.0-0 \
22
- # PDF utilities
23
  poppler-utils \
24
  # Health checks
25
  curl \
26
  && fc-cache -fv && \
 
 
 
27
  rm -rf /var/lib/apt/lists/* && \
28
  echo "========== System dependencies installed =========="
29
 
@@ -33,24 +45,17 @@ RUN useradd -m -u 1000 user
33
  # Set environment variables
34
  ENV PYTHONUNBUFFERED=1 \
35
  PYTHONDONTWRITEBYTECODE=1 \
36
- VLM_MODEL=Qwen/Qwen3-VL-30B-A3B-Instruct \
37
- VLM_HOST=127.0.0.1 \
38
- VLM_PORT=8000 \
39
- VLM_GPU_MEMORY_UTILIZATION=0.85 \
40
- VLM_MAX_MODEL_LEN=65536 \
41
  IMAGES_SCALE=2.0 \
42
  MAX_FILE_SIZE_MB=1024 \
43
  HF_HOME=/home/user/.cache/huggingface \
44
- TORCH_HOME=/home/user/.cache/torch \
45
  XDG_CACHE_HOME=/home/user/.cache \
46
  HOME=/home/user \
47
- PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH \
48
- LD_LIBRARY_PATH=/home/user/.local/lib/python3.12/site-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
49
 
50
  # Create cache directories with correct ownership
51
  RUN echo "========== STEP 2: Creating cache directories ==========" && \
52
  mkdir -p /home/user/.cache/huggingface \
53
- /home/user/.cache/torch \
54
  /home/user/app && \
55
  chown -R user:user /home/user && \
56
  echo "========== Cache directories created =========="
@@ -62,30 +67,29 @@ WORKDIR /home/user/app
62
  # Copy requirements first for better caching
63
  COPY --chown=user:user requirements.txt .
64
 
65
- # Install Python dependencies
66
- RUN echo "========== STEP 3: Installing Python dependencies ==========" && \
67
- pip install --user --upgrade pip && \
68
- pip install --user nvidia-cudnn-cu12 && \
69
- pip install --user -r requirements.txt && \
 
 
 
 
 
 
70
  echo "Installed packages:" && \
71
  pip list --user && \
72
  echo "========== Python dependencies installed =========="
73
 
74
- # Pre-download Qwen3-VL-30B-A3B model for vLLM (use default HF cache so vLLM resolves by repo ID)
75
- RUN echo "========== STEP 4: Pre-downloading Qwen3-VL-30B-A3B model ==========" && \
76
- python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen3-VL-30B-A3B-Instruct')" && \
77
  echo "Model cache summary:" && \
 
78
  du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
79
- echo "========== Qwen3-VL-30B-A3B model downloaded =========="
80
-
81
- # Pre-download Docling models
82
- RUN echo "========== STEP 5: Pre-downloading Docling models ==========" && \
83
- python3 -c "from docling.document_converter import DocumentConverter; print('Downloading Docling models...'); converter = DocumentConverter(); print('Done')" && \
84
- echo "Model cache summary:" && \
85
- du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
86
- du -sh /home/user/.cache/torch 2>/dev/null || echo " Torch cache: (empty)" && \
87
  du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
88
- echo "========== Docling models downloaded =========="
89
 
90
  # Copy application code
91
  COPY --chown=user:user . .
@@ -95,13 +99,12 @@ RUN echo "========== STEP 6: Finalizing build ==========" && \
95
  echo "Files in app directory:" && ls -la /home/user/app/ && \
96
  echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="
97
 
98
- # Expose the port
99
  EXPOSE 7860
100
 
101
- # Health check (longer start-period for vLLM model loading)
102
- HEALTHCHECK --interval=30s --timeout=30s --start-period=600s --retries=5 \
103
  CMD curl -f http://localhost:7860/ || exit 1
104
 
105
- # Override vLLM entrypoint and use our startup script
106
- ENTRYPOINT []
107
  CMD ["/bin/bash", "/home/user/app/start.sh"]
 
1
+ # Hugging Face Spaces Dockerfile for PaddleOCR-VL Document Parser API
2
+ # GPU-accelerated document parsing with PaddleOCR-VL-1.5 + PaddlePaddle
3
+ # Build: v5.0.0 - PaddleOCR-VL for high-quality OCR on Nvidia T4
4
+ #
5
+ # NOTE: Run with --shm-size 16g for PaddlePaddle shared memory:
6
+ # docker build -t hf-docling .
7
+ # docker run --gpus all --shm-size 16g -p 7860:7860 -e API_TOKEN=test hf-docling
8
 
9
+ # CUDA 12.6 runtime with cuDNN (required by PaddlePaddle GPU)
10
+ FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04
11
 
12
  USER root
13
 
 
16
  # Install system dependencies
17
  RUN echo "========== STEP 1: Installing system dependencies ==========" && \
18
  apt-get update && apt-get install -y --no-install-recommends \
19
+ # Python 3.11
20
+ python3.11 \
21
+ python3.11-venv \
22
+ python3.11-dev \
23
+ python3-pip \
24
  # Fonts for document rendering
25
  fonts-noto-core \
26
  fonts-noto-cjk \
27
  fontconfig \
28
+ # Image processing (required by OpenCV)
29
  libgl1 \
30
  libglib2.0-0 \
31
+ # PDF utilities (required by pdf2image)
32
  poppler-utils \
33
  # Health checks
34
  curl \
35
  && fc-cache -fv && \
36
+ # Set python3.11 as default python3/python
37
+ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
38
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
39
  rm -rf /var/lib/apt/lists/* && \
40
  echo "========== System dependencies installed =========="
41
 
 
45
  # Set environment variables
46
  ENV PYTHONUNBUFFERED=1 \
47
  PYTHONDONTWRITEBYTECODE=1 \
 
 
 
 
 
48
  IMAGES_SCALE=2.0 \
49
  MAX_FILE_SIZE_MB=1024 \
50
  HF_HOME=/home/user/.cache/huggingface \
 
51
  XDG_CACHE_HOME=/home/user/.cache \
52
  HOME=/home/user \
53
+ PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
 
54
 
55
  # Create cache directories with correct ownership
56
  RUN echo "========== STEP 2: Creating cache directories ==========" && \
57
  mkdir -p /home/user/.cache/huggingface \
58
+ /home/user/.cache/paddleocr \
59
  /home/user/app && \
60
  chown -R user:user /home/user && \
61
  echo "========== Cache directories created =========="
 
67
  # Copy requirements first for better caching
68
  COPY --chown=user:user requirements.txt .
69
 
70
+ # Install PaddlePaddle GPU (must be installed before paddleocr)
71
+ RUN echo "========== STEP 3: Installing PaddlePaddle GPU ==========" && \
72
+ python -m pip install --user --upgrade pip && \
73
+ python -m pip install --user paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ && \
74
+ echo "PaddlePaddle version:" && \
75
+ python -c "import paddle; print(paddle.__version__); print('CUDA:', paddle.is_compiled_with_cuda())" && \
76
+ echo "========== PaddlePaddle GPU installed =========="
77
+
78
+ # Install Python dependencies from requirements.txt
79
+ RUN echo "========== STEP 4: Installing Python dependencies ==========" && \
80
+ python -m pip install --user -r requirements.txt && \
81
  echo "Installed packages:" && \
82
  pip list --user && \
83
  echo "========== Python dependencies installed =========="
84
 
85
+ # Pre-download PaddleOCR-VL-1.5 model at build time (avoids download on first request)
86
+ RUN echo "========== STEP 5: Pre-downloading PaddleOCR-VL-1.5 model ==========" && \
87
+ python -c "from paddleocr import PaddleOCRVL; PaddleOCRVL()" && \
88
  echo "Model cache summary:" && \
89
+ du -sh /home/user/.cache/paddleocr 2>/dev/null || echo " PaddleOCR cache: (empty)" && \
90
  du -sh /home/user/.cache/huggingface 2>/dev/null || echo " HF cache: (empty)" && \
 
 
 
 
 
 
 
 
91
  du -sh /home/user/.cache 2>/dev/null || echo " Total cache: (empty)" && \
92
+ echo "========== PaddleOCR-VL-1.5 model downloaded =========="
93
 
94
  # Copy application code
95
  COPY --chown=user:user . .
 
99
  echo "Files in app directory:" && ls -la /home/user/app/ && \
100
  echo "========== BUILD COMPLETED at $(date -u '+%Y-%m-%d %H:%M:%S UTC') =========="
101
 
102
+ # Expose the port (HF Spaces standard)
103
  EXPOSE 7860
104
 
105
+ # Health check
106
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=120s --retries=5 \
107
  CMD curl -f http://localhost:7860/ || exit 1
108
 
109
+ # Single-process FastAPI app (no vLLM sidecar needed)
 
110
  CMD ["/bin/bash", "/home/user/app/start.sh"]
app.py CHANGED
@@ -1,1512 +1,54 @@
1
  """
2
- Docling VLM Parser API v4.0.0
3
-
4
- A FastAPI service using a VLM + Gemini hybrid architecture for document parsing:
5
- Pass 1 (GPU): Qwen3-VL via vLLM β€” concurrent OCR on ALL pages (fast text extraction)
6
- Detect: Identify pages with tables from VLM markdown output
7
- Pass 2 (API): Gemini 2.5 Flash on table pages ONLY (superior table extraction)
8
- Merge: VLM text for non-table pages + Gemini output for table pages
9
- Post: Cross-page artifact removal, table cleanup, deduplication
10
-
11
- v4.0.0 β€” Gemini table extraction:
12
- - Quality: Gemini 2.5 Flash replaces Docling TableFormer for table pages
13
- - Quality: Table pages use Gemini's full output (text + tables) for best quality
14
- - Speed: No more CPU-bound Docling pipeline β€” Gemini API is fast
15
- - Quality: DPI 200 for clear page images sent to Gemini
16
- - Quality: Post-processing removes cross-page artifacts, deduplicates, cleans tables
17
  """
18
 
19
  import asyncio
20
- import base64
21
- import io
22
- import ipaddress
23
- import logging
24
- import os
25
  import re
26
- import secrets
27
  import shutil
28
- import socket
29
  import tempfile
30
  import time
31
- import zipfile
32
- from concurrent.futures import ThreadPoolExecutor, as_completed
33
  from contextlib import asynccontextmanager
34
  from pathlib import Path
35
- from typing import BinaryIO, Optional, Union
36
- from urllib.parse import urlparse
37
  from uuid import uuid4
38
 
39
- import cv2
40
  import httpx
41
- import torch
42
  from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
43
- from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
44
- from pdf2image import convert_from_path
45
- from pydantic import BaseModel
46
-
47
- # Docling imports
48
- from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
49
- from docling.datamodel.base_models import InputFormat
50
- from docling.datamodel.document import PictureItem, TableItem
51
- from docling.datamodel.pipeline_options import (
52
- AcceleratorOptions,
53
- PdfPipelineOptions,
54
- RapidOcrOptions,
55
- TableFormerMode,
56
- )
57
- from docling.document_converter import DocumentConverter, PdfFormatOption
58
-
59
- # Configure logging
60
- logging.basicConfig(
61
- level=logging.INFO,
62
- format="%(asctime)s | %(levelname)-8s | %(message)s",
63
- datefmt="%Y-%m-%d %H:%M:%S",
64
- )
65
- logger = logging.getLogger("docling-parser")
66
-
67
- # Security
68
- API_TOKEN = os.getenv("API_TOKEN")
69
- security = HTTPBearer()
70
-
71
-
72
- def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
73
- """Verify the API token from Authorization header."""
74
- if not API_TOKEN:
75
- raise HTTPException(
76
- status_code=500,
77
- detail="No API token configured on server",
78
- )
79
-
80
- token = credentials.credentials
81
- if not secrets.compare_digest(token, API_TOKEN):
82
- raise HTTPException(
83
- status_code=401,
84
- detail="Invalid API token",
85
- )
86
- return token
87
-
88
-
89
- # VLM Configuration
90
- VLM_MODEL = os.getenv("VLM_MODEL", "Qwen/Qwen3-VL-30B-A3B-Instruct")
91
- VLM_HOST = os.getenv("VLM_HOST", "127.0.0.1")
92
- VLM_PORT = os.getenv("VLM_PORT", "8000")
93
- IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
94
- MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
95
- MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
96
- VLM_TIMEOUT = float(os.getenv("VLM_TIMEOUT", "300"))
97
- VLM_CONCURRENCY = int(os.getenv("VLM_CONCURRENCY", "4"))
98
- RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
99
-
100
- # Gemini API Configuration (for table page extraction)
101
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
102
- GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
103
- GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
104
- GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "4"))
105
-
106
- # Blocked hostnames for SSRF protection
107
- BLOCKED_HOSTNAMES = {
108
- "localhost",
109
- "metadata",
110
- "metadata.google.internal",
111
- "metadata.google",
112
- "169.254.169.254",
113
- "fd00:ec2::254",
114
- }
115
-
116
- # Global converter instance (initialized on startup)
117
- _converter: Optional[DocumentConverter] = None
118
-
119
-
120
- def _get_device() -> str:
121
- """Get the best available device for processing."""
122
- if torch.cuda.is_available():
123
- return "cuda"
124
- elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
125
- return "mps"
126
- return "cpu"
127
-
128
-
129
- def _validate_url(url: str) -> None:
130
- """Validate URL to prevent SSRF attacks."""
131
- try:
132
- parsed = urlparse(url)
133
- except Exception as e:
134
- raise HTTPException(
135
- status_code=400,
136
- detail=f"Invalid URL format: {str(e)}",
137
- )
138
-
139
- if parsed.scheme not in ("http", "https"):
140
- raise HTTPException(
141
- status_code=400,
142
- detail=f"Invalid URL scheme '{parsed.scheme}'. Only http and https are allowed.",
143
- )
144
-
145
- hostname = parsed.hostname
146
- if not hostname:
147
- raise HTTPException(
148
- status_code=400,
149
- detail="Invalid URL: missing hostname.",
150
- )
151
-
152
- hostname_lower = hostname.lower()
153
- if hostname_lower in BLOCKED_HOSTNAMES:
154
- raise HTTPException(
155
- status_code=400,
156
- detail="Access to internal/metadata services is not allowed.",
157
- )
158
-
159
- blocked_patterns = ["metadata", "internal", "localhost", "127.0.0.1", "::1"]
160
- for pattern in blocked_patterns:
161
- if pattern in hostname_lower:
162
- raise HTTPException(
163
- status_code=400,
164
- detail="Access to internal/metadata services is not allowed.",
165
- )
166
-
167
- try:
168
- ip_str = socket.gethostbyname(hostname)
169
- ip = ipaddress.ip_address(ip_str)
170
- except socket.gaierror:
171
- raise HTTPException(
172
- status_code=400,
173
- detail=f"Could not resolve hostname: {hostname}",
174
- )
175
- except ValueError as e:
176
- raise HTTPException(
177
- status_code=400,
178
- detail=f"Invalid IP address resolved: {str(e)}",
179
- )
180
-
181
- if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
182
- raise HTTPException(
183
- status_code=400,
184
- detail="Access to private/internal IP addresses is not allowed.",
185
- )
186
-
187
-
188
- def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
189
- """Sync helper to save uploaded file to disk."""
190
- with open(input_path, "wb") as f:
191
- shutil.copyfileobj(file_obj, f)
192
-
193
-
194
- def _save_downloaded_content(input_path: Path, content: bytes) -> None:
195
- """Sync helper to save downloaded content to disk."""
196
- with open(input_path, "wb") as f:
197
- f.write(content)
198
-
199
-
200
- # ---------------------------------------------------------------------------
201
- # Pydantic Models
202
- # ---------------------------------------------------------------------------
203
-
204
-
205
- class ParseResponse(BaseModel):
206
- """Response model for document parsing."""
207
-
208
- success: bool
209
- markdown: Optional[str] = None
210
- json_content: Optional[Union[dict, list]] = None
211
- images_zip: Optional[str] = None
212
- image_count: int = 0
213
- error: Optional[str] = None
214
- pages_processed: int = 0
215
- device_used: Optional[str] = None
216
- vlm_model: Optional[str] = None
217
-
218
-
219
- class HealthResponse(BaseModel):
220
- """Health check response."""
221
-
222
- status: str
223
- version: str
224
- device: str
225
- gpu_name: Optional[str] = None
226
- vlm_model: str = ""
227
- vlm_status: str = "unknown"
228
- images_scale: float = 2.0
229
-
230
-
231
- class URLParseRequest(BaseModel):
232
- """Request model for URL-based parsing."""
233
-
234
- url: str
235
- output_format: str = "markdown"
236
- images_scale: Optional[float] = None
237
- start_page: int = 0
238
- end_page: Optional[int] = None
239
- include_images: bool = False
240
-
241
-
242
- # ---------------------------------------------------------------------------
243
- # OpenCV Image Preprocessing (CLAHE only β€” fast)
244
- # ---------------------------------------------------------------------------
245
-
246
-
247
- def _preprocess_image_for_ocr(image_path: str) -> str:
248
- """Enhance image quality for better OCR accuracy.
249
-
250
- Applies CLAHE contrast enhancement only (fast).
251
- Denoising was removed in v3.2.1 β€” it added ~10s/page with minimal
252
- benefit for VLM-based OCR which handles noise well.
253
- """
254
- img = cv2.imread(image_path)
255
- if img is None:
256
- return image_path
257
-
258
- # CLAHE contrast enhancement on L channel
259
- lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
260
- l, a, b = cv2.split(lab)
261
- clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
262
- l = clahe.apply(l)
263
- lab = cv2.merge([l, a, b])
264
- img = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
265
-
266
- cv2.imwrite(image_path, img)
267
- return image_path
268
-
269
-
270
- # ---------------------------------------------------------------------------
271
- # VLM OCR with retry
272
- # ---------------------------------------------------------------------------
273
-
274
- # Strip Qwen3 <think>...</think> reasoning blocks
275
- _THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
276
-
277
- # Post-processing patterns for VLM output cleanup
278
- _CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
279
- _CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
280
- _HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
281
- _PAGE_N_PATTERN = re.compile(r"^\s*Page\s+\d+\s*$\n?", re.MULTILINE)
282
-
283
-
284
- def _clean_vlm_output(content: str) -> str:
285
- """Post-process VLM output to clean artifacts.
286
-
287
- Removes: code fences, HTML comments, 'Page N' artifacts,
288
- and converts any remaining LaTeX tables to markdown format.
289
- """
290
- # Strip <think> blocks
291
- content = _THINK_PATTERN.sub("", content).strip()
292
-
293
- # Strip code fence wrappers
294
- content = _CODE_FENCE_PATTERN.sub("", content)
295
- content = _CODE_FENCE_END.sub("", content)
296
-
297
- # Strip HTML comments (VLM sometimes adds coordinate annotations)
298
- content = _HTML_COMMENT_PATTERN.sub("", content)
299
-
300
- # Strip "Page N" artifacts
301
- content = _PAGE_N_PATTERN.sub("", content)
302
-
303
- # Fix escaped quotes (VLM sometimes escapes them unnecessarily)
304
- content = content.replace('\\"', '"')
305
-
306
- # Convert LaTeX tables to markdown if VLM ignores the prompt
307
- content = _convert_latex_tables_to_markdown(content)
308
-
309
- return content.strip()
310
 
311
-
312
- def _convert_latex_tables_to_markdown(text: str) -> str:
313
- """Convert LaTeX tabular environments to markdown pipe tables."""
314
- latex_pattern = re.compile(
315
- r"\\begin\{tabular\}\{[^}]*\}(.*?)\\end\{tabular\}", re.DOTALL
316
- )
317
-
318
- def _latex_to_md(match: re.Match) -> str:
319
- body = match.group(1)
320
- # Remove \hline
321
- body = re.sub(r"\\hline\s*", "", body)
322
- # Split on \\
323
- rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
324
- if not rows:
325
- return match.group(0)
326
-
327
- md_rows = []
328
- for i, row in enumerate(rows):
329
- cells = [c.strip() for c in row.split("&")]
330
- md_row = "| " + " | ".join(cells) + " |"
331
- md_rows.append(md_row)
332
- if i == 0:
333
- # Add separator after header
334
- sep = "| " + " | ".join(["---"] * len(cells)) + " |"
335
- md_rows.append(sep)
336
-
337
- return "\n".join(md_rows)
338
-
339
- return latex_pattern.sub(_latex_to_md, text)
340
-
341
-
342
- # ---------------------------------------------------------------------------
343
- # Post-Processing: Cross-page artifact removal (applied AFTER page merge)
344
- # ---------------------------------------------------------------------------
345
-
346
- # Day-of-week date lines (e.g., "Thursday, October 31, 2024")
347
- _STANDALONE_DATE = re.compile(
348
- r"^\s*(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
349
- r"(?:January|February|March|April|May|June|July|August|September|"
350
- r"October|November|December)\s+\d{1,2},\s+\d{4}\s*$",
351
- re.MULTILINE,
352
- )
353
- # Standalone time (e.g., "11:30 AM")
354
- _STANDALONE_TIME = re.compile(r"^\s*\d{1,2}:\d{2}\s*(?:AM|PM)\s*$", re.MULTILINE)
355
- # Page footer patterns: "N | address" or "N address N" (e.g., "2 | 8575 W Golf Rd, Niles, IL 60714 | 3")
356
- _PAGE_FOOTER = re.compile(
357
- r"^\s*\d{1,3}\s*\|?\s*\d{2,5}\s+\w.*(?:Rd|St|Ave|Blvd|Dr|Ln|Way|Ct)\b.*\d{5}.*$",
358
- re.MULTILINE,
359
- )
360
- # Standalone page number lines (e.g., "12" alone on a line)
361
- _STANDALONE_PAGE_NUM = re.compile(r"^\s*\d{1,3}\s*$", re.MULTILINE)
362
- # Numbered section pattern: "N. TITLE" where N is 1-99 and TITLE is mostly uppercase
363
- _NUMBERED_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][A-Z\s\-/&,]+(?:\.\s*)?)")
364
- # Table row with ALL empty cells (e.g., "| | | | |")
365
- _EMPTY_TABLE_ROW = re.compile(r"^\|(?:\s*\|)+\s*$", re.MULTILINE)
366
- # Trailing empty cells in a table row (e.g., "| data | data | | | |")
367
- _TRAILING_EMPTY_CELLS = re.compile(r"(?:\s*\|\s*){2,}\s*$")
368
- # Table separator row (e.g., "|---|---|---|")
369
- _TABLE_SEP_ROW = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$")
370
-
371
-
372
- def _post_process_merged_markdown(content: str) -> str:
373
- """Post-process merged multi-page markdown to fix cross-page artifacts.
374
-
375
- Applied after all pages are concatenated. Fixes:
376
- - Duplicate document headings (VLM re-extracts page headers)
377
- - Duplicate short metadata lines (subtitles, dates repeated per page)
378
- - Page footer/header artifacts (standalone dates, times, page numbers)
379
- - Numbered section heading normalization (consistent ## levels)
380
- - Table artifacts (empty rows, trailing empty cells)
381
- - Cross-page table continuations (merge split tables)
382
- - Excessive whitespace
383
- """
384
- content = _deduplicate_headings(content)
385
- content = _deduplicate_short_blocks(content)
386
- content = _remove_page_boundary_artifacts(content)
387
- content = _normalize_numbered_headings(content)
388
- content = _clean_table_artifacts(content)
389
- content = _merge_split_tables(content)
390
- # Normalize runs of 4+ newlines to 3
391
- content = re.sub(r"\n{4,}", "\n\n\n", content)
392
- return content.strip()
393
-
394
-
395
- def _deduplicate_headings(content: str) -> str:
396
- """Remove duplicate heading lines, keeping only the first occurrence.
397
-
398
- When VLM processes each page, it may re-extract page headers/document titles.
399
- This removes exact duplicate headings while preserving table rows and body text.
400
- """
401
- lines = content.split("\n")
402
- seen_headings: set[str] = set()
403
- result: list[str] = []
404
-
405
- for line in lines:
406
- stripped = line.strip()
407
- if stripped.startswith("#"):
408
- # Normalize heading for comparison (lowercase, strip trailing #)
409
- key = stripped.lstrip("#").strip().lower()
410
- if key and key in seen_headings:
411
- continue # Skip duplicate heading
412
- if key:
413
- seen_headings.add(key)
414
- result.append(line)
415
-
416
- return "\n".join(result)
417
-
418
-
419
- def _deduplicate_short_blocks(content: str) -> str:
420
- """Remove duplicate short text blocks that repeat across pages.
421
-
422
- When VLM processes each page, it may re-extract document subtitles,
423
- metadata lines, and other short repeating text. This removes exact
424
- duplicates of short non-table blocks (< 120 chars).
425
- """
426
- blocks = content.split("\n\n")
427
- seen: set[str] = set()
428
- result: list[str] = []
429
-
430
- for block in blocks:
431
- stripped = block.strip()
432
- if not stripped:
433
- result.append(block)
434
- continue
435
-
436
- # Only deduplicate short, non-table, non-heading blocks
437
- is_table = stripped.startswith("|") and "|" in stripped[1:]
438
- is_heading = stripped.startswith("#")
439
- if is_table or is_heading or len(stripped) > 120:
440
- result.append(block)
441
- continue
442
-
443
- key = stripped.lower()
444
- if key in seen:
445
- continue # Skip duplicate short block
446
-
447
- seen.add(key)
448
- result.append(block)
449
-
450
- return "\n\n".join(result)
451
-
452
-
453
- def _remove_page_boundary_artifacts(content: str) -> str:
454
- """Remove page footer/header artifacts like standalone dates, times, page numbers, and footers."""
455
- content = _STANDALONE_DATE.sub("", content)
456
- content = _STANDALONE_TIME.sub("", content)
457
- content = _PAGE_FOOTER.sub("", content)
458
- content = _STANDALONE_PAGE_NUM.sub("", content)
459
- return content
460
-
461
-
462
- def _normalize_numbered_headings(content: str) -> str:
463
- """Normalize numbered section headings to consistent ## level.
464
-
465
- VLM inconsistently formats numbered sections like "3. OCCUPANCY" β€”
466
- some get ## headings, some are plain text. This detects the pattern
467
- and ensures all numbered sections at the same level use ## headings.
468
- """
469
- lines = content.split("\n")
470
- result: list[str] = []
471
-
472
- # First pass: detect which numbered sections exist and their heading status
473
- sections_with_heading: set[int] = set()
474
- sections_without_heading: set[int] = set()
475
-
476
- for line in lines:
477
- stripped = line.strip()
478
- # Already a heading like "## 3. OCCUPANCY"
479
- heading_match = re.match(r"^#{1,3}\s+(\d{1,2})\.\s+[A-Z]", stripped)
480
- if heading_match:
481
- sections_with_heading.add(int(heading_match.group(1)))
482
- continue
483
- # Plain text like "3. OCCUPANCY. Tenant shall..."
484
- plain_match = _NUMBERED_SECTION.match(stripped)
485
- if plain_match:
486
- sections_without_heading.add(int(plain_match.group(1)))
487
-
488
- # If there's a mix of headed and non-headed numbered sections, normalize
489
- if sections_with_heading and sections_without_heading:
490
- for i, line in enumerate(lines):
491
- stripped = line.strip()
492
- # Check if this is a non-headed numbered section that should be a heading
493
- plain_match = _NUMBERED_SECTION.match(stripped)
494
- if plain_match:
495
- section_num = int(plain_match.group(1))
496
- if section_num in sections_without_heading:
497
- # Check that it looks like a section start (followed by text)
498
- # Split at the first sentence end to make the heading
499
- # Extract just "N. TITLE." as heading, keep body text
500
- title_end = plain_match.end()
501
- title = stripped[:title_end].rstrip(".")
502
- body = stripped[title_end:].strip()
503
- if body:
504
- result.append(f"## {title}")
505
- result.append(body)
506
- else:
507
- result.append(f"## {title}")
508
- continue
509
- result.append(line)
510
- else:
511
- result = lines
512
-
513
- return "\n".join(result)
514
-
515
-
516
- def _clean_table_artifacts(content: str) -> str:
517
- """Clean table formatting artifacts.
518
-
519
- - Removes table rows where ALL cells are empty
520
- - Strips trailing empty cells from table rows
521
- - Removes orphaned separator rows not preceded by a header
522
- """
523
- lines = content.split("\n")
524
- result: list[str] = []
525
-
526
- for i, line in enumerate(lines):
527
- stripped = line.strip()
528
-
529
- # Skip completely empty table rows (| | | | |)
530
- if _EMPTY_TABLE_ROW.match(stripped):
531
- continue
532
-
533
- # Clean trailing empty cells from table data rows
534
- if stripped.startswith("|") and "|" in stripped[1:]:
535
- # Don't touch separator rows
536
- if not _TABLE_SEP_ROW.match(stripped):
537
- # Remove trailing empty cells
538
- cleaned = _TRAILING_EMPTY_CELLS.sub(" |", stripped)
539
- result.append(cleaned)
540
- continue
541
-
542
- result.append(line)
543
-
544
- return "\n".join(result)
545
-
546
-
547
- def _is_table_line(line: str) -> bool:
548
- """Check if a line is a markdown table row or separator."""
549
- s = line.strip()
550
- return bool(s.startswith("|") and s.endswith("|") and s.count("|") >= 3)
551
-
552
-
553
- def _count_columns(line: str) -> int:
554
- """Count the number of columns in a table row."""
555
- s = line.strip()
556
- if not s.startswith("|"):
557
- return 0
558
- # Split by | and count non-boundary segments
559
- parts = s.split("|")
560
- # First and last are empty strings from leading/trailing |
561
- return max(0, len(parts) - 2)
562
-
563
-
564
- def _merge_split_tables(content: str) -> str:
565
- """Merge table continuations that were split across pages.
566
-
567
- Detects when non-table content (whitespace, duplicate metadata) separates
568
- what should be a single table, and merges the data rows.
569
- """
570
- lines = content.split("\n")
571
- result: list[str] = []
572
- i = 0
573
-
574
- while i < len(lines):
575
- result.append(lines[i])
576
- i += 1
577
-
578
- # Check if we just appended a table row and the next chunk looks like
579
- # a table continuation (another table with similar column count)
580
- if not _is_table_line(result[-1]):
581
- continue
582
-
583
- last_table_cols = _count_columns(result[-1])
584
- if last_table_cols < 2:
585
- continue
586
-
587
- # Look ahead past empty lines / short non-table lines
588
- j = i
589
- gap_lines: list[str] = []
590
- while j < len(lines):
591
- s = lines[j].strip()
592
- if s == "":
593
- gap_lines.append(lines[j])
594
- j += 1
595
- continue
596
- break
597
-
598
- if j >= len(lines):
599
- continue
600
-
601
- # Check if the next non-empty line starts a table
602
- if not _is_table_line(lines[j]):
603
- continue
604
-
605
- next_table_cols = _count_columns(lines[j])
606
-
607
- # If column counts are close (within 30%), it's likely a continuation
608
- if last_table_cols < 2 or next_table_cols < 2:
609
- continue
610
- ratio = min(last_table_cols, next_table_cols) / max(last_table_cols, next_table_cols)
611
- if ratio < 0.7:
612
- continue
613
-
614
- # Check if the new table starts with header + separator (indicating
615
- # the VLM re-extracted headers on the next page)
616
- has_new_header = False
617
- if _is_table_line(lines[j]):
618
- # Look for a separator row in the next 1-2 lines
619
- for k in range(j + 1, min(j + 3, len(lines))):
620
- if _TABLE_SEP_ROW.match(lines[k].strip()):
621
- has_new_header = True
622
- break
623
-
624
- if has_new_header:
625
- # Skip the gap, skip the duplicate header + separator, keep data rows
626
- # Find the separator row
627
- skip_to = j
628
- while skip_to < len(lines):
629
- if _TABLE_SEP_ROW.match(lines[skip_to].strip()):
630
- skip_to += 1 # Skip past separator
631
- break
632
- skip_to += 1
633
- i = skip_to
634
- else:
635
- # No header β€” just skip the gap and append the continuation rows
636
- i = j
637
-
638
- return "\n".join(result)
639
-
640
-
641
- def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
642
- """Send a page image to Qwen3-VL via vLLM for text extraction.
643
-
644
- Includes retry logic: on timeout/failure, retries once with longer timeout.
645
- Strips <think> reasoning tokens from Qwen3 output.
646
- """
647
- b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
648
-
649
- payload = {
650
- "model": VLM_MODEL,
651
- "messages": [
652
- {
653
- "role": "user",
654
- "content": [
655
- {
656
- "type": "image_url",
657
- "image_url": {"url": f"data:image/png;base64,{b64_image}"},
658
- },
659
- {
660
- "type": "text",
661
- "text": (
662
- "Convert this document page to markdown format.\n\n"
663
- "Rules:\n"
664
- "- Extract ALL text content exactly as written\n"
665
- "- Use ## headings for section titles\n"
666
- "- Preserve lists, paragraphs, and document structure\n"
667
- "- For tables:\n"
668
- " * Read EVERY column header exactly as printed β€” do NOT skip, rename, or reorder columns\n"
669
- " * Include ALL columns even if the table is very wide\n"
670
- " * Format as markdown tables with | delimiters and --- separator rows\n"
671
- " * Each data row must have the same number of cells as the header\n"
672
- " * NEVER use LaTeX (no \\begin{tabular}, no \\hline, no &)\n"
673
- "- NEVER wrap output in code fences (no ```)\n"
674
- "- NEVER add HTML comments or coordinate annotations\n"
675
- "- Do NOT include page headers, footers, page numbers, or timestamps that repeat on every page\n"
676
- "- For handwritten text, transcribe as accurately as possible\n"
677
- "- Output ONLY the extracted markdown content, nothing else"
678
- ),
679
- },
680
- ],
681
- }
682
- ],
683
- "max_tokens": 32768,
684
- "temperature": 0.1,
685
- # Disable Qwen3 thinking mode to avoid <think> tokens
686
- "chat_template_kwargs": {"enable_thinking": False},
687
- }
688
-
689
- url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
690
-
691
- # Try with primary timeout, then retry once with extended timeout
692
- for attempt, timeout in enumerate([VLM_TIMEOUT, VLM_TIMEOUT * 1.5], start=1):
693
- try:
694
- response = httpx.post(url, json=payload, timeout=timeout)
695
- if response.status_code != 200:
696
- try:
697
- err = response.json()
698
- msg = err.get("message", err.get("detail", str(err)[:300]))
699
- except Exception:
700
- msg = response.text[:300]
701
- logger.error(f"[{request_id}] vLLM error ({response.status_code}) page {page_no}: {msg}")
702
- if attempt == 1:
703
- logger.info(f"[{request_id}] Retrying page {page_no}...")
704
- continue
705
- response.raise_for_status()
706
-
707
- result = response.json()
708
- choices = result.get("choices")
709
- if not choices:
710
- raise ValueError("vLLM returned no choices")
711
- content = choices[0].get("message", {}).get("content")
712
- if content is None:
713
- raise ValueError("vLLM response missing content")
714
-
715
- # Clean VLM output (strip think blocks, code fences, HTML comments, convert LaTeX tables)
716
- content = _clean_vlm_output(content)
717
-
718
- return content
719
-
720
- except (httpx.TimeoutException, httpx.ConnectError) as e:
721
- if attempt == 1:
722
- logger.warning(
723
- f"[{request_id}] VLM attempt {attempt} failed on page {page_no}: {e}. Retrying..."
724
- )
725
- continue
726
- raise
727
-
728
- raise RuntimeError(f"VLM failed after 2 attempts on page {page_no}")
729
-
730
-
731
- def _vlm_extract_tables(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> Optional[str]:
732
- """Send a page image to VLM with a table-focused prompt for better table extraction.
733
-
734
- Used as a second pass on pages where tables were detected in the first pass.
735
- Returns extracted tables as markdown, or None on failure.
736
- """
737
- b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
738
-
739
- payload = {
740
- "model": VLM_MODEL,
741
- "messages": [
742
- {
743
- "role": "user",
744
- "content": [
745
- {
746
- "type": "image_url",
747
- "image_url": {"url": f"data:image/png;base64,{b64_image}"},
748
- },
749
- {
750
- "type": "text",
751
- "text": (
752
- "Extract ONLY the tables from this document page as markdown.\n\n"
753
- "Rules:\n"
754
- "- Read every column header EXACTLY as printed on the page\n"
755
- "- Include ALL columns β€” do NOT skip any, even if the table is very wide\n"
756
- "- Each data row must have the same number of | cells as the header row\n"
757
- "- Use | delimiters and --- separator rows\n"
758
- "- Preserve all numbers, text, and formatting exactly\n"
759
- "- Add spaces between words β€” never concatenate (e.g., 'CAP Rate' not 'CAPRate')\n"
760
- "- If multiple tables exist, separate them with a blank line\n"
761
- "- Include a short heading (## or ###) before each table if one is visible\n"
762
- "- NEVER use LaTeX table syntax\n"
763
- "- Output ONLY the markdown tables, nothing else"
764
- ),
765
- },
766
- ],
767
- }
768
- ],
769
- "max_tokens": 32768,
770
- "temperature": 0.1,
771
- "chat_template_kwargs": {"enable_thinking": False},
772
- }
773
-
774
- url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
775
-
776
- try:
777
- response = httpx.post(url, json=payload, timeout=VLM_TIMEOUT)
778
- if response.status_code != 200:
779
- logger.warning(f"[{request_id}] Table re-prompt failed for page {page_no}: {response.status_code}")
780
- return None
781
-
782
- result = response.json()
783
- choices = result.get("choices")
784
- if not choices:
785
- return None
786
- content = choices[0].get("message", {}).get("content")
787
- if content is None:
788
- return None
789
-
790
- content = _clean_vlm_output(content)
791
- return content if content.strip() else None
792
-
793
- except Exception as e:
794
- logger.warning(f"[{request_id}] Table re-prompt error for page {page_no}: {e}")
795
- return None
796
-
797
-
798
- # ---------------------------------------------------------------------------
799
- # Table Detection from VLM Output
800
- # ---------------------------------------------------------------------------
801
-
802
- # Markdown table separator: | --- | --- | or |:---:|---:|
803
- _MD_TABLE_SEPARATOR = re.compile(
804
- r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE
805
  )
806
-
807
- # LaTeX table markers (fallback if VLM ignores markdown instruction)
808
- _LATEX_TABLE_PATTERN = re.compile(r"\\begin\{tabular\}")
809
-
810
-
811
- def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
812
- """Detect pages containing tables from VLM markdown output.
813
-
814
- Checks for both markdown table separators and LaTeX tabular markers.
815
- """
816
- table_pages: set[int] = set()
817
- for page_no, text in vlm_page_texts.items():
818
- if text and (
819
- _MD_TABLE_SEPARATOR.search(text) or _LATEX_TABLE_PATTERN.search(text)
820
- ):
821
- table_pages.add(page_no)
822
- return table_pages
823
-
824
-
825
- # ---------------------------------------------------------------------------
826
- # Gemini API: Table Page Extraction
827
- # ---------------------------------------------------------------------------
828
-
829
-
830
- def _gemini_extract_page(
831
- page_image_bytes: bytes, request_id: str = "", page_no: int = 0
832
- ) -> Optional[str]:
833
- """Send a page image to Gemini 2.5 Flash for high-quality extraction.
834
-
835
- Used for table pages where VLM output is insufficient.
836
- Returns the full page markdown (text + tables), or None on failure.
837
- """
838
- if not GEMINI_API_KEY:
839
- logger.warning(f"[{request_id}] GEMINI_API_KEY not set β€” skipping Gemini extraction")
840
- return None
841
-
842
- b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
843
-
844
- payload = {
845
- "contents": [
846
- {
847
- "parts": [
848
- {
849
- "inline_data": {
850
- "mime_type": "image/png",
851
- "data": b64_image,
852
- }
853
- },
854
- {
855
- "text": (
856
- "Convert this document page to clean markdown format.\n\n"
857
- "Rules:\n"
858
- "- Extract ALL text content exactly as written\n"
859
- "- Use ## headings for section titles\n"
860
- "- Preserve lists, paragraphs, and document structure\n"
861
- "- For tables:\n"
862
- " * Read EVERY column header exactly as printed\n"
863
- " * Include ALL columns even if the table is very wide\n"
864
- " * Format as markdown tables with | delimiters and --- separator rows\n"
865
- " * Each data row must have the same number of cells as the header\n"
866
- " * Preserve multi-line cell content on separate lines within the cell\n"
867
- "- Do NOT wrap output in code fences\n"
868
- "- Do NOT add image descriptions or [Image:] tags\n"
869
- "- Do NOT include page headers, footers, or page numbers\n"
870
- "- Output ONLY the extracted markdown content"
871
- ),
872
- },
873
- ],
874
- }
875
- ],
876
- "generationConfig": {
877
- "temperature": 0.1,
878
- "maxOutputTokens": 32768,
879
- },
880
- }
881
-
882
- url = (
883
- f"https://generativelanguage.googleapis.com/v1beta/models/"
884
- f"{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
885
- )
886
-
887
- for attempt in range(1, 3):
888
- try:
889
- timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
890
- response = httpx.post(url, json=payload, timeout=timeout)
891
-
892
- if response.status_code == 429:
893
- # Rate limited β€” wait and retry
894
- logger.warning(
895
- f"[{request_id}] Gemini rate limited on page {page_no + 1}, "
896
- f"attempt {attempt}. Waiting 5s..."
897
- )
898
- time.sleep(5)
899
- continue
900
-
901
- if response.status_code != 200:
902
- try:
903
- err = response.json()
904
- msg = str(err.get("error", {}).get("message", str(err)[:300]))
905
- except Exception:
906
- msg = response.text[:300]
907
- logger.error(
908
- f"[{request_id}] Gemini error ({response.status_code}) "
909
- f"page {page_no + 1}: {msg}"
910
- )
911
- if attempt == 1:
912
- continue
913
- return None
914
-
915
- result = response.json()
916
- candidates = result.get("candidates", [])
917
- if not candidates:
918
- logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
919
- return None
920
-
921
- parts = candidates[0].get("content", {}).get("parts", [])
922
- if not parts:
923
- return None
924
-
925
- content = parts[0].get("text", "")
926
-
927
- # Clean up: strip code fences if Gemini wraps output
928
- content = _CODE_FENCE_PATTERN.sub("", content)
929
- content = _CODE_FENCE_END.sub("", content)
930
-
931
- return content.strip() if content.strip() else None
932
-
933
- except (httpx.TimeoutException, httpx.ConnectError) as e:
934
- if attempt == 1:
935
- logger.warning(
936
- f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}. Retrying..."
937
- )
938
- continue
939
- logger.error(f"[{request_id}] Gemini failed after 2 attempts on page {page_no + 1}: {e}")
940
- return None
941
-
942
- return None
943
-
944
-
945
- # ---------------------------------------------------------------------------
946
- # Mini-PDF Extraction (pypdf) β€” kept for fallback Docling path
947
- # ---------------------------------------------------------------------------
948
-
949
-
950
- def _extract_pages_to_pdf(
951
- input_path: Path, page_numbers: list[int], request_id: str
952
- ) -> tuple[Path, dict[int, int]]:
953
- """Extract specific pages from a PDF into a mini-PDF using pypdf.
954
-
955
- Args:
956
- input_path: Path to the original PDF
957
- page_numbers: 0-indexed page numbers to extract
958
- request_id: Request ID for logging
959
-
960
- Returns:
961
- (mini_pdf_path, page_map) where page_map maps Docling 1-indexed
962
- page numbers in the mini-PDF back to 0-indexed original page numbers.
963
- """
964
- from pypdf import PdfReader, PdfWriter
965
-
966
- reader = PdfReader(str(input_path))
967
- writer = PdfWriter()
968
-
969
- # page_map: {docling_page_no (1-indexed in mini-PDF) β†’ original_page_no (0-indexed)}
970
- page_map: dict[int, int] = {}
971
-
972
- for idx, orig_page in enumerate(sorted(page_numbers)):
973
- if orig_page < len(reader.pages):
974
- writer.add_page(reader.pages[orig_page])
975
- page_map[idx + 1] = orig_page # Docling uses 1-indexed pages
976
- else:
977
- logger.warning(
978
- f"[{request_id}] Page {orig_page} out of range (total: {len(reader.pages)})"
979
- )
980
-
981
- mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
982
- with open(mini_pdf_path, "wb") as f:
983
- writer.write(f)
984
-
985
- logger.info(
986
- f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original"
987
- )
988
- return mini_pdf_path, page_map
989
-
990
-
991
- # ---------------------------------------------------------------------------
992
- # Table Extraction from Docling
993
- # ---------------------------------------------------------------------------
994
-
995
-
996
- def _extract_table_markdowns(doc, page_map: dict[int, int]) -> dict[int, list[str]]:
997
- """Extract table markdown from Docling document, keyed by ORIGINAL page number.
998
-
999
- Uses page_map to translate from Docling's 1-indexed mini-PDF pages
1000
- back to the original 0-indexed page numbers.
1001
- """
1002
- tables_by_page: dict[int, list[str]] = {}
1003
- for element, _ in doc.iterate_items():
1004
- if isinstance(element, TableItem):
1005
- docling_page = element.prov[0].page_no if element.prov else -1
1006
- # Translate mini-PDF page β†’ original page
1007
- orig_page = page_map.get(docling_page, docling_page - 1)
1008
- table_md = element.export_to_markdown(doc=doc)
1009
- if orig_page not in tables_by_page:
1010
- tables_by_page[orig_page] = []
1011
- tables_by_page[orig_page].append(table_md)
1012
- return tables_by_page
1013
-
1014
-
1015
- def _extract_docling_page_markdown(doc, page_map: dict[int, int]) -> dict[int, str]:
1016
- """Extract complete per-page markdown from Docling document.
1017
-
1018
- Returns dict mapping ORIGINAL page numbers (0-indexed) to complete markdown
1019
- content including text, headings, and tables as Docling understands them.
1020
- This is used as the PRIMARY output for table pages, replacing the VLM text
1021
- entirely for better table structure.
1022
- """
1023
- pages: dict[int, list[str]] = {}
1024
-
1025
- for element, _ in doc.iterate_items():
1026
- if not element.prov:
1027
- continue
1028
- docling_page = element.prov[0].page_no
1029
- orig_page = page_map.get(docling_page, docling_page - 1)
1030
-
1031
- md = element.export_to_markdown(doc=doc)
1032
- if md and md.strip():
1033
- if orig_page not in pages:
1034
- pages[orig_page] = []
1035
- pages[orig_page].append(md)
1036
-
1037
- return {pg: "\n\n".join(parts) for pg, parts in pages.items()}
1038
-
1039
-
1040
- # ---------------------------------------------------------------------------
1041
- # Merge: VLM Text + TableFormer Tables
1042
- # ---------------------------------------------------------------------------
1043
-
1044
- # Consecutive lines with | delimiters (markdown tables)
1045
- _VLM_TABLE_BLOCK = re.compile(r"((?:^\|[^\n]+\|$\n?)+)", re.MULTILINE)
1046
-
1047
- # LaTeX table blocks
1048
- _VLM_LATEX_BLOCK = re.compile(
1049
- r"(\\begin\{tabular\}.*?\\end\{tabular\})", re.DOTALL
1050
  )
1051
 
1052
 
1053
- def _extract_table_blocks(text: str) -> list[str]:
1054
- """Extract individual table blocks from markdown text.
1055
-
1056
- Returns a list of table block strings (header + separator + data rows).
1057
- """
1058
- tables: list[str] = []
1059
- md_matches = list(_VLM_TABLE_BLOCK.finditer(text))
1060
- latex_matches = list(_VLM_LATEX_BLOCK.finditer(text))
1061
-
1062
- # Combine and deduplicate by position
1063
- all_matches = [(m.start(), m.end(), m.group(0)) for m in md_matches]
1064
- all_matches += [(m.start(), m.end(), m.group(0)) for m in latex_matches]
1065
- all_matches.sort(key=lambda x: x[0])
1066
-
1067
- last_end = -1
1068
- for start, end, content in all_matches:
1069
- if start >= last_end:
1070
- tables.append(content.strip())
1071
- last_end = end
1072
-
1073
- return tables
1074
-
1075
-
1076
- def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list[str]) -> str:
1077
- """Replace VLM's table sections with more accurate tables.
1078
-
1079
- Handles both markdown pipe tables and LaTeX tabular blocks in VLM output.
1080
- Used for both TableFormer tables (Pass 2) and re-prompted VLM tables (Pass 1.5).
1081
- """
1082
- if not table_markdowns:
1083
- return vlm_text
1084
-
1085
- # Find all table blocks (markdown first, then LaTeX)
1086
- md_tables = list(_VLM_TABLE_BLOCK.finditer(vlm_text))
1087
- latex_tables = list(_VLM_LATEX_BLOCK.finditer(vlm_text))
1088
-
1089
- # Combine and sort all table positions
1090
- all_tables = [(m.start(), m.end(), "md") for m in md_tables]
1091
- all_tables += [(m.start(), m.end(), "latex") for m in latex_tables]
1092
- all_tables.sort(key=lambda x: x[0])
1093
-
1094
- # Remove overlapping matches (prefer earlier match)
1095
- filtered: list[tuple[int, int, str]] = []
1096
- last_end = -1
1097
- for start, end, kind in all_tables:
1098
- if start >= last_end:
1099
- filtered.append((start, end, kind))
1100
- last_end = end
1101
-
1102
- vlm_table_count = len(filtered)
1103
- tf_table_count = len(table_markdowns)
1104
-
1105
- if vlm_table_count != tf_table_count:
1106
- logger.warning(
1107
- f"Table count mismatch: VLM={vlm_table_count}, TableFormer={tf_table_count}. "
1108
- f"Using positional replacement for min({vlm_table_count}, {tf_table_count}) tables."
1109
- )
1110
-
1111
- # Replace VLM tables with TableFormer tables (positional)
1112
- result_parts: list[str] = []
1113
- prev_end = 0
1114
- table_idx = 0
1115
-
1116
- for start, end, kind in filtered:
1117
- result_parts.append(vlm_text[prev_end:start])
1118
- if table_idx < tf_table_count:
1119
- result_parts.append(table_markdowns[table_idx].strip() + "\n")
1120
- table_idx += 1
1121
- else:
1122
- # More VLM tables than TableFormer β€” keep VLM version
1123
- result_parts.append(vlm_text[start:end])
1124
- prev_end = end
1125
-
1126
- result_parts.append(vlm_text[prev_end:])
1127
-
1128
- # If there are remaining TableFormer tables not matched, append them
1129
- while table_idx < tf_table_count:
1130
- result_parts.append("\n\n" + table_markdowns[table_idx].strip() + "\n")
1131
- table_idx += 1
1132
-
1133
- return "".join(result_parts)
1134
-
1135
-
1136
- # ---------------------------------------------------------------------------
1137
- # PDF to Page Images (parallel, optimized)
1138
- # ---------------------------------------------------------------------------
1139
-
1140
-
1141
- def _render_single_page(
1142
- input_path: Path, page_idx: int, dpi: int
1143
- ) -> tuple[int, Optional[bytes]]:
1144
- """Render a single PDF page to PNG bytes with CLAHE preprocessing.
1145
-
1146
- Returns (page_idx, png_bytes) or (page_idx, None) on failure.
1147
- """
1148
- try:
1149
- images = convert_from_path(
1150
- str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
1151
- )
1152
- if not images:
1153
- return page_idx, None
1154
-
1155
- img = images[0]
1156
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
1157
- tmp_path = tmp.name
1158
- img.save(tmp_path, format="PNG")
1159
-
1160
- try:
1161
- _preprocess_image_for_ocr(tmp_path)
1162
- with open(tmp_path, "rb") as f:
1163
- return page_idx, f.read()
1164
- finally:
1165
- os.unlink(tmp_path)
1166
- except Exception as e:
1167
- logger.warning(f"Failed to render page {page_idx + 1}: {e}")
1168
- return page_idx, None
1169
-
1170
-
1171
- def _pdf_to_page_images(
1172
- input_path: Path,
1173
- request_id: str,
1174
- start_page: int = 0,
1175
- end_page: Optional[int] = None,
1176
- ) -> list[tuple[int, bytes]]:
1177
- """Convert PDF pages to PNG image bytes using parallel rendering.
1178
-
1179
- Uses ThreadPoolExecutor for concurrent page rendering.
1180
- Returns list of (page_no, png_bytes) tuples, sorted by page number.
1181
- """
1182
- try:
1183
- from pdf2image.pdf2image import pdfinfo_from_path
1184
-
1185
- info = pdfinfo_from_path(str(input_path))
1186
- total_pages = info["Pages"]
1187
- last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
1188
- except Exception as e:
1189
- logger.warning(f"[{request_id}] Could not get PDF info: {e}")
1190
- return []
1191
-
1192
- page_indices = list(range(start_page, last_page))
1193
-
1194
- start_time = time.time()
1195
- page_images: list[tuple[int, bytes]] = []
1196
-
1197
- # Render pages in parallel (4 threads β€” I/O bound, not CPU bound for poppler)
1198
- with ThreadPoolExecutor(max_workers=4) as executor:
1199
- futures = {
1200
- executor.submit(_render_single_page, input_path, idx, RENDER_DPI): idx
1201
- for idx in page_indices
1202
- }
1203
- for future in as_completed(futures):
1204
- page_idx, png_bytes = future.result()
1205
- if png_bytes is not None:
1206
- page_images.append((page_idx, png_bytes))
1207
-
1208
- page_images.sort(key=lambda x: x[0])
1209
- render_time = time.time() - start_time
1210
- logger.info(
1211
- f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
1212
- f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
1213
- )
1214
- return page_images
1215
-
1216
-
1217
- # ---------------------------------------------------------------------------
1218
- # Docling Converter (for TableFormer only)
1219
- # ---------------------------------------------------------------------------
1220
-
1221
-
1222
- def _create_converter(images_scale: float = 2.0) -> DocumentConverter:
1223
- """Create a Docling converter with Standard Pipeline.
1224
-
1225
- Used ONLY for TableFormer on table pages (not for full document OCR).
1226
- """
1227
- device = _get_device()
1228
- logger.info(f"Creating converter with device: {device}")
1229
-
1230
- pipeline_options = PdfPipelineOptions()
1231
- pipeline_options.do_ocr = True
1232
- pipeline_options.do_table_structure = True
1233
- pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
1234
- pipeline_options.table_structure_options.do_cell_matching = True
1235
-
1236
- pipeline_options.ocr_options = RapidOcrOptions()
1237
- pipeline_options.ocr_options.force_full_page_ocr = True
1238
-
1239
- pipeline_options.generate_page_images = True
1240
- pipeline_options.images_scale = images_scale
1241
- pipeline_options.generate_picture_images = True
1242
-
1243
- pipeline_options.accelerator_options = AcceleratorOptions(
1244
- device=device,
1245
- num_threads=0 if device == "cuda" else 4,
1246
- )
1247
-
1248
- converter = DocumentConverter(
1249
- format_options={
1250
- InputFormat.PDF: PdfFormatOption(
1251
- pipeline_options=pipeline_options,
1252
- backend=DoclingParseV4DocumentBackend,
1253
- )
1254
- }
1255
- )
1256
- return converter
1257
-
1258
-
1259
- def _get_converter() -> DocumentConverter:
1260
- """Get or create the global converter instance."""
1261
- global _converter
1262
- if _converter is None:
1263
- _converter = _create_converter(images_scale=IMAGES_SCALE)
1264
- return _converter
1265
-
1266
-
1267
- # ---------------------------------------------------------------------------
1268
- # VLM-First Conversion (Pass 1: VLM, Pass 2: TableFormer, Merge)
1269
- # ---------------------------------------------------------------------------
1270
-
1271
-
1272
- def _convert_document(
1273
- input_path: Path,
1274
- output_dir: Path,
1275
- images_scale: float,
1276
- include_images: bool,
1277
- request_id: str,
1278
- start_page: int = 0,
1279
- end_page: Optional[int] = None,
1280
- ) -> tuple:
1281
- """
1282
- VLM-first hybrid conversion.
1283
-
1284
- Pass 1 (GPU): VLM OCR on ALL pages (fast, concurrent)
1285
- Detect: Find table pages from VLM markdown output
1286
- Pass 2 (CPU): Docling TableFormer ONLY on table pages (mini-PDF)
1287
- Merge: VLM text for all pages + TableFormer tables
1288
-
1289
- Returns: (markdown_content, json_content, pages_processed, image_count)
1290
- """
1291
- overall_start = time.time()
1292
-
1293
- # ---- RENDER ALL PAGES ----
1294
- page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
1295
-
1296
- if not page_images:
1297
- logger.warning(f"[{request_id}] No page images β€” falling back to full Docling pipeline")
1298
- return _convert_document_full_docling(
1299
- input_path, output_dir, images_scale, include_images, request_id
1300
- )
1301
-
1302
- render_time = time.time() - overall_start
1303
-
1304
- # ---- PASS 1: VLM OCR ALL PAGES (GPU, concurrent) ----
1305
- logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
1306
- logger.info(f"[{request_id}] Sending {len(page_images)} pages to VLM ({VLM_CONCURRENCY} concurrent)")
1307
-
1308
- vlm_page_texts: dict[int, Optional[str]] = {}
1309
- vlm_start = time.time()
1310
-
1311
- with ThreadPoolExecutor(max_workers=VLM_CONCURRENCY) as executor:
1312
- future_to_page = {
1313
- executor.submit(_vlm_ocr_page, page_bytes, request_id, page_no + 1): page_no
1314
- for page_no, page_bytes in page_images
1315
- }
1316
- for future in as_completed(future_to_page):
1317
- page_no = future_to_page[future]
1318
- try:
1319
- vlm_text = future.result()
1320
- vlm_page_texts[page_no] = vlm_text
1321
- logger.info(
1322
- f"[{request_id}] VLM processed page {page_no + 1} ({len(vlm_text)} chars)"
1323
- )
1324
- except Exception as e:
1325
- logger.warning(f"[{request_id}] VLM failed on page {page_no + 1}: {e}")
1326
- vlm_page_texts[page_no] = None
1327
-
1328
- vlm_time = time.time() - vlm_start
1329
- logger.info(f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)")
1330
-
1331
- # ---- DETECT TABLE PAGES ----
1332
- table_pages = _detect_table_pages(vlm_page_texts)
1333
-
1334
- if table_pages:
1335
- logger.info(
1336
- f"[{request_id}] Tables detected on {len(table_pages)} pages: "
1337
- f"{sorted(p + 1 for p in table_pages)}"
1338
- )
1339
- else:
1340
- logger.info(f"[{request_id}] No tables detected β€” skipping table re-prompting")
1341
-
1342
- # ---- PASS 2: GEMINI 2.5 FLASH ON TABLE PAGES ----
1343
- gemini_page_texts: dict[int, str] = {}
1344
- gemini_time = 0.0
1345
-
1346
- if table_pages and GEMINI_API_KEY:
1347
- logger.info(
1348
- f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(table_pages)} table pages"
1349
- )
1350
- gemini_start = time.time()
1351
-
1352
- # Build lookup: page_no β†’ image bytes
1353
- page_image_map = {pno: pbytes for pno, pbytes in page_images}
1354
-
1355
- with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
1356
- future_to_page = {
1357
- executor.submit(
1358
- _gemini_extract_page,
1359
- page_image_map[page_no],
1360
- request_id,
1361
- page_no,
1362
- ): page_no
1363
- for page_no in sorted(table_pages)
1364
- if page_no in page_image_map
1365
- }
1366
- for future in as_completed(future_to_page):
1367
- page_no = future_to_page[future]
1368
- try:
1369
- gemini_text = future.result()
1370
- if gemini_text:
1371
- gemini_page_texts[page_no] = gemini_text
1372
- logger.info(
1373
- f"[{request_id}] Gemini processed page {page_no + 1} "
1374
- f"({len(gemini_text)} chars)"
1375
- )
1376
- else:
1377
- logger.warning(
1378
- f"[{request_id}] Gemini returned empty for page {page_no + 1} "
1379
- f"β€” falling back to VLM"
1380
- )
1381
- except Exception as e:
1382
- logger.warning(
1383
- f"[{request_id}] Gemini failed on page {page_no + 1}: {e} "
1384
- f"β€” falling back to VLM"
1385
- )
1386
-
1387
- gemini_time = time.time() - gemini_start
1388
- logger.info(
1389
- f"[{request_id}] Pass 2 completed in {gemini_time:.2f}s β€” "
1390
- f"{len(gemini_page_texts)}/{len(table_pages)} table pages extracted via Gemini"
1391
- )
1392
- elif table_pages and not GEMINI_API_KEY:
1393
- logger.warning(
1394
- f"[{request_id}] GEMINI_API_KEY not set β€” table pages will use VLM output only"
1395
- )
1396
-
1397
- # ---- MERGE: VLM TEXT (non-table pages) + GEMINI (table pages) ----
1398
- md_parts: list[str] = []
1399
- image_count = 0
1400
-
1401
- for page_no in sorted(vlm_page_texts.keys()):
1402
- if md_parts:
1403
- md_parts.append("\n\n")
1404
-
1405
- if page_no in gemini_page_texts:
1406
- # Table page β€” use Gemini's superior output
1407
- md_parts.append(gemini_page_texts[page_no])
1408
- elif vlm_page_texts[page_no] is not None:
1409
- # Non-table page or Gemini fallback β€” use VLM output
1410
- md_parts.append(vlm_page_texts[page_no])
1411
- else:
1412
- md_parts.append(f"[Page {page_no + 1}: extraction failed]\n\n")
1413
-
1414
- markdown_content = "".join(md_parts)
1415
-
1416
- # Post-process: fix cross-page artifacts, deduplicate headers, clean tables
1417
- if len(vlm_page_texts) > 1:
1418
- markdown_content = _post_process_merged_markdown(markdown_content)
1419
-
1420
- pages_processed = len(vlm_page_texts)
1421
- total_time = time.time() - overall_start
1422
-
1423
- logger.info(
1424
- f"[{request_id}] VLM+Gemini conversion complete: {pages_processed} pages β€” "
1425
- f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
1426
- f"Gemini {gemini_time:.1f}s = {total_time:.2f}s total"
1427
- )
1428
- if pages_processed > 0:
1429
- logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
1430
-
1431
- return markdown_content, None, pages_processed, image_count
1432
-
1433
-
1434
- def _convert_document_full_docling(
1435
- input_path: Path,
1436
- output_dir: Path,
1437
- images_scale: float,
1438
- include_images: bool,
1439
- request_id: str,
1440
- ) -> tuple:
1441
- """Fallback: full Docling pipeline when page images are unavailable."""
1442
- logger.info(f"[{request_id}] Fallback: running full Docling pipeline")
1443
- converter = _get_converter()
1444
-
1445
- start_time = time.time()
1446
- result = converter.convert(input_path)
1447
- doc = result.document
1448
- if doc is None:
1449
- raise ValueError("Docling failed to parse document")
1450
-
1451
- elapsed = time.time() - start_time
1452
- logger.info(f"[{request_id}] Full Docling pipeline completed in {elapsed:.2f}s")
1453
-
1454
- markdown_content = doc.export_to_markdown()
1455
- pages_processed = len(
1456
- set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
1457
- )
1458
-
1459
- image_count = 0
1460
- if include_images:
1461
- image_dir = output_dir / "images"
1462
- image_dir.mkdir(parents=True, exist_ok=True)
1463
- for element, _ in doc.iterate_items():
1464
- if isinstance(element, PictureItem):
1465
- if element.image and element.image.pil_image:
1466
- pg = element.prov[0].page_no if element.prov else 0
1467
- image_id = element.self_ref.split("/")[-1]
1468
- image_name = f"page_{pg}_{image_id}.png"
1469
- image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
1470
- image_path = image_dir / image_name
1471
- try:
1472
- element.image.pil_image.save(image_path, format="PNG")
1473
- image_count += 1
1474
- except Exception:
1475
- pass
1476
-
1477
- return markdown_content, None, pages_processed, image_count
1478
-
1479
-
1480
- # ---------------------------------------------------------------------------
1481
- # Images Zip Helper
1482
- # ---------------------------------------------------------------------------
1483
-
1484
-
1485
- def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
1486
- """Create a zip file from extracted images."""
1487
- image_dir = output_dir / "images"
1488
- if not image_dir.exists():
1489
- return None, 0
1490
-
1491
- image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
1492
- zip_buffer = io.BytesIO()
1493
- image_count = 0
1494
-
1495
- with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
1496
- for img_path in image_dir.glob("*"):
1497
- if img_path.is_file() and img_path.suffix.lower() in image_extensions:
1498
- try:
1499
- zf.write(img_path, f"images/{img_path.name}")
1500
- image_count += 1
1501
- except Exception as e:
1502
- logger.warning(f"Failed to add image {img_path} to zip: {e}")
1503
-
1504
- if image_count == 0:
1505
- return None, 0
1506
-
1507
- return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
1508
-
1509
-
1510
  # ---------------------------------------------------------------------------
1511
  # Application Lifespan
1512
  # ---------------------------------------------------------------------------
@@ -1514,23 +56,13 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
1514
 
1515
  @asynccontextmanager
1516
  async def lifespan(app: FastAPI):
1517
- """Startup: initialize Docling converter and check vLLM."""
1518
  logger.info("=" * 60)
1519
- logger.info("Starting Docling VLM Parser API v4.0.0...")
1520
-
1521
- device = _get_device()
1522
- logger.info(f"Device: {device}")
1523
-
1524
- if device == "cuda":
1525
- logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
1526
- logger.info(f"CUDA Version: {torch.version.cuda}")
1527
- logger.info(
1528
- f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB"
1529
- )
1530
 
1531
- logger.info(f"VLM Model: {VLM_MODEL}")
1532
- logger.info(f"VLM Endpoint: http://{VLM_HOST}:{VLM_PORT}")
1533
- logger.info(f"VLM Timeout: {VLM_TIMEOUT}s, Concurrency: {VLM_CONCURRENCY}")
1534
  logger.info(f"Render DPI: {RENDER_DPI}")
1535
  logger.info(f"Images scale: {IMAGES_SCALE}")
1536
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
@@ -1538,27 +70,8 @@ async def lifespan(app: FastAPI):
1538
  logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
1539
  logger.info(f"Gemini Timeout: {GEMINI_TIMEOUT}s, Concurrency: {GEMINI_CONCURRENCY}")
1540
 
1541
- # Verify vLLM is running
1542
- logger.info("Checking vLLM server...")
1543
- try:
1544
- async with httpx.AsyncClient(timeout=10) as client:
1545
- resp = await client.get(f"http://{VLM_HOST}:{VLM_PORT}/health")
1546
- resp.raise_for_status()
1547
- logger.info("vLLM server is healthy")
1548
- except Exception as e:
1549
- logger.error(f"vLLM server not available: {e}")
1550
- raise RuntimeError(f"vLLM server not available at {VLM_HOST}:{VLM_PORT}")
1551
-
1552
- # Pre-initialize Docling converter
1553
- logger.info("Pre-loading Docling models (DocLayNet + TableFormer + RapidOCR)...")
1554
- try:
1555
- _get_converter()
1556
- logger.info("Docling models loaded successfully")
1557
- except Exception as e:
1558
- logger.warning(f"Failed to pre-load Docling models: {e}")
1559
-
1560
  logger.info("=" * 60)
1561
- logger.info("Docling VLM Parser API ready (VLM + Gemini hybrid: Qwen3-VL + Gemini tables)")
1562
  logger.info("=" * 60)
1563
  yield
1564
  logger.info("Shutting down Docling VLM Parser API...")
@@ -1570,8 +83,8 @@ async def lifespan(app: FastAPI):
1570
 
1571
  app = FastAPI(
1572
  title="Docling VLM Parser API",
1573
- description="VLM + Gemini hybrid parser: Qwen3-VL text + Gemini 3 Flash tables",
1574
- version="4.0.0",
1575
  lifespan=lifespan,
1576
  )
1577
 
@@ -1584,23 +97,11 @@ app = FastAPI(
1584
  @app.get("/", response_model=HealthResponse)
1585
  async def health_check() -> HealthResponse:
1586
  """Health check endpoint."""
1587
- device = _get_device()
1588
-
1589
- vlm_status = "unknown"
1590
- try:
1591
- async with httpx.AsyncClient(timeout=5) as client:
1592
- resp = await client.get(f"http://{VLM_HOST}:{VLM_PORT}/health")
1593
- vlm_status = "healthy" if resp.status_code == 200 else "unhealthy"
1594
- except Exception:
1595
- vlm_status = "unreachable"
1596
-
1597
  return HealthResponse(
1598
  status="healthy",
1599
- version="4.0.0",
1600
- device=device,
1601
- gpu_name=None,
1602
- vlm_model=f"active (gemini: {'configured' if GEMINI_API_KEY else 'not set'})",
1603
- vlm_status=vlm_status,
1604
  images_scale=IMAGES_SCALE,
1605
  )
1606
 
@@ -1615,7 +116,7 @@ async def parse_document(
1615
  include_images: bool = Form(default=False, description="Include extracted images"),
1616
  _token: str = Depends(verify_token),
1617
  ) -> ParseResponse:
1618
- """Parse a document file using VLM-first hybrid pipeline."""
1619
  request_id = str(uuid4())[:8]
1620
  start_time = time.time()
1621
 
@@ -1654,9 +155,7 @@ async def parse_document(
1654
  detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
1655
  )
1656
 
1657
- use_images_scale = images_scale if images_scale is not None else IMAGES_SCALE
1658
-
1659
- logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1660
  logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
1661
 
1662
  temp_dir = tempfile.mkdtemp()
@@ -1672,7 +171,6 @@ async def parse_document(
1672
  _convert_document,
1673
  input_path,
1674
  output_dir,
1675
- use_images_scale,
1676
  include_images,
1677
  request_id,
1678
  start_page,
@@ -1699,8 +197,8 @@ async def parse_document(
1699
  images_zip=images_zip,
1700
  image_count=image_count,
1701
  pages_processed=pages_processed,
1702
- device_used=_get_device(),
1703
- vlm_model=VLM_MODEL,
1704
  )
1705
 
1706
  except Exception as e:
@@ -1722,7 +220,7 @@ async def parse_document_from_url(
1722
  request: URLParseRequest,
1723
  _token: str = Depends(verify_token),
1724
  ) -> ParseResponse:
1725
- """Parse a document from a URL using VLM-first hybrid pipeline."""
1726
  request_id = str(uuid4())[:8]
1727
  start_time = time.time()
1728
 
@@ -1782,9 +280,7 @@ async def parse_document_from_url(
1782
  output_dir = Path(temp_dir) / "output"
1783
  output_dir.mkdir(exist_ok=True)
1784
 
1785
- use_images_scale = request.images_scale if request.images_scale is not None else IMAGES_SCALE
1786
-
1787
- logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1788
  logger.info(
1789
  f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
1790
  )
@@ -1793,7 +289,6 @@ async def parse_document_from_url(
1793
  _convert_document,
1794
  input_path,
1795
  output_dir,
1796
- use_images_scale,
1797
  request.include_images,
1798
  request_id,
1799
  request.start_page,
@@ -1820,8 +315,8 @@ async def parse_document_from_url(
1820
  images_zip=images_zip,
1821
  image_count=image_count,
1822
  pages_processed=pages_processed,
1823
- device_used=_get_device(),
1824
- vlm_model=VLM_MODEL,
1825
  )
1826
 
1827
  except httpx.HTTPError as e:
 
1
  """
2
+ Docling VLM Parser API v5.0.0
3
+
4
+ A FastAPI service using a PaddleOCR-VL-1.5 + Gemini hybrid architecture for document parsing:
5
+ Pass 1 (GPU): PaddleOCR-VL-1.5 on full PDF (native document parsing, 0.9B params)
6
+ Pass 2 (API): Gemini 3 Flash on table pages only (highest quality tables)
7
+ Post: Cross-page artifact removal, table cleanup, deduplication, footer removal
8
+
9
+ v5.0.0 β€” PaddleOCR-VL-1.5 + Gemini hybrid:
10
+ - Core: PaddleOCR-VL-1.5 replaces Qwen3-VL + Docling entirely
11
+ - Quality: Gemini 3 Flash used ONLY for pages with tables (better table accuracy)
12
+ - Speed: PaddleOCR handles PDF natively β€” no separate image rendering for OCR
13
+ - GPU: Runs on T4 (16GB VRAM) β€” much smaller than A100 requirement
14
+ - Quality: Enhanced post-processing β€” aggressive footer/artifact removal
 
 
15
  """
16
 
17
  import asyncio
 
 
 
 
 
18
  import re
 
19
  import shutil
 
20
  import tempfile
21
  import time
 
 
22
  from contextlib import asynccontextmanager
23
  from pathlib import Path
24
+ from typing import Optional
 
25
  from uuid import uuid4
26
 
 
27
  import httpx
 
28
  from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ from auth import _validate_url, verify_token
31
+ from config import (
32
+ GEMINI_API_KEY,
33
+ GEMINI_CONCURRENCY,
34
+ GEMINI_MODEL,
35
+ GEMINI_TIMEOUT,
36
+ IMAGES_SCALE,
37
+ MAX_FILE_SIZE_BYTES,
38
+ MAX_FILE_SIZE_MB,
39
+ RENDER_DPI,
40
+ logger,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  )
42
+ from models import HealthResponse, ParseResponse, URLParseRequest
43
+ from pipeline import (
44
+ _convert_document,
45
+ _create_images_zip,
46
+ _get_pipeline,
47
+ _save_downloaded_content,
48
+ _save_uploaded_file,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  )
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # ---------------------------------------------------------------------------
53
  # Application Lifespan
54
  # ---------------------------------------------------------------------------
 
56
 
57
  @asynccontextmanager
58
  async def lifespan(app: FastAPI):
59
+ """Startup: initialize PaddleOCR-VL-1.5 pipeline."""
60
  logger.info("=" * 60)
61
+ logger.info("Starting Docling VLM Parser API v5.0.0...")
62
+ logger.info("Initializing PaddleOCR-VL-1.5 pipeline...")
63
+ _get_pipeline()
64
+ logger.info("PaddleOCR-VL-1.5 ready")
 
 
 
 
 
 
 
65
 
 
 
 
66
  logger.info(f"Render DPI: {RENDER_DPI}")
67
  logger.info(f"Images scale: {IMAGES_SCALE}")
68
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
 
70
  logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
71
  logger.info(f"Gemini Timeout: {GEMINI_TIMEOUT}s, Concurrency: {GEMINI_CONCURRENCY}")
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  logger.info("=" * 60)
74
+ logger.info("Docling VLM Parser API ready (PaddleOCR-VL-1.5 + Gemini hybrid)")
75
  logger.info("=" * 60)
76
  yield
77
  logger.info("Shutting down Docling VLM Parser API...")
 
83
 
84
  app = FastAPI(
85
  title="Docling VLM Parser API",
86
+ description="PaddleOCR-VL-1.5 + Gemini 3 Flash hybrid parser",
87
+ version="5.0.0",
88
  lifespan=lifespan,
89
  )
90
 
 
97
  @app.get("/", response_model=HealthResponse)
98
  async def health_check() -> HealthResponse:
99
  """Health check endpoint."""
 
 
 
 
 
 
 
 
 
 
100
  return HealthResponse(
101
  status="healthy",
102
+ version="5.0.0",
103
+ model="PaddleOCR-VL-1.5",
104
+ gemini_status="configured" if GEMINI_API_KEY else "not set",
 
 
105
  images_scale=IMAGES_SCALE,
106
  )
107
 
 
116
  include_images: bool = Form(default=False, description="Include extracted images"),
117
  _token: str = Depends(verify_token),
118
  ) -> ParseResponse:
119
+ """Parse a document file using PaddleOCR-VL-1.5 + Gemini hybrid pipeline."""
120
  request_id = str(uuid4())[:8]
121
  start_time = time.time()
122
 
 
155
  detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
156
  )
157
 
158
+ logger.info(f"[{request_id}] Model: PaddleOCR-VL-1.5")
 
 
159
  logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
160
 
161
  temp_dir = tempfile.mkdtemp()
 
171
  _convert_document,
172
  input_path,
173
  output_dir,
 
174
  include_images,
175
  request_id,
176
  start_page,
 
197
  images_zip=images_zip,
198
  image_count=image_count,
199
  pages_processed=pages_processed,
200
+ device_used="gpu",
201
+ vlm_model="PaddleOCR-VL-1.5",
202
  )
203
 
204
  except Exception as e:
 
220
  request: URLParseRequest,
221
  _token: str = Depends(verify_token),
222
  ) -> ParseResponse:
223
+ """Parse a document from a URL using PaddleOCR-VL-1.5 + Gemini hybrid pipeline."""
224
  request_id = str(uuid4())[:8]
225
  start_time = time.time()
226
 
 
280
  output_dir = Path(temp_dir) / "output"
281
  output_dir.mkdir(exist_ok=True)
282
 
283
+ logger.info(f"[{request_id}] Model: PaddleOCR-VL-1.5")
 
 
284
  logger.info(
285
  f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
286
  )
 
289
  _convert_document,
290
  input_path,
291
  output_dir,
 
292
  request.include_images,
293
  request_id,
294
  request.start_page,
 
315
  images_zip=images_zip,
316
  image_count=image_count,
317
  pages_processed=pages_processed,
318
+ device_used="gpu",
319
+ vlm_model="PaddleOCR-VL-1.5",
320
  )
321
 
322
  except httpx.HTTPError as e:
auth.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bearer token authentication and URL validation (SSRF protection)."""
2
+
3
+ import ipaddress
4
+ import secrets
5
+ import socket
6
+ from urllib.parse import urlparse
7
+
8
+ from fastapi import Depends, HTTPException
9
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
10
+
11
+ from config import API_TOKEN, BLOCKED_HOSTNAMES
12
+
13
+ security = HTTPBearer()
14
+
15
+
16
+ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> str:
17
+ """Verify the API token from Authorization header."""
18
+ if not API_TOKEN:
19
+ raise HTTPException(
20
+ status_code=500,
21
+ detail="No API token configured on server",
22
+ )
23
+
24
+ token = credentials.credentials
25
+ if not secrets.compare_digest(token, API_TOKEN):
26
+ raise HTTPException(
27
+ status_code=401,
28
+ detail="Invalid API token",
29
+ )
30
+ return token
31
+
32
+
33
+ def _validate_url(url: str) -> None:
34
+ """Validate URL to prevent SSRF attacks."""
35
+ try:
36
+ parsed = urlparse(url)
37
+ except Exception as e:
38
+ raise HTTPException(
39
+ status_code=400,
40
+ detail=f"Invalid URL format: {str(e)}",
41
+ )
42
+
43
+ if parsed.scheme not in ("http", "https"):
44
+ raise HTTPException(
45
+ status_code=400,
46
+ detail=f"Invalid URL scheme '{parsed.scheme}'. Only http and https are allowed.",
47
+ )
48
+
49
+ hostname = parsed.hostname
50
+ if not hostname:
51
+ raise HTTPException(
52
+ status_code=400,
53
+ detail="Invalid URL: missing hostname.",
54
+ )
55
+
56
+ hostname_lower = hostname.lower()
57
+ if hostname_lower in BLOCKED_HOSTNAMES:
58
+ raise HTTPException(
59
+ status_code=400,
60
+ detail="Access to internal/metadata services is not allowed.",
61
+ )
62
+
63
+ blocked_patterns = ["metadata", "internal", "localhost", "127.0.0.1", "::1"]
64
+ for pattern in blocked_patterns:
65
+ if pattern in hostname_lower:
66
+ raise HTTPException(
67
+ status_code=400,
68
+ detail="Access to internal/metadata services is not allowed.",
69
+ )
70
+
71
+ try:
72
+ ip_str = socket.gethostbyname(hostname)
73
+ ip = ipaddress.ip_address(ip_str)
74
+ except socket.gaierror:
75
+ raise HTTPException(
76
+ status_code=400,
77
+ detail=f"Could not resolve hostname: {hostname}",
78
+ )
79
+ except ValueError as e:
80
+ raise HTTPException(
81
+ status_code=400,
82
+ detail=f"Invalid IP address resolved: {str(e)}",
83
+ )
84
+
85
+ if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_multicast:
86
+ raise HTTPException(
87
+ status_code=400,
88
+ detail="Access to private/internal IP addresses is not allowed.",
89
+ )
config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration, environment variables, and logging setup for the Docling VLM Parser."""
2
+
3
+ import logging
4
+ import os
5
+
6
+ # Configure logging
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format="%(asctime)s | %(levelname)-8s | %(message)s",
10
+ datefmt="%Y-%m-%d %H:%M:%S",
11
+ )
12
+ logger = logging.getLogger("docling-parser")
13
+
14
+ # Security
15
+ API_TOKEN = os.getenv("API_TOKEN")
16
+
17
+ # Configuration
18
+ IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
19
+ MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
20
+ MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
21
+ RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
22
+
23
+ # Gemini API Configuration (table page enhancement)
24
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
25
+ GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
26
+ GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120"))
27
+ GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "8"))
28
+
29
+ # Blocked hostnames for SSRF protection
30
+ BLOCKED_HOSTNAMES = {
31
+ "localhost",
32
+ "metadata",
33
+ "metadata.google.internal",
34
+ "metadata.google",
35
+ "169.254.169.254",
36
+ "fd00:ec2::254",
37
+ }
gemini.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gemini API extraction function for table page enhancement."""
2
+
3
+ import base64
4
+ import re
5
+ import time
6
+ from typing import Optional
7
+
8
+ import httpx
9
+
10
+ from config import GEMINI_API_KEY, GEMINI_MODEL, GEMINI_TIMEOUT, logger
11
+
12
+ # Strip code fence wrappers (Gemini sometimes wraps output)
13
+ _CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
14
+ _CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
15
+
16
+
17
+ def _gemini_extract_page(
18
+ page_image_bytes: bytes, request_id: str = "", page_no: int = 0
19
+ ) -> Optional[str]:
20
+ """Send a page image to Gemini 2.5 Flash for high-quality extraction.
21
+
22
+ Used for table pages where PaddleOCR output is insufficient.
23
+ Returns the full page markdown (text + tables), or None on failure.
24
+ """
25
+ if not GEMINI_API_KEY:
26
+ logger.warning(f"[{request_id}] GEMINI_API_KEY not set β€” skipping Gemini extraction")
27
+ return None
28
+
29
+ b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
30
+
31
+ payload = {
32
+ "contents": [
33
+ {
34
+ "parts": [
35
+ {
36
+ "inline_data": {
37
+ "mime_type": "image/png",
38
+ "data": b64_image,
39
+ }
40
+ },
41
+ {
42
+ "text": (
43
+ "Convert this document page to clean markdown format.\n\n"
44
+ "Rules:\n"
45
+ "- Extract ALL text content exactly as written β€” do not paraphrase or summarize\n"
46
+ "- Use ## for main section headings and ### for subsection headings\n"
47
+ "- Preserve lists, paragraphs, bullet points, and document structure\n"
48
+ "- For tables:\n"
49
+ " * Read EVERY column header exactly as printed on the page\n"
50
+ " * Include ALL columns even if the table is very wide\n"
51
+ " * Format as markdown tables with | delimiters and --- separator rows\n"
52
+ " * Each data row MUST have the same number of | cells as the header row\n"
53
+ " * Preserve multi-line cell content β€” use <br> for line breaks within cells\n"
54
+ " * For financial/lease tables, preserve ALL numbers, dates, and terms exactly\n"
55
+ " * Add spaces between words β€” never concatenate (e.g., 'CAP Rate' not 'CAPRate')\n"
56
+ "- Do NOT wrap output in code fences (no ```)\n"
57
+ "- Do NOT add image descriptions, [Image:] tags, or describe visual elements\n"
58
+ "- Do NOT include page headers, footers, page numbers, or repeated branding\n"
59
+ "- Do NOT extract text from map images or photographs\n"
60
+ "- Output ONLY the extracted markdown content, nothing else"
61
+ ),
62
+ },
63
+ ],
64
+ }
65
+ ],
66
+ "generationConfig": {
67
+ "temperature": 0.1,
68
+ "maxOutputTokens": 32768,
69
+ },
70
+ }
71
+
72
+ url = (
73
+ f"https://generativelanguage.googleapis.com/v1beta/models/"
74
+ f"{GEMINI_MODEL}:generateContent?key={GEMINI_API_KEY}"
75
+ )
76
+
77
+ for attempt in range(1, 3):
78
+ try:
79
+ timeout = GEMINI_TIMEOUT * (1.5 if attempt > 1 else 1.0)
80
+ response = httpx.post(url, json=payload, timeout=timeout)
81
+
82
+ if response.status_code == 429:
83
+ # Rate limited β€” wait and retry
84
+ logger.warning(
85
+ f"[{request_id}] Gemini rate limited on page {page_no + 1}, "
86
+ f"attempt {attempt}. Waiting 5s..."
87
+ )
88
+ time.sleep(5)
89
+ continue
90
+
91
+ if response.status_code != 200:
92
+ try:
93
+ err = response.json()
94
+ msg = str(err.get("error", {}).get("message", str(err)[:300]))
95
+ except Exception:
96
+ msg = response.text[:300]
97
+ logger.error(
98
+ f"[{request_id}] Gemini error ({response.status_code}) "
99
+ f"page {page_no + 1}: {msg}"
100
+ )
101
+ if attempt == 1:
102
+ continue
103
+ return None
104
+
105
+ result = response.json()
106
+ candidates = result.get("candidates", [])
107
+ if not candidates:
108
+ logger.warning(f"[{request_id}] Gemini returned no candidates for page {page_no + 1}")
109
+ return None
110
+
111
+ parts = candidates[0].get("content", {}).get("parts", [])
112
+ if not parts:
113
+ return None
114
+
115
+ content = parts[0].get("text", "")
116
+
117
+ # Clean up: strip code fences if Gemini wraps output
118
+ content = _CODE_FENCE_PATTERN.sub("", content)
119
+ content = _CODE_FENCE_END.sub("", content)
120
+
121
+ return content.strip() if content.strip() else None
122
+
123
+ except (httpx.TimeoutException, httpx.ConnectError) as e:
124
+ if attempt == 1:
125
+ logger.warning(
126
+ f"[{request_id}] Gemini attempt {attempt} failed on page {page_no + 1}: {e}. Retrying..."
127
+ )
128
+ continue
129
+ logger.error(f"[{request_id}] Gemini failed after 2 attempts on page {page_no + 1}: {e}")
130
+ return None
131
+
132
+ return None
models.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic models for API request/response schemas."""
2
+
3
+ from typing import Optional, Union
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
+ class ParseResponse(BaseModel):
9
+ """Response model for document parsing."""
10
+
11
+ success: bool
12
+ markdown: Optional[str] = None
13
+ json_content: Optional[Union[dict, list]] = None
14
+ images_zip: Optional[str] = None
15
+ image_count: int = 0
16
+ error: Optional[str] = None
17
+ pages_processed: int = 0
18
+ device_used: Optional[str] = None
19
+ vlm_model: Optional[str] = None
20
+
21
+
22
+ class HealthResponse(BaseModel):
23
+ """Health check response."""
24
+
25
+ status: str
26
+ version: str
27
+ model: str
28
+ gemini_status: str = "unknown"
29
+ images_scale: float = 2.0
30
+
31
+
32
+ class URLParseRequest(BaseModel):
33
+ """Request model for URL-based parsing."""
34
+
35
+ url: str
36
+ output_format: str = "markdown"
37
+ images_scale: Optional[float] = None
38
+ start_page: int = 0
39
+ end_page: Optional[int] = None
40
+ include_images: bool = False
pipeline.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PaddleOCR-VL pipeline, hybrid conversion logic, and file helpers."""
2
+
3
+ import base64
4
+ import io
5
+ import re
6
+ import shutil
7
+ import time
8
+ import zipfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from pathlib import Path
11
+ from typing import BinaryIO, Optional
12
+
13
+ from paddleocr import PaddleOCRVL
14
+
15
+ from config import GEMINI_API_KEY, GEMINI_CONCURRENCY, GEMINI_MODEL, logger
16
+ from gemini import _gemini_extract_page
17
+ from postprocess import _post_process_merged_markdown
18
+ from rendering import _pdf_to_page_images
19
+
20
+ # Global PaddleOCR-VL pipeline instance
21
+ _pipeline = None
22
+
23
+
24
+ def _get_pipeline():
25
+ """Get or create the global PaddleOCR-VL-1.5 pipeline instance."""
26
+ global _pipeline
27
+ if _pipeline is None:
28
+ _pipeline = PaddleOCRVL()
29
+ return _pipeline
30
+
31
+
32
+ def _page_has_tables(result) -> bool:
33
+ """Check if PaddleOCR result contains table elements from layout analysis.
34
+
35
+ Uses layout detection labels and falls back to markdown pattern matching.
36
+ """
37
+ try:
38
+ # Try accessing layout detection results
39
+ if hasattr(result, 'json') and result.json:
40
+ json_data = result.json
41
+ if isinstance(json_data, dict):
42
+ for block in json_data.get('layout_det', []):
43
+ if block.get('label', '').lower() == 'table':
44
+ return True
45
+ # Fallback: check markdown content for table patterns
46
+ md = result.markdown
47
+ if isinstance(md, dict):
48
+ md_text = md.get('markdown_texts', '')
49
+ else:
50
+ md_text = str(md)
51
+ return bool(re.search(r'^\|.+\|.+\|$', md_text, re.MULTILINE))
52
+ except Exception:
53
+ return False
54
+
55
+
56
+ def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
57
+ """Sync helper to save uploaded file to disk."""
58
+ with open(input_path, "wb") as f:
59
+ shutil.copyfileobj(file_obj, f)
60
+
61
+
62
+ def _save_downloaded_content(input_path: Path, content: bytes) -> None:
63
+ """Sync helper to save downloaded content to disk."""
64
+ with open(input_path, "wb") as f:
65
+ f.write(content)
66
+
67
+
68
+ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
69
+ """Create a zip file from extracted images."""
70
+ image_dir = output_dir / "images"
71
+ if not image_dir.exists():
72
+ return None, 0
73
+
74
+ image_extensions = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".webp"}
75
+ zip_buffer = io.BytesIO()
76
+ image_count = 0
77
+
78
+ with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
79
+ for img_path in image_dir.glob("*"):
80
+ if img_path.is_file() and img_path.suffix.lower() in image_extensions:
81
+ try:
82
+ zf.write(img_path, f"images/{img_path.name}")
83
+ image_count += 1
84
+ except Exception as e:
85
+ logger.warning(f"Failed to add image {img_path} to zip: {e}")
86
+
87
+ if image_count == 0:
88
+ return None, 0
89
+
90
+ return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
91
+
92
+
93
+ def _convert_document(
94
+ input_path: Path,
95
+ output_dir: Path,
96
+ include_images: bool,
97
+ request_id: str,
98
+ start_page: int = 0,
99
+ end_page: Optional[int] = None,
100
+ ) -> tuple:
101
+ """
102
+ PaddleOCR-VL-1.5 + Gemini hybrid conversion.
103
+
104
+ Pass 1 (GPU): PaddleOCR-VL-1.5 on full PDF (native document parsing)
105
+ Detect: Find table pages from layout analysis
106
+ Pass 2 (API): Gemini 3 Flash ONLY on table pages (high-quality tables)
107
+ Merge: Gemini for table pages, PaddleOCR for everything else
108
+
109
+ Returns: (markdown_content, json_content, pages_processed, image_count)
110
+ """
111
+ overall_start = time.time()
112
+
113
+ # ---- PASS 1: PaddleOCR-VL-1.5 on full PDF ----
114
+ pipeline = _get_pipeline()
115
+ paddle_start = time.time()
116
+ output = pipeline.predict(str(input_path))
117
+ paddle_time = time.time() - paddle_start
118
+
119
+ # Collect per-page markdown and detect table pages
120
+ page_markdowns = []
121
+ table_pages = set()
122
+ for i, res in enumerate(output):
123
+ md_data = res.markdown
124
+ page_markdowns.append(md_data)
125
+ # Check if this page has tables from layout analysis
126
+ if _page_has_tables(res):
127
+ table_pages.add(i)
128
+
129
+ logger.info(
130
+ f"[{request_id}] Pass 1: PaddleOCR-VL-1.5 processed {len(page_markdowns)} pages "
131
+ f"in {paddle_time:.2f}s β€” {len(table_pages)} table pages detected"
132
+ )
133
+
134
+ # ---- PASS 2: Gemini on table pages only ----
135
+ gemini_page_texts: dict[int, str] = {}
136
+ gemini_time = 0.0
137
+
138
+ if table_pages and GEMINI_API_KEY:
139
+ logger.info(
140
+ f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(table_pages)} table pages "
141
+ f"({GEMINI_CONCURRENCY} concurrent)"
142
+ )
143
+
144
+ # Render table page images for Gemini
145
+ page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
146
+ page_image_map = {pno: pbytes for pno, pbytes in page_images}
147
+
148
+ gemini_start = time.time()
149
+ with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
150
+ futures = {
151
+ executor.submit(
152
+ _gemini_extract_page, page_image_map[pno], request_id, pno
153
+ ): pno
154
+ for pno in table_pages
155
+ if pno in page_image_map
156
+ }
157
+ for future in as_completed(futures):
158
+ pno = futures[future]
159
+ try:
160
+ text = future.result()
161
+ if text:
162
+ gemini_page_texts[pno] = text
163
+ logger.info(
164
+ f"[{request_id}] Gemini processed table page {pno + 1} "
165
+ f"({len(text)} chars)"
166
+ )
167
+ except Exception as e:
168
+ logger.warning(f"[{request_id}] Gemini failed page {pno + 1}: {e}")
169
+
170
+ gemini_time = time.time() - gemini_start
171
+ logger.info(
172
+ f"[{request_id}] Pass 2 completed in {gemini_time:.2f}s β€” "
173
+ f"{len(gemini_page_texts)}/{len(table_pages)} table pages enhanced via Gemini"
174
+ )
175
+ elif table_pages and not GEMINI_API_KEY:
176
+ logger.warning(
177
+ f"[{request_id}] {len(table_pages)} table pages detected but GEMINI_API_KEY not set β€” "
178
+ f"using PaddleOCR output for tables"
179
+ )
180
+
181
+ # ---- MERGE: Gemini for table pages, PaddleOCR for others ----
182
+ md_parts: list[str] = []
183
+ for i, md_data in enumerate(page_markdowns):
184
+ if i in gemini_page_texts:
185
+ md_parts.append(gemini_page_texts[i])
186
+ else:
187
+ # Extract markdown text from PaddleOCR result
188
+ if isinstance(md_data, dict):
189
+ md_text = md_data.get("markdown_texts", "")
190
+ else:
191
+ md_text = str(md_data)
192
+ md_parts.append(md_text)
193
+
194
+ markdown_content = "\n\n".join(md_parts)
195
+
196
+ # Post-process: fix cross-page artifacts, deduplicate headers, clean tables
197
+ pages_processed = len(page_markdowns)
198
+ if pages_processed > 1:
199
+ markdown_content = _post_process_merged_markdown(markdown_content)
200
+
201
+ total_time = time.time() - overall_start
202
+
203
+ logger.info(
204
+ f"[{request_id}] v5.0.0 conversion complete: {pages_processed} pages β€” "
205
+ f"PaddleOCR {paddle_time:.1f}s + Gemini {gemini_time:.1f}s = {total_time:.2f}s total"
206
+ )
207
+ if pages_processed > 0:
208
+ logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
209
+
210
+ return markdown_content, None, pages_processed, 0
postprocess.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Post-processing functions and regex patterns for markdown cleanup."""
2
+
3
+ import re
4
+
5
+ # ---------------------------------------------------------------------------
6
+ # Post-processing regex patterns
7
+ # ---------------------------------------------------------------------------
8
+
9
+ # Day-of-week date lines (e.g., "Thursday, October 31, 2024")
10
+ _STANDALONE_DATE = re.compile(
11
+ r"^\s*(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
12
+ r"(?:January|February|March|April|May|June|July|August|September|"
13
+ r"October|November|December)\s+\d{1,2},\s+\d{4}\s*$",
14
+ re.MULTILINE,
15
+ )
16
+ # Standalone time (e.g., "11:30 AM")
17
+ _STANDALONE_TIME = re.compile(r"^\s*\d{1,2}:\d{2}\s*(?:AM|PM)\s*$", re.MULTILINE)
18
+ # Page footer patterns: "N | address" or "N address N" (e.g., "2 | 8575 W Golf Rd, Niles, IL 60714 | 3")
19
+ _PAGE_FOOTER = re.compile(
20
+ r"^\s*\d{1,3}\s*\|?\s*\d{2,5}\s+\w.*(?:Rd|St|Ave|Blvd|Dr|Ln|Way|Ct)\b.*\d{5}.*$",
21
+ re.MULTILINE,
22
+ )
23
+ # Standalone page number lines (e.g., "12" alone on a line)
24
+ _STANDALONE_PAGE_NUM = re.compile(r"^\s*\d{1,3}\s*$", re.MULTILINE)
25
+ # Branding footer lines: "Text | Text | N" or "Text | Text - Text N" pattern
26
+ # Matches lines with 2+ pipe-separated segments ending in a page number,
27
+ # where total line length > 30 chars (to avoid matching short legitimate text)
28
+ _BRANDING_FOOTER = re.compile(
29
+ r"^\s*[A-Za-z][^|]{5,}\|[^|]+\|?\s*\d{1,3}\s*$",
30
+ re.MULTILINE,
31
+ )
32
+ # Short repeated location lines that appear as page artifacts (e.g., "Niles, IL" alone)
33
+ _SHORT_LOCATION_LINE = re.compile(
34
+ r"^\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z]{2}\s*$", re.MULTILINE
35
+ )
36
+ # Numbered section pattern: "N. TITLE" where N is 1-99 and TITLE is mostly uppercase
37
+ _NUMBERED_SECTION = re.compile(r"^(\d{1,2})\.\s+([A-Z][A-Z\s\-/&,]+(?:\.\s*)?)")
38
+
39
+ # Table row with ALL empty cells (e.g., "| | | | |")
40
+ _EMPTY_TABLE_ROW = re.compile(r"^\|(?:\s*\|)+\s*$", re.MULTILINE)
41
+ # Trailing empty cells in a table row (e.g., "| data | data | | | |")
42
+ _TRAILING_EMPTY_CELLS = re.compile(r"(?:\s*\|\s*){2,}\s*$")
43
+ # Table separator row (e.g., "|---|---|---|")
44
+ _TABLE_SEP_ROW = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$")
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Post-Processing Functions
49
+ # ---------------------------------------------------------------------------
50
+
51
+
52
def _post_process_merged_markdown(content: str) -> str:
    """Post-process merged multi-page markdown to fix cross-page artifacts.

    Applied after all pages are concatenated. Fixes:
    - Duplicate document headings (re-extracted page headers)
    - Duplicate short metadata lines (subtitles, dates repeated per page)
    - Page footer/header artifacts (standalone dates, times, page numbers)
    - Numbered section heading normalization (consistent ## levels)
    - Table artifacts (empty rows, trailing empty cells)
    - Cross-page table continuations (merge split tables)
    - Excessive whitespace
    """
    # Cleanup steps run in a fixed order: dedup first, then artifact removal,
    # then structural normalization, table cleanup, and table merging.
    cleanup_steps = (
        _deduplicate_headings,
        _deduplicate_short_blocks,
        _remove_page_boundary_artifacts,
        _normalize_numbered_headings,
        _clean_table_artifacts,
        _merge_split_tables,
    )
    for step in cleanup_steps:
        content = step(content)

    # Collapse runs of 4+ newlines (left behind by removed lines) down to 3.
    content = re.sub(r"\n{4,}", "\n\n\n", content)
    return content.strip()
73
+
74
+
75
+ def _deduplicate_headings(content: str) -> str:
76
+ """Remove duplicate heading lines, keeping only the first occurrence.
77
+
78
+ When processing each page, page headers/document titles may be re-extracted.
79
+ This removes exact duplicate headings while preserving table rows and body text.
80
+ """
81
+ lines = content.split("\n")
82
+ seen_headings: set[str] = set()
83
+ result: list[str] = []
84
+
85
+ for line in lines:
86
+ stripped = line.strip()
87
+ if stripped.startswith("#"):
88
+ # Normalize heading for comparison (lowercase, strip trailing #)
89
+ key = stripped.lstrip("#").strip().lower()
90
+ if key and key in seen_headings:
91
+ continue # Skip duplicate heading
92
+ if key:
93
+ seen_headings.add(key)
94
+ result.append(line)
95
+
96
+ return "\n".join(result)
97
+
98
+
99
+ def _deduplicate_short_blocks(content: str) -> str:
100
+ """Remove duplicate short text blocks that repeat across pages.
101
+
102
+ When processing each page, document subtitles, metadata lines, and other
103
+ short repeating text may be re-extracted. This removes exact duplicates
104
+ of short non-table blocks (< 120 chars).
105
+ """
106
+ blocks = content.split("\n\n")
107
+ seen: set[str] = set()
108
+ result: list[str] = []
109
+
110
+ for block in blocks:
111
+ stripped = block.strip()
112
+ if not stripped:
113
+ result.append(block)
114
+ continue
115
+
116
+ # Only deduplicate short, non-table, non-heading blocks
117
+ is_table = stripped.startswith("|") and "|" in stripped[1:]
118
+ is_heading = stripped.startswith("#")
119
+ if is_table or is_heading or len(stripped) > 120:
120
+ result.append(block)
121
+ continue
122
+
123
+ key = stripped.lower()
124
+ if key in seen:
125
+ continue # Skip duplicate short block
126
+
127
+ seen.add(key)
128
+ result.append(block)
129
+
130
+ return "\n\n".join(result)
131
+
132
+
133
def _remove_page_boundary_artifacts(content: str) -> str:
    """Remove page footer/header artifacts like standalone dates, times, page numbers, and footers.

    Unconditional removals (every occurrence): day-of-week date lines,
    standalone times, "N | address"-style footers, and bare page-number
    lines.  Conditional removals (only when the same line appears 3+
    times, so a single legitimate occurrence survives): branding footers
    and short "City, ST" location lines.  Substituting with "" leaves
    blank lines behind; the caller collapses excess newlines afterwards.
    """
    content = _STANDALONE_DATE.sub("", content)
    content = _STANDALONE_TIME.sub("", content)
    content = _PAGE_FOOTER.sub("", content)
    content = _STANDALONE_PAGE_NUM.sub("", content)
    # Remove repeated patterns (only removed when they appear 3+ times)
    content = _remove_repeated_lines(content, _BRANDING_FOOTER, min_repeats=3)
    content = _remove_repeated_lines(content, _SHORT_LOCATION_LINE, min_repeats=3)
    return content
143
+
144
+
145
+ def _remove_repeated_lines(content: str, pattern: re.Pattern, min_repeats: int = 3) -> str:
146
+ """Remove lines matching a pattern that appear min_repeats+ times (clearly artifacts)."""
147
+ counts: dict[str, int] = {}
148
+ for m in pattern.finditer(content):
149
+ key = m.group(0).strip().lower()
150
+ counts[key] = counts.get(key, 0) + 1
151
+
152
+ repeated = {k for k, v in counts.items() if v >= min_repeats}
153
+ if not repeated:
154
+ return content
155
+
156
+ lines = content.split("\n")
157
+ result: list[str] = []
158
+ for line in lines:
159
+ if line.strip().lower() in repeated:
160
+ continue
161
+ result.append(line)
162
+ return "\n".join(result)
163
+
164
+
165
def _normalize_numbered_headings(content: str) -> str:
    """Normalize numbered section headings to consistent ## level.

    Inconsistently formatted numbered sections like "3. OCCUPANCY" —
    some get ## headings, some are plain text. This detects the pattern
    and, only when BOTH forms are present, promotes the plain-text form
    to a ## heading so the document outline is uniform. When the document
    is already consistent, content is returned unchanged.
    """
    lines = content.split("\n")

    # Matches lines that are already headings, e.g. "## 3. OCCUPANCY".
    # Hoisted out of the loop so it is compiled once.
    headed_re = re.compile(r"^#{1,3}\s+(\d{1,2})\.\s+[A-Z]")

    # First pass: detect which numbered sections exist and their heading status
    sections_with_heading: set[int] = set()
    sections_without_heading: set[int] = set()

    for line in lines:
        stripped = line.strip()
        heading_match = headed_re.match(stripped)
        if heading_match:
            sections_with_heading.add(int(heading_match.group(1)))
            continue
        # Plain text like "3. OCCUPANCY. Tenant shall..."
        plain_match = _NUMBERED_SECTION.match(stripped)
        if plain_match:
            sections_without_heading.add(int(plain_match.group(1)))

    # Only rewrite when headed and non-headed numbered sections are mixed.
    if not (sections_with_heading and sections_without_heading):
        return content

    result: list[str] = []
    for line in lines:
        stripped = line.strip()
        plain_match = _NUMBERED_SECTION.match(stripped)
        if plain_match and int(plain_match.group(1)) in sections_without_heading:
            # Split "N. TITLE. body..." into a heading plus its body text.
            title_end = plain_match.end()
            # BUGFIX: the match can end with ". " (period + whitespace from
            # the optional (?:\.\s*)? tail); a bare rstrip(".") left that
            # intact and produced headings like "## 3. OCCUPANCY. ".
            # Strip whitespace first, then trailing periods.
            title = stripped[:title_end].rstrip().rstrip(".")
            body = stripped[title_end:].strip()
            # NOTE(review): when the title is NOT period-terminated, the
            # regex can absorb the body's first capital letter into the
            # title (e.g. "3. NOTICE Tenant…") — confirm upstream output
            # always period-terminates section titles.
            result.append(f"## {title}")
            if body:
                result.append(body)
            continue
        result.append(line)

    return "\n".join(result)
217
+
218
+
219
def _clean_table_artifacts(content: str) -> str:
    """Clean table formatting artifacts.

    - Removes table rows where ALL cells are empty (e.g. "| | | |")
    - Strips trailing runs of 2+ empty cells from table data rows
    - Leaves separator rows ("|---|---|") untouched

    Data rows are emitted stripped of surrounding whitespace; all other
    lines are passed through verbatim.
    """
    cleaned: list[str] = []

    # Plain iteration — the previous enumerate() index was never used.
    for line in content.split("\n"):
        stripped = line.strip()

        # Skip completely empty table rows (| | | | |)
        if _EMPTY_TABLE_ROW.match(stripped):
            continue

        # Clean trailing empty cells from table data rows
        if stripped.startswith("|") and "|" in stripped[1:]:
            # Don't touch separator rows — their cells are meaningful dashes.
            if not _TABLE_SEP_ROW.match(stripped):
                cleaned.append(_TRAILING_EMPTY_CELLS.sub(" |", stripped))
                continue

        cleaned.append(line)

    return "\n".join(cleaned)
248
+
249
+
250
+ def _is_table_line(line: str) -> bool:
251
+ """Check if a line is a markdown table row or separator."""
252
+ s = line.strip()
253
+ return bool(s.startswith("|") and s.endswith("|") and s.count("|") >= 3)
254
+
255
+
256
+ def _count_columns(line: str) -> int:
257
+ """Count the number of columns in a table row."""
258
+ s = line.strip()
259
+ if not s.startswith("|"):
260
+ return 0
261
+ # Split by | and count non-boundary segments
262
+ parts = s.split("|")
263
+ # First and last are empty strings from leading/trailing |
264
+ return max(0, len(parts) - 2)
265
+
266
+
267
def _merge_split_tables(content: str) -> str:
    """Merge table continuations that were split across pages.

    Detects when a run of blank lines separates what should be a single
    table, and splices the continuation's data rows onto the first table.
    If the continuation re-states its header (a header row followed by a
    "|---|" separator within the next two lines), the duplicate header and
    separator are dropped; otherwise only the blank gap is removed.
    Tables are only merged when their column counts are within 30% of
    each other.  Non-blank text between two tables prevents merging.
    """
    lines = content.split("\n")
    result: list[str] = []
    i = 0

    while i < len(lines):
        result.append(lines[i])
        i += 1

        # Only look for a continuation immediately after a table row.
        if not _is_table_line(result[-1]):
            continue

        last_table_cols = _count_columns(result[-1])
        if last_table_cols < 2:
            continue

        # Scan ahead over blank lines only; any non-blank line ends the gap.
        # (gap_lines is collected but never re-emitted — dropping the gap is
        # what physically merges the tables.)
        j = i
        gap_lines: list[str] = []
        while j < len(lines):
            s = lines[j].strip()
            if s == "":
                gap_lines.append(lines[j])
                j += 1
                continue
            break

        if j >= len(lines):
            continue

        # The next non-blank line must itself be a table row to merge.
        if not _is_table_line(lines[j]):
            continue

        next_table_cols = _count_columns(lines[j])

        # (last_table_cols < 2 was already ruled out above; kept for safety.)
        if last_table_cols < 2 or next_table_cols < 2:
            continue
        ratio = min(last_table_cols, next_table_cols) / max(last_table_cols, next_table_cols)
        if ratio < 0.7:
            continue

        # A separator row within the next 1-2 lines means the continuation
        # re-extracted its header on the new page.
        has_new_header = False
        if _is_table_line(lines[j]):
            # Look for a separator row in the next 1-2 lines
            for k in range(j + 1, min(j + 3, len(lines))):
                if _TABLE_SEP_ROW.match(lines[k].strip()):
                    has_new_header = True
                    break

        if has_new_header:
            # Skip the gap, the duplicate header row(s), and the separator;
            # resume at the first data row of the continuation.
            skip_to = j
            while skip_to < len(lines):
                if _TABLE_SEP_ROW.match(lines[skip_to].strip()):
                    skip_to += 1  # Skip past separator
                    break
                skip_to += 1
            i = skip_to
        else:
            # No header — just skip the blank gap; the continuation rows are
            # appended by the main loop on the next iterations.
            i = j

    return "\n".join(result)
rendering.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PDF-to-page-images rendering and image preprocessing (CLAHE)."""
2
+
3
+ import os
4
+ import tempfile
5
+ import time
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import cv2
11
+ from pdf2image import convert_from_path
12
+
13
+ from config import RENDER_DPI, logger
14
+
15
+
16
def _preprocess_image_for_ocr(image_path: str) -> str:
    """Enhance image quality for better OCR accuracy (modifies the file in place).

    Applies CLAHE contrast enhancement only (fast).
    Denoising was removed in v3.2.1 — it added ~10s/page with minimal
    benefit for VLM-based OCR which handles noise well.

    Args:
        image_path: Path to an image readable by OpenCV; the file is
            overwritten with the enhanced version.

    Returns:
        The same image_path.  If OpenCV cannot decode the file, it is
        left untouched and the path is returned unchanged.
    """
    img = cv2.imread(image_path)
    if img is None:
        # Unreadable/corrupt image — skip enhancement, keep the original.
        return image_path

    # CLAHE contrast enhancement on the L (lightness) channel only, so
    # local contrast is boosted without shifting colors.
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l = clahe.apply(l)
    lab = cv2.merge([l, a, b])
    img = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)

    # Overwrite the source file with the enhanced image.
    cv2.imwrite(image_path, img)
    return image_path
37
+
38
+
39
def _render_single_page(
    input_path: Path, page_idx: int, dpi: int
) -> tuple[int, Optional[bytes]]:
    """Render a single PDF page to PNG bytes with CLAHE preprocessing.

    Args:
        input_path: Path to the source PDF.
        page_idx: 0-based page index (pdf2image's page numbers are 1-based).
        dpi: Render resolution passed to poppler.

    Returns:
        (page_idx, png_bytes) on success, or (page_idx, None) on failure
        (failures are logged, never raised).
    """
    try:
        # pdf2image pages are 1-based and the range is inclusive.
        images = convert_from_path(
            str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
        )
        if not images:
            return page_idx, None

        img = images[0]
        # Reserve a temp path, then do save + preprocess + read-back inside
        # one try/finally.  BUGFIX: previously img.save() ran before the
        # try block, so a failed save leaked the temp file.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            img.save(tmp_path, format="PNG")
            _preprocess_image_for_ocr(tmp_path)  # in-place CLAHE enhancement
            with open(tmp_path, "rb") as f:
                return page_idx, f.read()
        finally:
            os.unlink(tmp_path)
    except Exception as e:
        logger.warning(f"Failed to render page {page_idx + 1}: {e}")
        return page_idx, None
67
+
68
+
69
def _pdf_to_page_images(
    input_path: Path,
    request_id: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
) -> list[tuple[int, bytes]]:
    """Convert PDF pages to PNG image bytes using parallel rendering.

    Uses ThreadPoolExecutor for concurrent page rendering.

    Args:
        input_path: Path to the PDF file.
        request_id: Correlation id used only in log messages.
        start_page: 0-based index of the first page to render.
        end_page: 0-based index of the last page to render (inclusive);
            None renders through the final page.

    Returns:
        List of (page_no, png_bytes) tuples, sorted by page number.
        Returns [] if PDF metadata cannot be read; individual pages that
        fail to render are omitted (logged by _render_single_page).
    """
    try:
        from pdf2image.pdf2image import pdfinfo_from_path

        info = pdfinfo_from_path(str(input_path))
        total_pages = info["Pages"]
        # end_page is inclusive while range() is exclusive — hence +1,
        # clamped to the document's actual page count.
        last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
    except Exception as e:
        logger.warning(f"[{request_id}] Could not get PDF info: {e}")
        return []

    page_indices = list(range(start_page, last_page))

    start_time = time.time()
    page_images: list[tuple[int, bytes]] = []

    # Render pages in parallel (4 threads — I/O bound, not CPU bound for poppler)
    # NOTE(review): poppler rasterization runs in separate subprocesses, so
    # threads mostly wait on them; confirm max_workers=4 against core count.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(_render_single_page, input_path, idx, RENDER_DPI): idx
            for idx in page_indices
        }
        for future in as_completed(futures):
            page_idx, png_bytes = future.result()
            if png_bytes is not None:
                page_images.append((page_idx, png_bytes))

    # as_completed yields in finish order — restore document order.
    page_images.sort(key=lambda x: x[0])
    render_time = time.time() - start_time
    logger.info(
        f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
        f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
    )
    return page_images
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
- # Docling VLM Parser API Dependencies
2
- # Optimized for HuggingFace Spaces with vLLM + Qwen3-VL-30B-A3B
3
 
4
- # Docling - IBM's document parsing library (VLM pipeline support)
5
- docling>=2.15.0
6
 
7
  # Web framework
8
  fastapi>=0.115.0
@@ -11,23 +11,17 @@ uvicorn[standard]>=0.32.0
11
  # File upload handling
12
  python-multipart>=0.0.9
13
 
14
- # HTTP client for URL parsing and vLLM health checks
15
  httpx>=0.27.0
16
 
17
- # Type checking
18
  pydantic>=2.0.0
19
 
20
- # Image preprocessing for degraded documents
21
  opencv-python-headless>=4.10.0
22
 
23
- # ONNX Runtime for Docling's RapidOCR text detection
24
- onnxruntime>=1.19.0
25
-
26
- # PDF to image conversion for VLM OCR pass
27
  pdf2image>=1.17.0
28
 
29
- # PDF page extraction (for creating mini-PDFs with only table pages)
30
- pypdf>=4.0.0
31
-
32
- # HuggingFace Hub for model downloads
33
  huggingface-hub>=0.25.0
 
1
+ # PaddleOCR-VL-1.5 + Gemini Hybrid Parser API Dependencies
2
+ # PaddlePaddle GPU is installed separately in the Dockerfile (requires special index URL)
3
 
4
+ # PaddleOCR with document parsing support (PaddleOCR-VL-1.5)
5
+ paddleocr[doc-parser]
6
 
7
  # Web framework
8
  fastapi>=0.115.0
 
11
  # File upload handling
12
  python-multipart>=0.0.9
13
 
14
+ # HTTP client for Gemini API calls and URL fetching
15
  httpx>=0.27.0
16
 
17
+ # Request/response models
18
  pydantic>=2.0.0
19
 
20
+ # Image preprocessing (CLAHE contrast enhancement)
21
  opencv-python-headless>=4.10.0
22
 
23
+ # PDF page rendering for Gemini page images
 
 
 
24
  pdf2image>=1.17.0
25
 
26
+ # Model utilities
 
 
 
27
  huggingface-hub>=0.25.0
start.sh CHANGED
@@ -1,84 +1,7 @@
1
- #!/usr/bin/env bash
2
- set -e
 
 
3
 
4
- # Force all output to be visible
5
- exec 2>&1
6
-
7
- echo "[startup] ====== Docling VLM Parser starting ======"
8
- echo "[startup] Date: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
9
- echo "[startup] GPU: $(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>&1 || echo 'NO GPU')"
10
- echo "[startup] HF cache: $(du -sh /home/user/.cache/huggingface 2>/dev/null || echo 'empty')"
11
-
12
- # ── Configuration ────────────────────────────────────────────────────────────
13
- VLLM_MODEL="Qwen/Qwen3-VL-30B-A3B-Instruct"
14
- VLLM_HOST="127.0.0.1"
15
- VLLM_PORT="8000"
16
- HEALTH_URL="http://${VLLM_HOST}:${VLLM_PORT}/health"
17
- POLL_INTERVAL=5
18
- MAX_WAIT=600
19
-
20
- # ── Start vLLM server in background ─────────────────────────────────────────
21
- echo "[startup] Starting vLLM server with model: ${VLLM_MODEL}"
22
-
23
- python3 -m vllm.entrypoints.openai.api_server \
24
- --model "${VLLM_MODEL}" \
25
- --host "${VLLM_HOST}" \
26
- --port "${VLLM_PORT}" \
27
- --max-num-seqs 16 \
28
- --max-model-len 65536 \
29
- --gpu-memory-utilization 0.85 \
30
- --dtype auto \
31
- --trust-remote-code \
32
- --limit-mm-per-prompt '{"image": 1}' \
33
- 2>&1 &
34
-
35
- VLLM_PID=$!
36
- echo "[startup] vLLM server started with PID ${VLLM_PID}"
37
-
38
- # ── Poll vLLM health endpoint until ready ────────────────────────────────────
39
- echo "[startup] Waiting for vLLM to become healthy (polling every ${POLL_INTERVAL}s, timeout ${MAX_WAIT}s)..."
40
-
41
- elapsed=0
42
- while [ "${elapsed}" -lt "${MAX_WAIT}" ]; do
43
- # Check if vLLM process is still alive
44
- if ! kill -0 "${VLLM_PID}" 2>/dev/null; then
45
- echo "[startup] ERROR: vLLM process (PID ${VLLM_PID}) died during startup"
46
- exit 1
47
- fi
48
-
49
- if curl -sf "${HEALTH_URL}" > /dev/null 2>&1; then
50
- echo "[startup] vLLM is healthy after ${elapsed}s"
51
- break
52
- fi
53
-
54
- # Heartbeat every 30s
55
- if [ $((elapsed % 30)) -eq 0 ] && [ "${elapsed}" -gt 0 ]; then
56
- echo "[startup] Still waiting for vLLM... ${elapsed}s elapsed"
57
- fi
58
-
59
- sleep "${POLL_INTERVAL}"
60
- elapsed=$((elapsed + POLL_INTERVAL))
61
- done
62
-
63
- if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then
64
- echo "[startup] ERROR: vLLM did not become healthy within ${MAX_WAIT}s"
65
- echo "[startup] Killing vLLM process (PID ${VLLM_PID})"
66
- kill "${VLLM_PID}" 2>/dev/null || true
67
- exit 1
68
- fi
69
-
70
- # ── Start FastAPI with vLLM cleanup on exit ──────────────────────────────────
71
- _cleanup() {
72
- echo "[startup] Shutting down vLLM (PID ${VLLM_PID})"
73
- kill "${VLLM_PID}" 2>/dev/null
74
- wait "${VLLM_PID}" 2>/dev/null
75
- }
76
- trap _cleanup EXIT TERM INT
77
-
78
- echo "[startup] Starting FastAPI server on 0.0.0.0:7860"
79
-
80
- python3 -m uvicorn app:app \
81
- --host 0.0.0.0 \
82
- --port 7860 \
83
- --workers 1 \
84
- --timeout-keep-alive 300
 
1
#!/bin/bash
# Start the PaddleOCR-VL + Gemini hybrid parser API.
# Single process: FastAPI with PaddleOCR-VL-1.5 loaded in-process.
# Note: Dockerfile should ensure this script is executable (chmod +x).
set -euo pipefail

# exec replaces the shell so uvicorn receives container stop signals
# (SIGTERM) directly instead of them stopping at a wrapper shell.
# --timeout-keep-alive 300: document parses are long-running; keep idle
# keep-alive connections open longer than uvicorn's 5s default (restores
# the pre-v5 setting that was dropped in the rewrite).
exec uvicorn app:app \
  --host 0.0.0.0 \
  --port 7860 \
  --workers 1 \
  --timeout-keep-alive 300