Spaces:
Running on T4
Running on T4
Ibad ur Rehman committed on
Commit ·
c28aa68
1
Parent(s): b5db7b1
fix: update docling gemini parser
Browse files- app.py +47 -21
- pipeline.py +469 -33
- requirements.txt +2 -2
app.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
-
Docling
|
| 3 |
|
| 4 |
-
A FastAPI service using Docling
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import asyncio
|
|
@@ -20,11 +23,18 @@ from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
|
|
| 20 |
|
| 21 |
from auth import _validate_url, verify_token
|
| 22 |
from config import (
|
|
|
|
| 23 |
DOCLING_DEVICE,
|
| 24 |
DOCLING_NUM_THREADS,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
IMAGES_SCALE,
|
| 26 |
MAX_FILE_SIZE_BYTES,
|
| 27 |
MAX_FILE_SIZE_MB,
|
|
|
|
|
|
|
| 28 |
logger,
|
| 29 |
)
|
| 30 |
from models import HealthResponse, ParseResponse, URLParseRequest
|
|
@@ -41,27 +51,34 @@ from pipeline import (
|
|
| 41 |
async def lifespan(app: FastAPI):
|
| 42 |
"""Startup: initialize Docling converter."""
|
| 43 |
logger.info("=" * 60)
|
| 44 |
-
logger.info("Starting Docling
|
| 45 |
-
logger.info("Initializing Docling
|
| 46 |
_get_converter()
|
| 47 |
-
logger.info("Docling
|
| 48 |
|
|
|
|
| 49 |
logger.info(f"Images scale: {IMAGES_SCALE}")
|
| 50 |
logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
|
| 51 |
logger.info(f"Docling device: {DOCLING_DEVICE}")
|
| 52 |
logger.info(f"Docling threads: {DOCLING_NUM_THREADS}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
logger.info("=" * 60)
|
| 55 |
-
logger.info("Docling
|
| 56 |
logger.info("=" * 60)
|
| 57 |
yield
|
| 58 |
logger.info("Shutting down Docling VLM Parser API...")
|
| 59 |
|
| 60 |
|
| 61 |
app = FastAPI(
|
| 62 |
-
title="Docling
|
| 63 |
-
description="Docling
|
| 64 |
-
version="
|
| 65 |
lifespan=lifespan,
|
| 66 |
)
|
| 67 |
|
|
@@ -71,9 +88,9 @@ async def health_check() -> HealthResponse:
|
|
| 71 |
"""Health check endpoint."""
|
| 72 |
return HealthResponse(
|
| 73 |
status="healthy",
|
| 74 |
-
version="
|
| 75 |
-
model="Docling +
|
| 76 |
-
gemini_status="
|
| 77 |
images_scale=IMAGES_SCALE,
|
| 78 |
)
|
| 79 |
|
|
@@ -88,7 +105,7 @@ async def parse_document(
|
|
| 88 |
include_images: bool = Form(default=False, description="Include extracted images"),
|
| 89 |
_token: str = Depends(verify_token),
|
| 90 |
) -> ParseResponse:
|
| 91 |
-
"""Parse a document file using
|
| 92 |
request_id = str(uuid4())[:8]
|
| 93 |
start_time = time.time()
|
| 94 |
|
|
@@ -159,9 +176,9 @@ async def parse_document(
|
|
| 159 |
images_zip=images_zip,
|
| 160 |
image_count=image_count,
|
| 161 |
pages_processed=pages_processed,
|
| 162 |
-
device_used=
|
| 163 |
-
vlm_model="Docling +
|
| 164 |
-
gemini_page_count=
|
| 165 |
gemini_pages=gemini_pages,
|
| 166 |
)
|
| 167 |
except Exception as e:
|
|
@@ -177,7 +194,7 @@ async def parse_document_from_url(
|
|
| 177 |
request: URLParseRequest,
|
| 178 |
_token: str = Depends(verify_token),
|
| 179 |
) -> ParseResponse:
|
| 180 |
-
"""Parse a document from URL using
|
| 181 |
request_id = str(uuid4())[:8]
|
| 182 |
start_time = time.time()
|
| 183 |
|
|
@@ -245,14 +262,23 @@ async def parse_document_from_url(
|
|
| 245 |
images_zip=images_zip,
|
| 246 |
image_count=image_count,
|
| 247 |
pages_processed=pages_processed,
|
| 248 |
-
device_used=
|
| 249 |
-
vlm_model="Docling +
|
| 250 |
-
gemini_page_count=
|
| 251 |
gemini_pages=gemini_pages,
|
| 252 |
)
|
|
|
|
|
|
|
|
|
|
| 253 |
except Exception as e:
|
| 254 |
total_duration = time.time() - start_time
|
| 255 |
logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
|
| 256 |
return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
|
| 257 |
finally:
|
| 258 |
shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Docling VLM Parser API v6.0.0
|
| 3 |
|
| 4 |
+
A FastAPI service using a Docling-first + Gemini hybrid architecture for
|
| 5 |
+
document parsing:
|
| 6 |
+
Pass 1: Docling on a PDF slice or full input (no OCR)
|
| 7 |
+
Pass 2 (API): Gemini on table pages and weak-text pages
|
| 8 |
+
Post: Cross-page artifact removal, table cleanup, deduplication
|
| 9 |
"""
|
| 10 |
|
| 11 |
import asyncio
|
|
|
|
| 23 |
|
| 24 |
from auth import _validate_url, verify_token
|
| 25 |
from config import (
|
| 26 |
+
BITMAP_AREA_THRESHOLD,
|
| 27 |
DOCLING_DEVICE,
|
| 28 |
DOCLING_NUM_THREADS,
|
| 29 |
+
GEMINI_API_KEY,
|
| 30 |
+
GEMINI_CONCURRENCY,
|
| 31 |
+
GEMINI_MODEL,
|
| 32 |
+
IMAGE_DOMINANT_THRESHOLD,
|
| 33 |
IMAGES_SCALE,
|
| 34 |
MAX_FILE_SIZE_BYTES,
|
| 35 |
MAX_FILE_SIZE_MB,
|
| 36 |
+
RENDER_DPI,
|
| 37 |
+
SPARSE_TEXT_THRESHOLD,
|
| 38 |
logger,
|
| 39 |
)
|
| 40 |
from models import HealthResponse, ParseResponse, URLParseRequest
|
|
|
|
| 51 |
async def lifespan(app: FastAPI):
    """Warm up the Docling converter and log the active configuration.

    Statements before ``yield`` build the shared converter via
    ``_get_converter()`` and log the configured settings; the statement after
    ``yield`` logs the shutdown message.

    Args:
        app: The FastAPI application this lifespan handler is attached to
            (unused in the body).
    """
    banner = "=" * 60

    logger.info(banner)
    logger.info("Starting Docling VLM Parser API v6.0.0...")
    logger.info("Initializing Docling converter...")
    _get_converter()
    logger.info("Docling converter ready")

    # One line per setting; only whether the API key is set is logged,
    # never its value.
    settings_report = (
        f"Render DPI: {RENDER_DPI}",
        f"Images scale: {IMAGES_SCALE}",
        f"Max file size: {MAX_FILE_SIZE_MB}MB",
        f"Docling device: {DOCLING_DEVICE}",
        f"Docling threads: {DOCLING_NUM_THREADS}",
        f"Bitmap area threshold: {BITMAP_AREA_THRESHOLD}",
        f"Sparse text threshold: {SPARSE_TEXT_THRESHOLD}",
        f"Image dominant threshold: {IMAGE_DOMINANT_THRESHOLD}",
        f"Gemini Model: {GEMINI_MODEL}",
        f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}",
        f"Gemini Concurrency: {GEMINI_CONCURRENCY}",
    )
    for message in settings_report:
        logger.info(message)

    logger.info(banner)
    logger.info("Docling VLM Parser API ready (Docling-first + Gemini hybrid)")
    logger.info(banner)
    yield
    logger.info("Shutting down Docling VLM Parser API...")
|
| 76 |
|
| 77 |
|
| 78 |
app = FastAPI(
|
| 79 |
+
title="Docling VLM Parser API",
|
| 80 |
+
description="Docling-first + Gemini hybrid parser",
|
| 81 |
+
version="6.0.0",
|
| 82 |
lifespan=lifespan,
|
| 83 |
)
|
| 84 |
|
|
|
|
| 88 |
"""Health check endpoint."""
|
| 89 |
return HealthResponse(
|
| 90 |
status="healthy",
|
| 91 |
+
version="6.0.0",
|
| 92 |
+
model="Docling + Gemini",
|
| 93 |
+
gemini_status="configured" if GEMINI_API_KEY else "not set",
|
| 94 |
images_scale=IMAGES_SCALE,
|
| 95 |
)
|
| 96 |
|
|
|
|
| 105 |
include_images: bool = Form(default=False, description="Include extracted images"),
|
| 106 |
_token: str = Depends(verify_token),
|
| 107 |
) -> ParseResponse:
|
| 108 |
+
"""Parse a document file using the hybrid parser."""
|
| 109 |
request_id = str(uuid4())[:8]
|
| 110 |
start_time = time.time()
|
| 111 |
|
|
|
|
| 176 |
images_zip=images_zip,
|
| 177 |
image_count=image_count,
|
| 178 |
pages_processed=pages_processed,
|
| 179 |
+
device_used="cpu",
|
| 180 |
+
vlm_model="Docling + Gemini",
|
| 181 |
+
gemini_page_count=len(gemini_pages),
|
| 182 |
gemini_pages=gemini_pages,
|
| 183 |
)
|
| 184 |
except Exception as e:
|
|
|
|
| 194 |
request: URLParseRequest,
|
| 195 |
_token: str = Depends(verify_token),
|
| 196 |
) -> ParseResponse:
|
| 197 |
+
"""Parse a document from URL using the hybrid parser."""
|
| 198 |
request_id = str(uuid4())[:8]
|
| 199 |
start_time = time.time()
|
| 200 |
|
|
|
|
| 262 |
images_zip=images_zip,
|
| 263 |
image_count=image_count,
|
| 264 |
pages_processed=pages_processed,
|
| 265 |
+
device_used="cpu",
|
| 266 |
+
vlm_model="Docling + Gemini",
|
| 267 |
+
gemini_page_count=len(gemini_pages),
|
| 268 |
gemini_pages=gemini_pages,
|
| 269 |
)
|
| 270 |
+
except httpx.HTTPError as e:
|
| 271 |
+
logger.error(f"[{request_id}] Download failed: {e}")
|
| 272 |
+
return ParseResponse(success=False, error=f"Failed to download file from URL (ref: {request_id})")
|
| 273 |
except Exception as e:
|
| 274 |
total_duration = time.time() - start_time
|
| 275 |
logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
|
| 276 |
return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
|
| 277 |
finally:
|
| 278 |
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
if __name__ == "__main__":
|
| 282 |
+
import uvicorn
|
| 283 |
+
|
| 284 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
pipeline.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Docling
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import io
|
|
@@ -6,39 +6,64 @@ import re
|
|
| 6 |
import shutil
|
| 7 |
import time
|
| 8 |
import zipfile
|
|
|
|
| 9 |
from pathlib import Path
|
| 10 |
from typing import BinaryIO, Optional
|
| 11 |
|
|
|
|
| 12 |
from docling.datamodel.base_models import InputFormat
|
| 13 |
-
from docling.datamodel.document import PictureItem
|
| 14 |
from docling.datamodel.pipeline_options import (
|
| 15 |
AcceleratorOptions,
|
| 16 |
-
EasyOcrOptions,
|
| 17 |
PdfPipelineOptions,
|
| 18 |
TableFormerMode,
|
| 19 |
-
TableStructureOptions,
|
| 20 |
)
|
| 21 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 22 |
from pypdf import PdfReader, PdfWriter
|
| 23 |
|
| 24 |
-
from config import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
_converter = None
|
|
|
|
| 27 |
_INLINE_DATA_IMAGE = re.compile(r"!\[[^\]]*\]\(data:image/[^)]+\)", re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def _get_converter():
|
| 31 |
-
"""Get or create the global Docling converter instance
|
| 32 |
global _converter
|
| 33 |
if _converter is None:
|
| 34 |
pipeline_options = PdfPipelineOptions()
|
| 35 |
-
pipeline_options.do_ocr =
|
| 36 |
-
pipeline_options.ocr_options = EasyOcrOptions(lang=["en"])
|
| 37 |
pipeline_options.do_table_structure = True
|
| 38 |
-
pipeline_options.table_structure_options =
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
pipeline_options.images_scale = IMAGES_SCALE
|
| 43 |
pipeline_options.accelerator_options = AcceleratorOptions(
|
| 44 |
device=DOCLING_DEVICE,
|
|
@@ -49,6 +74,7 @@ def _get_converter():
|
|
| 49 |
format_options={
|
| 50 |
InputFormat.PDF: PdfFormatOption(
|
| 51 |
pipeline_options=pipeline_options,
|
|
|
|
| 52 |
)
|
| 53 |
}
|
| 54 |
)
|
|
@@ -60,13 +86,11 @@ def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
|
|
| 60 |
shutil.copyfileobj(file_obj, f)
|
| 61 |
|
| 62 |
|
| 63 |
-
|
| 64 |
def _save_downloaded_content(input_path: Path, content: bytes) -> None:
    """Write ``content`` to ``input_path`` in binary mode.

    Args:
        input_path: Destination file; an existing file is overwritten.
        content: Raw bytes to store.
    """
    with open(input_path, "wb") as destination:
        destination.write(content)
|
| 67 |
|
| 68 |
|
| 69 |
-
|
| 70 |
def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
| 71 |
"""Create a zip file from extracted images."""
|
| 72 |
image_dir = output_dir / "images"
|
|
@@ -90,13 +114,62 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
|
| 90 |
return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
|
| 91 |
|
| 92 |
|
| 93 |
-
|
| 94 |
def _resolve_pdf_page_count(input_path: Path) -> int:
    """Return the page count that pdf2image's ``pdfinfo_from_path`` reports.

    Args:
        input_path: Path of the PDF to inspect.

    Returns:
        The ``"Pages"`` entry of the pdfinfo result, converted to ``int``.
    """
    # Imported at call time rather than at module import time.
    from pdf2image.pdf2image import pdfinfo_from_path

    pdf_info = pdfinfo_from_path(str(input_path))
    return int(pdf_info["Pages"])
|
| 98 |
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
def _prepare_input_slice(
|
| 102 |
input_path: Path,
|
|
@@ -136,6 +209,265 @@ def _prepare_input_slice(
|
|
| 136 |
return slice_path, start_page, last_page, requested_pages
|
| 137 |
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
|
| 141 |
"""Save Docling picture images to output dir."""
|
|
@@ -160,14 +492,6 @@ def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
|
|
| 160 |
return image_count
|
| 161 |
|
| 162 |
|
| 163 |
-
|
| 164 |
-
def _clean_markdown(markdown: str) -> str:
    """Strip inline data-URI images and collapse excess blank lines.

    Args:
        markdown: Markdown text to clean.

    Returns:
        The text with every ``_INLINE_DATA_IMAGE`` match removed, runs of three
        or more newlines reduced to two, and surrounding whitespace stripped.
    """
    without_inline_images = _INLINE_DATA_IMAGE.sub("", markdown)
    collapsed = re.sub(r"\n{3,}", "\n\n", without_inline_images)
    return collapsed.strip()
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
def _convert_document(
|
| 172 |
input_path: Path,
|
| 173 |
output_dir: Path,
|
|
@@ -176,34 +500,146 @@ def _convert_document(
|
|
| 176 |
start_page: int = 0,
|
| 177 |
end_page: Optional[int] = None,
|
| 178 |
) -> tuple:
|
| 179 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
overall_start = time.time()
|
| 181 |
working_input, page_offset, resolved_end_page, requested_pages = _prepare_input_slice(
|
| 182 |
input_path, output_dir, request_id, start_page, end_page
|
| 183 |
)
|
| 184 |
|
| 185 |
converter = _get_converter()
|
| 186 |
-
|
| 187 |
result = converter.convert(working_input)
|
| 188 |
doc = result.document
|
| 189 |
if doc is None:
|
| 190 |
raise ValueError(
|
| 191 |
f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
|
| 192 |
)
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
-
|
|
|
|
|
|
|
| 196 |
|
| 197 |
image_count = 0
|
| 198 |
if include_images:
|
| 199 |
image_count = _save_docling_images(doc, output_dir, request_id)
|
| 200 |
|
| 201 |
-
|
|
|
|
| 202 |
logger.info(
|
| 203 |
-
f"[{request_id}] Docling
|
| 204 |
-
f"Docling {
|
|
|
|
| 205 |
)
|
| 206 |
-
if
|
| 207 |
-
logger.info(f"[{request_id}] Speed: {
|
| 208 |
|
| 209 |
-
return markdown_content, None,
|
|
|
|
| 1 |
+
"""Docling-first pipeline, Gemini routing, and file helpers."""
|
| 2 |
|
| 3 |
import base64
|
| 4 |
import io
|
|
|
|
| 6 |
import shutil
|
| 7 |
import time
|
| 8 |
import zipfile
|
| 9 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import BinaryIO, Optional
|
| 12 |
|
| 13 |
+
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
| 14 |
from docling.datamodel.base_models import InputFormat
|
| 15 |
+
from docling.datamodel.document import PictureItem, TableItem
|
| 16 |
from docling.datamodel.pipeline_options import (
|
| 17 |
AcceleratorOptions,
|
|
|
|
| 18 |
PdfPipelineOptions,
|
| 19 |
TableFormerMode,
|
|
|
|
| 20 |
)
|
| 21 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
| 22 |
from pypdf import PdfReader, PdfWriter
|
| 23 |
|
| 24 |
+
from config import (
|
| 25 |
+
BITMAP_AREA_THRESHOLD,
|
| 26 |
+
DOCLING_DEVICE,
|
| 27 |
+
DOCLING_NUM_THREADS,
|
| 28 |
+
GEMINI_API_KEY,
|
| 29 |
+
GEMINI_CONCURRENCY,
|
| 30 |
+
GEMINI_MODEL,
|
| 31 |
+
IMAGE_DOMINANT_THRESHOLD,
|
| 32 |
+
IMAGES_SCALE,
|
| 33 |
+
SPARSE_TEXT_THRESHOLD,
|
| 34 |
+
logger,
|
| 35 |
+
)
|
| 36 |
+
from gemini import _gemini_extract_page
|
| 37 |
+
from postprocess import _post_process_merged_markdown
|
| 38 |
+
from rendering import _pdf_to_page_images
|
| 39 |
|
| 40 |
_converter = None
|
| 41 |
+
_PAGE_MARKER = re.compile(r"^\s*---\s*Page\s+\d+\s*---\s*$", re.MULTILINE)
|
| 42 |
_INLINE_DATA_IMAGE = re.compile(r"!\[[^\]]*\]\(data:image/[^)]+\)", re.IGNORECASE)
|
| 43 |
+
_ABOUT_BLANK_LINE = re.compile(r"^\s*about:blank(?:\s+\d+/\d+)?\s*$", re.IGNORECASE)
|
| 44 |
+
_BROWSER_PRINT_LINE = re.compile(
|
| 45 |
+
r"^\s*\d{1,2}/\d{1,2}/\d{2,4},\s+\d{1,2}:\d{2}\s*(?:AM|PM)\b.*$",
|
| 46 |
+
re.IGNORECASE,
|
| 47 |
+
)
|
| 48 |
+
_PAGE_COUNTER_LINE = re.compile(r"^\s*\d+\s*/\s*\d+\s*$")
|
| 49 |
+
_URLISH_LINE = re.compile(r"^\s*(?:https?://|www\.)\S+\s*$", re.IGNORECASE)
|
| 50 |
+
_SHORT_ARTIFACT_LINE = re.compile(
|
| 51 |
+
r"^\s*(?:printed from|generated by|page \d+ of \d+|page \d+)\s*$",
|
| 52 |
+
re.IGNORECASE,
|
| 53 |
+
)
|
| 54 |
|
| 55 |
|
| 56 |
def _get_converter():
|
| 57 |
+
"""Get or create the global Docling converter instance."""
|
| 58 |
global _converter
|
| 59 |
if _converter is None:
|
| 60 |
pipeline_options = PdfPipelineOptions()
|
| 61 |
+
pipeline_options.do_ocr = False
|
|
|
|
| 62 |
pipeline_options.do_table_structure = True
|
| 63 |
+
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
|
| 64 |
+
pipeline_options.table_structure_options.do_cell_matching = True
|
| 65 |
+
pipeline_options.generate_page_images = False
|
| 66 |
+
pipeline_options.generate_picture_images = True
|
| 67 |
pipeline_options.images_scale = IMAGES_SCALE
|
| 68 |
pipeline_options.accelerator_options = AcceleratorOptions(
|
| 69 |
device=DOCLING_DEVICE,
|
|
|
|
| 74 |
format_options={
|
| 75 |
InputFormat.PDF: PdfFormatOption(
|
| 76 |
pipeline_options=pipeline_options,
|
| 77 |
+
backend=DoclingParseDocumentBackend,
|
| 78 |
)
|
| 79 |
}
|
| 80 |
)
|
|
|
|
| 86 |
shutil.copyfileobj(file_obj, f)
|
| 87 |
|
| 88 |
|
|
|
|
| 89 |
def _save_downloaded_content(input_path: Path, content: bytes) -> None:
    """Write ``content`` to ``input_path`` in binary mode.

    Args:
        input_path: Destination file; an existing file is overwritten.
        content: Raw bytes to store.
    """
    with open(input_path, "wb") as destination:
        destination.write(content)
|
| 92 |
|
| 93 |
|
|
|
|
| 94 |
def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
| 95 |
"""Create a zip file from extracted images."""
|
| 96 |
image_dir = output_dir / "images"
|
|
|
|
| 114 |
return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
|
| 115 |
|
| 116 |
|
|
|
|
| 117 |
def _resolve_pdf_page_count(input_path: Path) -> int:
    """Return the page count that pdf2image's ``pdfinfo_from_path`` reports.

    Args:
        input_path: Path of the PDF to inspect.

    Returns:
        The ``"Pages"`` entry of the pdfinfo result, converted to ``int``.
    """
    # Imported at call time rather than at module import time.
    from pdf2image.pdf2image import pdfinfo_from_path

    pdf_info = pdfinfo_from_path(str(input_path))
    return int(pdf_info["Pages"])
|
| 121 |
|
| 122 |
|
| 123 |
+
def _extract_pdf_text_by_page(input_path: Path) -> dict[int, str]:
    """Extract native PDF text per page using pypdf as a routing fallback.

    Extraction is best effort: failures are logged and never raised.

    Args:
        input_path: Path of the document to inspect.

    Returns:
        Mapping of zero-based page index to stripped page text. A page whose
        extraction raises maps to ``""``. Returns ``{}`` when the file suffix
        is not ``.pdf`` or when the PDF cannot be opened or iterated.
    """
    if input_path.suffix.lower() != ".pdf":
        return {}

    text_by_page: dict[int, str] = {}
    try:
        reader = PdfReader(str(input_path))
        for page_no, page in enumerate(reader.pages):
            try:
                text_by_page[page_no] = (page.extract_text() or "").strip()
            except Exception as exc:
                # Keep the page in the mapping so callers still see an entry.
                logger.warning(
                    "pypdf text extraction failed on page index %s of %s: %s",
                    page_no,
                    input_path.name,
                    exc,
                )
                text_by_page[page_no] = ""
    except Exception as exc:
        # Partial results are discarded, as before; the failure is now visible.
        logger.warning("pypdf could not read %s: %s", input_path.name, exc)
        return {}
    return text_by_page
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _extract_pdf_page_signals(input_path: Path) -> dict[int, dict[str, bool]]:
    """Inspect PDF structure per page for native text and image-backed pages.

    Args:
        input_path: Path of the document to inspect.

    Returns:
        Mapping of zero-based page index to a dict with the keys
        ``has_fonts``, ``has_images``, ``image_count`` and
        ``image_only_pdf_page``. Note that ``image_count`` is an ``int`` even
        though the annotation says ``bool``. Returns ``{}`` when the file
        suffix is not ``.pdf`` or when the PDF cannot be opened or iterated.
    """
    if input_path.suffix.lower() != ".pdf":
        return {}

    def _resolve(obj):
        """Dereference ``obj`` when it exposes ``get_object()``."""
        # Dictionary values may be indirect references; resolve them before
        # treating them as dictionaries.
        getter = getattr(obj, "get_object", None)
        return getter() if callable(getter) else obj

    page_signals: dict[int, dict[str, bool]] = {}
    try:
        reader = PdfReader(str(input_path))
        for page_no, page in enumerate(reader.pages):
            has_fonts = False
            image_count = 0
            try:
                resources = _resolve(page.get("/Resources")) or {}
                has_fonts = bool(_resolve(resources.get("/Font")))
                xobjects = _resolve(resources.get("/XObject")) or {}
                for xobj in xobjects.values():
                    if _resolve(xobj).get("/Subtype") == "/Image":
                        image_count += 1
            except Exception:
                # Best effort: keep whatever was gathered before the failure.
                pass
            page_signals[page_no] = {
                "has_fonts": has_fonts,
                "has_images": image_count > 0,
                "image_count": image_count,
                "image_only_pdf_page": image_count > 0 and not has_fonts,
            }
    except Exception:
        return {}
    return page_signals
|
| 172 |
+
|
| 173 |
|
| 174 |
def _prepare_input_slice(
|
| 175 |
input_path: Path,
|
|
|
|
| 209 |
return slice_path, start_page, last_page, requested_pages
|
| 210 |
|
| 211 |
|
| 212 |
+
def _bitmap_coverage(page) -> float:
    """Compute bitmap coverage ratio for a Docling page.

    Args:
        page: Object exposing ``_backend.get_bitmap_rects()`` and
            ``size.width`` / ``size.height``.

    Returns:
        Sum of the bitmap rectangle areas divided by the page area, clamped to
        ``1.0``. Rectangles are summed individually, so overlapping ones count
        more than once before the clamp. Returns ``0.0`` when the backend, the
        size or the rectangles are missing, the page area is not positive, or
        any access raises.
    """
    try:
        if page._backend is None or page.size is None:
            return 0.0
        # Materialize first: a lazy iterable is always truthy, which would
        # make the emptiness check below a no-op.
        bitmap_rects = list(page._backend.get_bitmap_rects() or ())
        if not bitmap_rects:
            return 0.0
        page_area = page.size.width * page.size.height
        if page_area <= 0:
            return 0.0
        bitmap_area = sum(max(0.0, rect.area()) for rect in bitmap_rects)
        return min(1.0, bitmap_area / page_area)
    except Exception:
        return 0.0
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def _page_has_native_text(page) -> bool:
    """Check whether Docling extracted meaningful native text on a page.

    Args:
        page: Object exposing an iterable ``cells`` attribute.

    Returns:
        ``True`` when at least one cell has non-blank string ``text`` and a
        falsy (or missing) ``from_ocr`` attribute. ``False`` otherwise,
        including when ``page.cells`` is missing or cannot be iterated.
    """
    try:
        for cell in page.cells:
            text = getattr(cell, "text", None)
            # A cell without usable text must not abort the scan of the
            # remaining cells, so skip it instead of letting .strip() raise.
            if not isinstance(text, str) or not text.strip():
                continue
            if not getattr(cell, "from_ocr", False):
                return True
        return False
    except Exception:
        return False
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _is_artifact_line(stripped: str) -> bool:
    """Return True when a stripped line matches any artifact pattern."""
    artifact_patterns = (
        _ABOUT_BLANK_LINE,
        _BROWSER_PRINT_LINE,
        _PAGE_COUNTER_LINE,
        _URLISH_LINE,
        _SHORT_ARTIFACT_LINE,
    )
    return any(pattern.match(stripped) for pattern in artifact_patterns)


def _artifact_line_count(text: str) -> int:
    """Count the non-blank lines of ``text`` that match an artifact pattern.

    Args:
        text: Multi-line text to scan.

    Returns:
        Number of lines which, after stripping, are non-empty and match one of
        the module's artifact regexes.
    """
    stripped_lines = (line.strip() for line in text.splitlines())
    return sum(
        1 for stripped in stripped_lines if stripped and _is_artifact_line(stripped)
    )


def _clean_extracted_text(text: str) -> str:
    """Drop blank and artifact lines from ``text``.

    Args:
        text: Multi-line text to clean.

    Returns:
        The remaining lines, each stripped, joined with ``"\\n"``.
    """
    stripped_lines = (line.strip() for line in text.splitlines())
    kept_lines = [
        stripped
        for stripped in stripped_lines
        if stripped and not _is_artifact_line(stripped)
    ]
    return "\n".join(kept_lines).strip()
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _repetition_ratio(text: str) -> float:
    """Return the share of non-blank lines that repeat an earlier line.

    Lines are compared after stripping and lower-casing.

    Args:
        text: Multi-line text to measure.

    Returns:
        ``1 - unique_lines / total_lines``, or ``0.0`` when there are no
        non-blank lines.
    """
    normalized_lines = [
        line.strip().lower() for line in text.splitlines() if line.strip()
    ]
    if not normalized_lines:
        return 0.0
    return 1.0 - (len(set(normalized_lines)) / len(normalized_lines))
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def _looks_meaningful_text(cleaned_text: str) -> bool:
    """Decide whether text is substantial enough to count as real content.

    Args:
        cleaned_text: Text to evaluate.

    Returns:
        ``True`` when the text has at least 120 non-whitespace characters, or
        at least 20 distinct (case-insensitive) words that start with an ASCII
        letter.
    """
    compact = re.sub(r"\s+", "", cleaned_text)
    if len(compact) >= 120:
        return True
    words = re.findall(r"[A-Za-z][A-Za-z0-9'&/-]*", cleaned_text)
    return len({word.lower() for word in words}) >= 20
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def _build_page_markdown(doc, page_no: int, elements_by_page: dict[int, list]) -> str:
    """Build markdown for one page from Docling items grouped by provenance.

    Args:
        doc: Document passed to each element's ``export_to_markdown``.
        page_no: Key of the page inside ``elements_by_page``.
        elements_by_page: Mapping of page key to that page's elements.

    Returns:
        Non-empty element renderings joined by blank lines, with inline
        data-URI images and page-marker lines removed. ``PictureItem``
        elements are skipped. Returns ``""`` for a page with no entry.
    """
    parts: list[str] = []
    for element in elements_by_page.get(page_no, []):
        if isinstance(element, PictureItem):
            continue
        try:
            rendered = element.export_to_markdown(doc=doc)
        except Exception:
            # Fall back to the raw text. `or ""` guards against text=None,
            # which would otherwise raise from inside this handler.
            rendered = (getattr(element, "text", "") or "").strip()
        if rendered and rendered.strip():
            parts.append(rendered)

    page_md = "\n\n".join(parts)
    page_md = _INLINE_DATA_IMAGE.sub("", page_md)
    return _PAGE_MARKER.sub("", page_md).strip()
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def _extract_page_markdowns(doc, page_count: int, elements_by_page: dict[int, list]) -> dict[int, str]:
    """Build per-page markdown strictly from Docling provenance.

    Args:
        doc: Document forwarded to ``_build_page_markdown``.
        page_count: Number of pages; keys ``0 .. page_count - 1`` are produced.
        elements_by_page: Mapping of page key to that page's elements.

    Returns:
        Mapping of every page index in ``range(page_count)`` to its markdown.
    """
    return {
        page_no: _build_page_markdown(doc, page_no, elements_by_page)
        for page_no in range(page_count)
    }
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def _normalize_docling_page_no(page) -> int:
    """Normalize Docling page numbering to a zero-based index.

    Args:
        page: Object exposing a ``page_no`` attribute convertible with ``int``.

    Returns:
        ``page_no - 1`` for positive values; zero and negative values are
        returned unchanged.
    """
    raw_page_no = int(page.page_no)
    if raw_page_no > 0:
        return raw_page_no - 1
    return raw_page_no
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def _table_pages(doc) -> set[int]:
    """Collect Docling-detected table page numbers.

    Args:
        doc: Document exposing ``iterate_items()``.

    Returns:
        Zero-based page indices of every ``TableItem`` with provenance, taken
        from the item's first provenance entry.
    """
    pages: set[int] = set()
    for element, _ in doc.iterate_items():
        if isinstance(element, TableItem) and element.prov:
            # The provenance entry carries page_no, so the shared normalizer
            # applies to it directly.
            pages.add(_normalize_docling_page_no(element.prov[0]))
    return pages
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def _picture_pages(doc) -> set[int]:
    """Collect pages containing Docling picture items.

    Args:
        doc: Document exposing ``iterate_items()``.

    Returns:
        Zero-based page indices of every ``PictureItem`` with provenance,
        taken from the item's first provenance entry.
    """
    pages: set[int] = set()
    for element, _ in doc.iterate_items():
        if isinstance(element, PictureItem) and element.prov:
            # The provenance entry carries page_no, so the shared normalizer
            # applies to it directly.
            pages.add(_normalize_docling_page_no(element.prov[0]))
    return pages
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def _build_elements_by_page(doc) -> dict[int, list]:
    """Index Docling elements by page number.

    Args:
        doc: Document exposing ``iterate_items()``.

    Returns:
        Mapping of zero-based page index to the elements whose first
        provenance entry is on that page, in iteration order. Elements without
        provenance are omitted.
    """
    elements_by_page: dict[int, list] = {}
    for element, _ in doc.iterate_items():
        if not element.prov:
            continue
        page_no = _normalize_docling_page_no(element.prov[0])
        elements_by_page.setdefault(page_no, []).append(element)
    return elements_by_page
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def _routing_decision(
    page_no: int,
    page_markdown: str,
    pdf_text: str,
    page_pdf_signals: dict[str, bool],
    page,
    table_pages: set[int],
    picture_pages: set[int],
) -> tuple[str, list[str], dict]:
    """Decide whether a page should use Docling or Gemini.

    Args:
        page_no: Page key, checked against ``table_pages``/``picture_pages``.
        page_markdown: Docling markdown for the page.
        pdf_text: Native text for the page; used for routing only when
            ``page_markdown`` is blank.
        page_pdf_signals: Per-page dict read for the keys
            ``image_only_pdf_page``, ``image_count`` and ``has_images``.
        page: Page object handed to ``_bitmap_coverage`` and
            ``_page_has_native_text``.
        table_pages: Page keys on which a table item was detected.
        picture_pages: Page keys on which a picture item was detected.

    Returns:
        ``(route, reasons, metrics)``. ``route`` is ``"gemini"`` when at least
        one reason fired and ``"docling"`` otherwise; ``metrics`` holds the
        intermediate signals.
    """

    def _compact_len(value: str) -> int:
        """Length of ``value`` with all whitespace removed."""
        return len(re.sub(r"\s+", "", value))

    # Image signals.
    bitmap_coverage = _bitmap_coverage(page)
    image_only_pdf_page = page_pdf_signals.get("image_only_pdf_page", False)
    image_count = int(page_pdf_signals.get("image_count", 0) or 0)
    has_images = bool(page_pdf_signals.get("has_images", False))
    # An image-only page counts as fully covered whatever the backend measured.
    effective_bitmap_coverage = max(bitmap_coverage, 1.0 if image_only_pdf_page else 0.0)
    image_dominant = bitmap_coverage >= IMAGE_DOMINANT_THRESHOLD

    # Text signals from both extraction paths.
    pdf_text_len = _compact_len(pdf_text)
    docling_native_text = _page_has_native_text(page)
    cleaned_docling_text = _clean_extracted_text(page_markdown)
    cleaned_pdf_text = _clean_extracted_text(pdf_text)
    cleaned_docling_text_len = _compact_len(cleaned_docling_text)
    cleaned_pdf_text_len = _compact_len(cleaned_pdf_text)
    meaningful_text_present = (
        _looks_meaningful_text(cleaned_docling_text)
        or _looks_meaningful_text(cleaned_pdf_text)
    )
    native_text_present = docling_native_text or meaningful_text_present

    # Quality heuristics run on the Docling markdown, or on the native PDF
    # text when Docling produced nothing for the page.
    routing_text = page_markdown if page_markdown.strip() else pdf_text
    artifact_lines = _artifact_line_count(routing_text)
    nonempty_lines = max(1, sum(1 for line in routing_text.splitlines() if line.strip()))
    artifact_line_ratio = artifact_lines / nonempty_lines
    repetition_ratio = _repetition_ratio(routing_text)
    # Despite the name this measures the routing text, which may be pdf_text.
    docling_text_len = _compact_len(routing_text)
    page_empty = docling_text_len == 0
    table_like_page = _looks_table_like(routing_text)
    table_page = page_no in table_pages
    picture_page = page_no in picture_pages

    junk_text_heavy = artifact_line_ratio >= 0.5 or repetition_ratio >= 0.45
    image_heavy_page = (
        image_only_pdf_page
        or image_dominant
        or effective_bitmap_coverage > BITMAP_AREA_THRESHOLD
        or image_count >= 3
        or (has_images and not meaningful_text_present)
    )
    low_quality_docling_output = (
        page_empty
        or junk_text_heavy
        or (docling_text_len < SPARSE_TEXT_THRESHOLD and has_images and not meaningful_text_present)
    )

    # Every condition that fires becomes a reason; any reason routes to Gemini.
    reason_checks = (
        (table_page, "table_page"),
        (table_like_page, "table_like_page"),
        (picture_page and not meaningful_text_present, "picture_without_native_text"),
        (page_empty, "docling_empty"),
        (image_heavy_page and not meaningful_text_present, "image_heavy_weak_text"),
        (junk_text_heavy and has_images, "junk_text_with_images"),
        (low_quality_docling_output and has_images, "low_quality_docling_output"),
    )
    reasons: list[str] = [label for fired, label in reason_checks if fired]

    route = "gemini" if reasons else "docling"
    metrics = {
        "bitmap_coverage": round(bitmap_coverage, 4),
        "effective_bitmap_coverage": round(effective_bitmap_coverage, 4),
        "image_count": image_count,
        "docling_text_len": docling_text_len,
        "cleaned_docling_text_len": cleaned_docling_text_len,
        "pdf_text_len": pdf_text_len,
        "cleaned_pdf_text_len": cleaned_pdf_text_len,
        "native_text_present": native_text_present,
        "meaningful_text_present": meaningful_text_present,
        "artifact_line_ratio": round(artifact_line_ratio, 4),
        "repetition_ratio": round(repetition_ratio, 4),
        "junk_text_heavy": junk_text_heavy,
        "image_dominant": image_dominant,
        "image_heavy_page": image_heavy_page,
        "image_only_pdf_page": image_only_pdf_page,
        "picture_page": picture_page,
        "table_page": table_page,
        "table_like_page": table_like_page,
        "page_empty": page_empty,
    }
    return route, reasons, metrics
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
def _looks_table_like(page_markdown: str) -> bool:
    """Guess whether a page's markdown is really a flattened table.

    Used as a fallback signal for pages where no table element was emitted.
    A page qualifies only when BOTH of the following hold:

    * one of the first six non-blank lines contains at least four distinct
      column-header keywords (see ``header_vocab`` below), and
    * the page has at least two "numeric dense" lines (>= 4 numbers and
      >= 2 words), or a single line longer than 250 characters that holds
      at least ten numbers (a table collapsed into one text block).

    Args:
        page_markdown: Markdown/text extracted for a single page.

    Returns:
        True when the page looks tabular, False otherwise. Pages with fewer
        than four non-blank lines are never considered table-like.
    """
    rows = [stripped for stripped in (raw.strip() for raw in page_markdown.splitlines()) if stripped]
    if len(rows) < 4:
        return False

    header_vocab = {
        "code", "name", "avg", "sq", "sq.", "sqft", "rent", "units", "occupied",
        "vacant", "notice", "leased", "model", "admin", "trend", "availability",
        "date", "rate", "%", "unit", "floor", "suite",
    }

    def _is_header_row(row: str) -> bool:
        # Tokens are runs of letters, '%' and '.', so "sq." and a bare "%"
        # can match their vocabulary entries verbatim.
        found = set(re.findall(r"[A-Za-z%\.]+", row.lower()))
        return len(header_vocab & found) >= 4

    # Without a header-looking row near the top, nothing else matters.
    if not any(_is_header_row(row) for row in rows[:6]):
        return False

    dense_row_count = sum(
        1
        for row in rows
        if len(re.findall(r"\d+(?:[.,]\d+)?%?", row)) >= 4
        and len(re.findall(r"[A-Za-z][A-Za-z\-\/&]*", row)) >= 2
    )
    if dense_row_count >= 2:
        return True

    # Last resort: one very long line packed with numbers.
    return any(
        len(row) > 250 and len(re.findall(r"\d+(?:[.,]\d+)?", row)) >= 10
        for row in rows
    )
|
| 470 |
+
|
| 471 |
|
| 472 |
def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
|
| 473 |
"""Save Docling picture images to output dir."""
|
|
|
|
| 492 |
return image_count
|
| 493 |
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
def _convert_document(
|
| 496 |
input_path: Path,
|
| 497 |
output_dir: Path,
|
|
|
|
| 500 |
start_page: int = 0,
|
| 501 |
end_page: Optional[int] = None,
|
| 502 |
) -> tuple:
|
| 503 |
+
"""
|
| 504 |
+
Docling-first + Gemini hybrid conversion.
|
| 505 |
+
|
| 506 |
+
Flow:
|
| 507 |
+
- Docling parses the requested PDF slice with OCR disabled.
|
| 508 |
+
- Pages route to Gemini when they are table pages or Docling output is too weak.
|
| 509 |
+
- One parser wins per page; post-processing happens only after merge.
|
| 510 |
+
"""
|
| 511 |
overall_start = time.time()
|
| 512 |
working_input, page_offset, resolved_end_page, requested_pages = _prepare_input_slice(
|
| 513 |
input_path, output_dir, request_id, start_page, end_page
|
| 514 |
)
|
| 515 |
|
| 516 |
converter = _get_converter()
|
| 517 |
+
docling_start = time.time()
|
| 518 |
result = converter.convert(working_input)
|
| 519 |
doc = result.document
|
| 520 |
if doc is None:
|
| 521 |
raise ValueError(
|
| 522 |
f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
|
| 523 |
)
|
| 524 |
+
docling_time = time.time() - docling_start
|
| 525 |
+
|
| 526 |
+
elements_by_page = _build_elements_by_page(doc)
|
| 527 |
+
table_pages = _table_pages(doc)
|
| 528 |
+
picture_pages = _picture_pages(doc)
|
| 529 |
+
|
| 530 |
+
page_markdowns = _extract_page_markdowns(doc, len(result.pages), elements_by_page)
|
| 531 |
+
pdf_text_by_page = _extract_pdf_text_by_page(working_input)
|
| 532 |
+
pdf_page_signals = _extract_pdf_page_signals(working_input)
|
| 533 |
+
routes: dict[int, tuple[str, list[str], dict]] = {}
|
| 534 |
+
for page in result.pages:
|
| 535 |
+
page_no = _normalize_docling_page_no(page)
|
| 536 |
+
page_md = page_markdowns.get(page_no, "")
|
| 537 |
+
pdf_text = pdf_text_by_page.get(page_no, "")
|
| 538 |
+
page_pdf_signals = pdf_page_signals.get(page_no, {})
|
| 539 |
+
if not page_md and pdf_text:
|
| 540 |
+
page_markdowns[page_no] = pdf_text
|
| 541 |
+
page_md = pdf_text
|
| 542 |
+
routes[page_no] = _routing_decision(
|
| 543 |
+
page_no,
|
| 544 |
+
page_md,
|
| 545 |
+
pdf_text,
|
| 546 |
+
page_pdf_signals,
|
| 547 |
+
page,
|
| 548 |
+
table_pages,
|
| 549 |
+
picture_pages,
|
| 550 |
+
)
|
| 551 |
+
|
| 552 |
+
gemini_targets = [page_no for page_no, (route, _, _) in routes.items() if route == "gemini"]
|
| 553 |
+
gemini_target_pages = [page_offset + page_no + 1 for page_no in gemini_targets]
|
| 554 |
+
|
| 555 |
+
logger.info(
|
| 556 |
+
f"[{request_id}] Pass 1: Docling processed {len(page_markdowns)} pages in {docling_time:.2f}s; "
|
| 557 |
+
f"table pages: {len(table_pages)}; gemini targets: {len(gemini_targets)}; "
|
| 558 |
+
f"requested range: {page_offset}-{resolved_end_page if resolved_end_page is not None else 'end'}"
|
| 559 |
+
)
|
| 560 |
+
|
| 561 |
+
for page_no in sorted(routes):
|
| 562 |
+
route, reasons, metrics = routes[page_no]
|
| 563 |
+
logger.info(
|
| 564 |
+
f"[{request_id}] Route page {page_offset + page_no + 1}: {route}; "
|
| 565 |
+
f"reasons={reasons or ['docling_default']}; metrics={metrics}"
|
| 566 |
+
)
|
| 567 |
+
|
| 568 |
+
gemini_page_texts: dict[int, str] = {}
|
| 569 |
+
render_time = 0.0
|
| 570 |
+
gemini_time = 0.0
|
| 571 |
+
|
| 572 |
+
if gemini_targets and GEMINI_API_KEY:
|
| 573 |
+
render_start = time.time()
|
| 574 |
+
page_images = _pdf_to_page_images(working_input, request_id, 0, None)
|
| 575 |
+
render_time = time.time() - render_start
|
| 576 |
+
page_image_map = {pno: pbytes for pno, pbytes in page_images}
|
| 577 |
+
|
| 578 |
+
logger.info(
|
| 579 |
+
f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(gemini_targets)} routed pages "
|
| 580 |
+
f"({GEMINI_CONCURRENCY} concurrent)"
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
gemini_start = time.time()
|
| 584 |
+
with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
|
| 585 |
+
futures = {
|
| 586 |
+
executor.submit(
|
| 587 |
+
_gemini_extract_page,
|
| 588 |
+
page_image_map[pno],
|
| 589 |
+
request_id,
|
| 590 |
+
page_offset + pno,
|
| 591 |
+
): pno
|
| 592 |
+
for pno in gemini_targets
|
| 593 |
+
if pno in page_image_map
|
| 594 |
+
}
|
| 595 |
+
for future in as_completed(futures):
|
| 596 |
+
pno = futures[future]
|
| 597 |
+
try:
|
| 598 |
+
text = future.result()
|
| 599 |
+
if text:
|
| 600 |
+
gemini_page_texts[pno] = text.strip()
|
| 601 |
+
logger.info(
|
| 602 |
+
f"[{request_id}] Gemini processed page {page_offset + pno + 1} "
|
| 603 |
+
f"({len(text)} chars)"
|
| 604 |
+
)
|
| 605 |
+
except Exception as e:
|
| 606 |
+
logger.warning(
|
| 607 |
+
f"[{request_id}] Gemini failed page {page_offset + pno + 1}: {e}; "
|
| 608 |
+
"falling back to Docling page output"
|
| 609 |
+
)
|
| 610 |
+
gemini_time = time.time() - gemini_start
|
| 611 |
+
elif gemini_targets and not GEMINI_API_KEY:
|
| 612 |
+
logger.warning(
|
| 613 |
+
f"[{request_id}] {len(gemini_targets)} pages routed to Gemini but GEMINI_API_KEY is not set; "
|
| 614 |
+
"falling back to Docling output"
|
| 615 |
+
)
|
| 616 |
+
|
| 617 |
+
merged_pages: list[str] = []
|
| 618 |
+
for page_no in sorted(page_markdowns):
|
| 619 |
+
page_label = page_offset + page_no + 1
|
| 620 |
+
content = gemini_page_texts.get(page_no, page_markdowns[page_no]).strip()
|
| 621 |
+
merged_pages.append(f"--- Page {page_label} ---")
|
| 622 |
+
if content:
|
| 623 |
+
merged_pages.append(content)
|
| 624 |
+
|
| 625 |
+
markdown_content = "\n\n".join(merged_pages).strip()
|
| 626 |
|
| 627 |
+
post_start = time.time()
|
| 628 |
+
markdown_content = _post_process_merged_markdown(markdown_content)
|
| 629 |
+
post_time = time.time() - post_start
|
| 630 |
|
| 631 |
image_count = 0
|
| 632 |
if include_images:
|
| 633 |
image_count = _save_docling_images(doc, output_dir, request_id)
|
| 634 |
|
| 635 |
+
pages_processed = len(page_markdowns) or requested_pages
|
| 636 |
+
total_time = time.time() - overall_start
|
| 637 |
logger.info(
|
| 638 |
+
f"[{request_id}] Docling+Gemini conversion complete: {pages_processed} pages; "
|
| 639 |
+
f"Docling {docling_time:.1f}s + Render {render_time:.1f}s + Gemini {gemini_time:.1f}s "
|
| 640 |
+
f"+ Post {post_time:.1f}s = {total_time:.2f}s total"
|
| 641 |
)
|
| 642 |
+
if pages_processed > 0:
|
| 643 |
+
logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
|
| 644 |
|
| 645 |
+
return markdown_content, None, pages_processed, image_count, gemini_target_pages
|
requirements.txt
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
-
# Docling
|
| 2 |
|
| 3 |
docling>=2.15.0
|
| 4 |
-
easyocr>=1.7.2
|
| 5 |
fastapi>=0.115.0
|
| 6 |
uvicorn[standard]>=0.32.0
|
| 7 |
python-multipart>=0.0.9
|
|
@@ -11,3 +10,4 @@ opencv-python-headless>=4.10.0
|
|
| 11 |
pdf2image>=1.17.0
|
| 12 |
huggingface-hub>=0.25.0
|
| 13 |
pypdf>=5.1.0
|
|
|
|
|
|
| 1 |
+
# Docling-first + Gemini Hybrid Parser API Dependencies
|
| 2 |
|
| 3 |
docling>=2.15.0
|
|
|
|
| 4 |
fastapi>=0.115.0
|
| 5 |
uvicorn[standard]>=0.32.0
|
| 6 |
python-multipart>=0.0.9
|
|
|
|
| 10 |
pdf2image>=1.17.0
|
| 11 |
huggingface-hub>=0.25.0
|
| 12 |
pypdf>=5.1.0
|
| 13 |
+
onnxruntime>=1.19.0
|