Spaces:
Running on T4
Running on T4
Commit ·
c67903b
1
Parent(s): 3f46c5e
feat: v3.0.0 VLM-first hybrid architecture — GPU VLM on all pages, Docling TableFormer only on table pages
Browse files- app.py +237 -98
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -1,18 +1,23 @@
|
|
| 1 |
"""
|
| 2 |
-
Docling VLM Parser API
|
| 3 |
|
| 4 |
-
A FastAPI service
|
| 5 |
-
Pass 1:
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
Features:
|
| 10 |
-
- GPU-accelerated
|
| 11 |
-
- TableFormer
|
| 12 |
-
-
|
| 13 |
-
- OpenCV image preprocessing (
|
| 14 |
- Image extraction with configurable resolution
|
| 15 |
-
- Automatic page chunking for large PDFs
|
| 16 |
"""
|
| 17 |
|
| 18 |
import asyncio
|
|
@@ -386,6 +391,60 @@ def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list) -> str:
|
|
| 386 |
return result
|
| 387 |
|
| 388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
# ---------------------------------------------------------------------------
|
| 390 |
# PDF to Page Images
|
| 391 |
# ---------------------------------------------------------------------------
|
|
@@ -503,56 +562,43 @@ def _convert_document(
|
|
| 503 |
end_page: Optional[int] = None,
|
| 504 |
) -> tuple:
|
| 505 |
"""
|
| 506 |
-
|
| 507 |
|
| 508 |
-
Pass 1:
|
| 509 |
-
|
| 510 |
-
|
|
|
|
| 511 |
|
| 512 |
Returns: (markdown_content, json_content, pages_processed, image_count)
|
| 513 |
"""
|
| 514 |
-
|
| 515 |
-
logger.info(
|
| 516 |
-
f"[{request_id}] Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR)"
|
| 517 |
-
)
|
| 518 |
-
converter = _get_converter()
|
| 519 |
-
|
| 520 |
-
start_time = time.time()
|
| 521 |
-
result = converter.convert(input_path)
|
| 522 |
-
doc = result.document
|
| 523 |
-
if doc is None:
|
| 524 |
-
raise ValueError(
|
| 525 |
-
f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
|
| 526 |
-
)
|
| 527 |
-
pass1_time = time.time() - start_time
|
| 528 |
-
logger.info(f"[{request_id}] Pass 1 completed in {pass1_time:.2f}s")
|
| 529 |
|
| 530 |
-
#
|
| 531 |
-
|
| 532 |
-
total_tables = sum(len(v) for v in tables_by_page.values())
|
| 533 |
-
logger.info(f"[{request_id}] TableFormer detected {total_tables} tables")
|
| 534 |
-
|
| 535 |
-
# PASS 2: VLM OCR (enhanced text per page)
|
| 536 |
-
logger.info(f"[{request_id}] Pass 2: VLM OCR via Qwen3-VL ({VLM_MODEL})")
|
| 537 |
-
|
| 538 |
-
# Get page images for VLM
|
| 539 |
page_images = _pdf_to_page_images(input_path, start_page, end_page)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 540 |
|
| 541 |
if not page_images:
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
)
|
| 548 |
-
|
|
|
|
|
|
|
| 549 |
|
| 550 |
vlm_page_texts: dict[int, Optional[str]] = {}
|
| 551 |
vlm_start = time.time()
|
| 552 |
|
| 553 |
-
# Process pages concurrently — vLLM supports batching via --max-num-seqs
|
| 554 |
max_workers = min(2, len(page_images))
|
| 555 |
-
logger.info(
|
|
|
|
|
|
|
| 556 |
|
| 557 |
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
| 558 |
futures = {
|
|
@@ -569,32 +615,99 @@ def _convert_document(
|
|
| 569 |
)
|
| 570 |
except Exception as e:
|
| 571 |
logger.warning(
|
| 572 |
-
f"[{request_id}] VLM failed on page {page_no + 1}: {e}
|
| 573 |
)
|
| 574 |
vlm_page_texts[page_no] = None
|
| 575 |
|
| 576 |
vlm_time = time.time() - vlm_start
|
| 577 |
logger.info(
|
| 578 |
-
f"[{request_id}] Pass
|
| 579 |
)
|
| 580 |
|
| 581 |
-
#
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
|
| 584 |
-
|
| 585 |
-
|
|
|
|
| 586 |
image_count = 0
|
| 587 |
image_dir = output_dir / "images"
|
| 588 |
|
| 589 |
-
if
|
| 590 |
-
|
|
|
|
|
|
|
|
|
|
| 591 |
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 598 |
|
| 599 |
for page_no in sorted(vlm_page_texts.keys()):
|
| 600 |
pages_seen.add(page_no)
|
|
@@ -603,22 +716,60 @@ def _convert_document(
|
|
| 603 |
vlm_text = vlm_page_texts[page_no]
|
| 604 |
|
| 605 |
if vlm_text is None:
|
| 606 |
-
|
| 607 |
-
for element in elements_by_page.get(page_no, []):
|
| 608 |
-
try:
|
| 609 |
-
md_parts.append(element.export_to_markdown(doc=doc))
|
| 610 |
-
except Exception:
|
| 611 |
-
text = getattr(element, "text", "").strip()
|
| 612 |
-
if text:
|
| 613 |
-
md_parts.append(text + "\n\n")
|
| 614 |
else:
|
| 615 |
-
# Merge VLM text with TableFormer tables for this page
|
| 616 |
page_tables = tables_by_page.get(page_no, [])
|
| 617 |
-
|
| 618 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
|
| 620 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
if include_images:
|
|
|
|
|
|
|
| 622 |
for element, _ in doc.iterate_items():
|
| 623 |
if isinstance(element, PictureItem):
|
| 624 |
if element.image and element.image.pil_image:
|
|
@@ -630,21 +781,8 @@ def _convert_document(
|
|
| 630 |
try:
|
| 631 |
element.image.pil_image.save(image_path, format="PNG")
|
| 632 |
image_count += 1
|
| 633 |
-
except Exception
|
| 634 |
-
|
| 635 |
-
f"[{request_id}] Failed to save image {image_name}: {e}"
|
| 636 |
-
)
|
| 637 |
-
|
| 638 |
-
markdown_content = "".join(md_parts)
|
| 639 |
-
pages_processed = len(pages_seen)
|
| 640 |
-
|
| 641 |
-
total_time = pass1_time + vlm_time
|
| 642 |
-
logger.info(
|
| 643 |
-
f"[{request_id}] Hybrid conversion complete: {pages_processed} pages, "
|
| 644 |
-
f"{total_tables} tables, {total_time:.2f}s total"
|
| 645 |
-
)
|
| 646 |
-
if pages_processed > 0:
|
| 647 |
-
logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
|
| 648 |
|
| 649 |
return markdown_content, None, pages_processed, image_count
|
| 650 |
|
|
@@ -688,7 +826,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
|
| 688 |
async def lifespan(app: FastAPI):
|
| 689 |
"""Startup: initialize Docling converter and check vLLM."""
|
| 690 |
logger.info("=" * 60)
|
| 691 |
-
logger.info("Starting Docling VLM Parser API
|
| 692 |
|
| 693 |
device = _get_device()
|
| 694 |
logger.info(f"Device: {device}")
|
|
@@ -725,7 +863,7 @@ async def lifespan(app: FastAPI):
|
|
| 725 |
logger.warning(f"Failed to pre-load Docling models: {e}")
|
| 726 |
|
| 727 |
logger.info("=" * 60)
|
| 728 |
-
logger.info("Docling VLM Parser API ready (
|
| 729 |
logger.info("=" * 60)
|
| 730 |
yield
|
| 731 |
logger.info("Shutting down Docling VLM Parser API...")
|
|
@@ -737,8 +875,8 @@ async def lifespan(app: FastAPI):
|
|
| 737 |
|
| 738 |
app = FastAPI(
|
| 739 |
title="Docling VLM Parser API",
|
| 740 |
-
description="
|
| 741 |
-
version="
|
| 742 |
lifespan=lifespan,
|
| 743 |
)
|
| 744 |
|
|
@@ -767,7 +905,7 @@ async def health_check() -> HealthResponse:
|
|
| 767 |
|
| 768 |
return HealthResponse(
|
| 769 |
status="healthy",
|
| 770 |
-
version="
|
| 771 |
device=device,
|
| 772 |
gpu_name=None, # Don't leak GPU details on unauthenticated endpoint
|
| 773 |
vlm_model="active", # Confirm VLM is configured without leaking model name
|
|
@@ -789,10 +927,11 @@ async def parse_document(
|
|
| 789 |
"""
|
| 790 |
Parse a document file (PDF or image) and return extracted content.
|
| 791 |
|
| 792 |
-
Uses a
|
| 793 |
-
Pass 1:
|
| 794 |
-
|
| 795 |
-
|
|
|
|
| 796 |
|
| 797 |
Supports:
|
| 798 |
- PDF files (.pdf)
|
|
@@ -810,7 +949,7 @@ async def parse_document(
|
|
| 810 |
if output_format not in ("markdown",):
|
| 811 |
raise HTTPException(
|
| 812 |
status_code=400,
|
| 813 |
-
detail="Only 'markdown' output_format is supported in
|
| 814 |
)
|
| 815 |
|
| 816 |
# Validate file size
|
|
@@ -934,7 +1073,7 @@ async def parse_document_from_url(
|
|
| 934 |
if request.output_format not in ("markdown",):
|
| 935 |
raise HTTPException(
|
| 936 |
status_code=400,
|
| 937 |
-
detail="Only 'markdown' output_format is supported in
|
| 938 |
)
|
| 939 |
|
| 940 |
# Validate URL
|
|
|
|
| 1 |
"""
|
| 2 |
+
Docling VLM Parser API v3.0.0
|
| 3 |
|
| 4 |
+
A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
| 5 |
+
Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
|
| 6 |
+
Detect: Identify pages with tables from VLM markdown output
|
| 7 |
+
Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
|
| 8 |
+
Merge: VLM text for all pages + TableFormer tables where detected
|
| 9 |
+
|
| 10 |
+
Key insight: the previous architecture ran Docling's full CPU pipeline (DocLayNet +
|
| 11 |
+
TableFormer + RapidOCR) on ALL pages, taking 60-565s. Most of that time was wasted
|
| 12 |
+
on non-table pages. Now we run the fast GPU VLM first, detect which pages have tables,
|
| 13 |
+
and only send those pages (as a mini-PDF) to Docling for table structure extraction.
|
| 14 |
|
| 15 |
Features:
|
| 16 |
+
- VLM-first: GPU-accelerated OCR on all pages via Qwen3-VL (concurrent)
|
| 17 |
+
- Targeted TableFormer: CPU pipeline runs only on pages with tables
|
| 18 |
+
- pypdf mini-PDF extraction for page-level Docling targeting
|
| 19 |
+
- OpenCV image preprocessing (denoise, CLAHE contrast enhancement)
|
| 20 |
- Image extraction with configurable resolution
|
|
|
|
| 21 |
"""
|
| 22 |
|
| 23 |
import asyncio
|
|
|
|
| 391 |
return result
|
| 392 |
|
| 393 |
|
| 394 |
+
# ---------------------------------------------------------------------------
|
| 395 |
+
# Table Detection from VLM Output
|
| 396 |
+
# ---------------------------------------------------------------------------
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
|
| 400 |
+
"""Detect pages containing tables from VLM markdown output.
|
| 401 |
+
|
| 402 |
+
Looks for markdown table separator rows (e.g., | --- | --- |) which are
|
| 403 |
+
a reliable signal of table content. Returns set of 0-indexed page numbers.
|
| 404 |
+
"""
|
| 405 |
+
# Markdown table separator: | --- | --- | (with optional colons for alignment)
|
| 406 |
+
separator_pattern = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE)
|
| 407 |
+
table_pages: set[int] = set()
|
| 408 |
+
for page_no, text in vlm_page_texts.items():
|
| 409 |
+
if text and separator_pattern.search(text):
|
| 410 |
+
table_pages.add(page_no)
|
| 411 |
+
return table_pages
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def _extract_pages_to_pdf(
    input_path: Path, page_numbers: list[int], request_id: str
) -> tuple[Path, dict[int, int]]:
    """Extract specific pages from a PDF into a mini-PDF.

    Args:
        input_path: Path to the original PDF
        page_numbers: 0-indexed page numbers to extract
        request_id: Used for log messages and for the mini-PDF file name

    Returns:
        (mini_pdf_path, page_map) where page_map maps 1-indexed page numbers
        of the mini-PDF back to 0-indexed original page numbers. Page numbers
        outside the document (negative or past the last page) are skipped and
        do not appear in page_map.
    """
    from pypdf import PdfReader, PdfWriter

    reader = PdfReader(str(input_path))
    writer = PdfWriter()
    total_pages = len(reader.pages)

    # page_map: {1indexed_mini_page: original_0indexed_page}
    page_map: dict[int, int] = {}
    skipped: list[int] = []
    for orig_page in page_numbers:
        # Reject negatives explicitly: a plain "< total_pages" check would let
        # them through and Python indexing would pick a page from the end.
        if not 0 <= orig_page < total_pages:
            skipped.append(orig_page)
            continue
        writer.add_page(reader.pages[orig_page])
        # Key by the page's real position in the mini-PDF, not by its index
        # in page_numbers, so a skipped page cannot shift later entries.
        page_map[len(page_map) + 1] = orig_page

    if skipped:
        logger.warning(
            f"[{request_id}] Skipped out-of-range pages for mini-PDF: {skipped} "
            f"(document has {total_pages} pages)"
        )

    mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
    with open(mini_pdf_path, "wb") as f:
        writer.write(f)

    logger.info(f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original")
    return mini_pdf_path, page_map
|
| 446 |
+
|
| 447 |
+
|
| 448 |
# ---------------------------------------------------------------------------
|
| 449 |
# PDF to Page Images
|
| 450 |
# ---------------------------------------------------------------------------
|
|
|
|
| 562 |
end_page: Optional[int] = None,
|
| 563 |
) -> tuple:
|
| 564 |
"""
|
| 565 |
+
VLM-first hybrid conversion: Qwen3-VL for text + targeted TableFormer for tables.
|
| 566 |
|
| 567 |
+
Pass 1 (GPU): VLM OCR on ALL pages — fast concurrent processing
|
| 568 |
+
Detect: Identify pages with tables from VLM markdown output
|
| 569 |
+
Pass 2 (CPU): Docling TableFormer ONLY on table pages — minimal CPU work
|
| 570 |
+
Merge: VLM text + TableFormer tables
|
| 571 |
|
| 572 |
Returns: (markdown_content, json_content, pages_processed, image_count)
|
| 573 |
"""
|
| 574 |
+
total_start = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
|
| 576 |
+
# --- RENDER: Convert PDF pages to images ---
|
| 577 |
+
render_start = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 578 |
page_images = _pdf_to_page_images(input_path, start_page, end_page)
|
| 579 |
+
render_time = time.time() - render_start
|
| 580 |
+
logger.info(
|
| 581 |
+
f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s"
|
| 582 |
+
)
|
| 583 |
|
| 584 |
if not page_images:
|
| 585 |
+
logger.warning(
|
| 586 |
+
f"[{request_id}] No page images available, falling back to full Docling pipeline"
|
| 587 |
+
)
|
| 588 |
+
return _convert_document_full_docling(
|
| 589 |
+
input_path, output_dir, images_scale, include_images, request_id
|
| 590 |
)
|
| 591 |
+
|
| 592 |
+
# --- PASS 1 (GPU): VLM OCR on all pages ---
|
| 593 |
+
logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
|
| 594 |
|
| 595 |
vlm_page_texts: dict[int, Optional[str]] = {}
|
| 596 |
vlm_start = time.time()
|
| 597 |
|
|
|
|
| 598 |
max_workers = min(2, len(page_images))
|
| 599 |
+
logger.info(
|
| 600 |
+
f"[{request_id}] Sending {len(page_images)} pages to VLM ({max_workers} concurrent)"
|
| 601 |
+
)
|
| 602 |
|
| 603 |
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
| 604 |
futures = {
|
|
|
|
| 615 |
)
|
| 616 |
except Exception as e:
|
| 617 |
logger.warning(
|
| 618 |
+
f"[{request_id}] VLM failed on page {page_no + 1}: {e}"
|
| 619 |
)
|
| 620 |
vlm_page_texts[page_no] = None
|
| 621 |
|
| 622 |
vlm_time = time.time() - vlm_start
|
| 623 |
logger.info(
|
| 624 |
+
f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)"
|
| 625 |
)
|
| 626 |
|
| 627 |
+
# --- DETECT: Find pages with tables in VLM output ---
|
| 628 |
+
table_pages = _detect_table_pages(vlm_page_texts)
|
| 629 |
+
if table_pages:
|
| 630 |
+
logger.info(
|
| 631 |
+
f"[{request_id}] Tables detected on {len(table_pages)} pages: "
|
| 632 |
+
f"{sorted(p + 1 for p in table_pages)}"
|
| 633 |
+
)
|
| 634 |
+
else:
|
| 635 |
+
logger.info(f"[{request_id}] No tables detected — skipping Docling entirely")
|
| 636 |
|
| 637 |
+
# --- PASS 2 (CPU): Docling TableFormer ONLY on table pages ---
|
| 638 |
+
tables_by_page: dict[int, list[str]] = {}
|
| 639 |
+
pass2_time = 0.0
|
| 640 |
image_count = 0
|
| 641 |
image_dir = output_dir / "images"
|
| 642 |
|
| 643 |
+
if table_pages:
|
| 644 |
+
pass2_start = time.time()
|
| 645 |
+
logger.info(
|
| 646 |
+
f"[{request_id}] Pass 2: Docling TableFormer on {len(table_pages)} table pages"
|
| 647 |
+
)
|
| 648 |
|
| 649 |
+
try:
|
| 650 |
+
# Create mini-PDF containing only table pages
|
| 651 |
+
mini_pdf_path, page_map = _extract_pages_to_pdf(
|
| 652 |
+
input_path, sorted(table_pages), request_id
|
| 653 |
+
)
|
| 654 |
+
|
| 655 |
+
# Run Docling on mini-PDF (full pipeline for accurate table cell text)
|
| 656 |
+
converter = _get_converter()
|
| 657 |
+
result = converter.convert(mini_pdf_path)
|
| 658 |
+
doc = result.document
|
| 659 |
+
|
| 660 |
+
if doc:
|
| 661 |
+
# Extract tables, mapping mini-PDF pages back to original page numbers
|
| 662 |
+
for element, _ in doc.iterate_items():
|
| 663 |
+
if isinstance(element, TableItem):
|
| 664 |
+
mini_page = element.prov[0].page_no if element.prov else -1
|
| 665 |
+
orig_page = page_map.get(mini_page, mini_page)
|
| 666 |
+
table_md = element.export_to_markdown(doc=doc)
|
| 667 |
+
tables_by_page.setdefault(orig_page, []).append(table_md)
|
| 668 |
+
|
| 669 |
+
# Extract images from Docling if requested
|
| 670 |
+
if include_images:
|
| 671 |
+
image_dir.mkdir(parents=True, exist_ok=True)
|
| 672 |
+
for element, _ in doc.iterate_items():
|
| 673 |
+
if isinstance(element, PictureItem):
|
| 674 |
+
if element.image and element.image.pil_image:
|
| 675 |
+
pg = element.prov[0].page_no if element.prov else 0
|
| 676 |
+
orig_pg = page_map.get(pg, pg)
|
| 677 |
+
image_id = element.self_ref.split("/")[-1]
|
| 678 |
+
image_name = f"page_{orig_pg + 1}_{image_id}.png"
|
| 679 |
+
image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
|
| 680 |
+
image_path = image_dir / image_name
|
| 681 |
+
try:
|
| 682 |
+
element.image.pil_image.save(image_path, format="PNG")
|
| 683 |
+
image_count += 1
|
| 684 |
+
except Exception as e:
|
| 685 |
+
logger.warning(
|
| 686 |
+
f"[{request_id}] Failed to save image: {e}"
|
| 687 |
+
)
|
| 688 |
+
|
| 689 |
+
# Clean up mini-PDF
|
| 690 |
+
try:
|
| 691 |
+
os.unlink(mini_pdf_path)
|
| 692 |
+
except OSError:
|
| 693 |
+
pass
|
| 694 |
+
|
| 695 |
+
pass2_time = time.time() - pass2_start
|
| 696 |
+
total_tables = sum(len(v) for v in tables_by_page.values())
|
| 697 |
+
logger.info(
|
| 698 |
+
f"[{request_id}] Pass 2 completed in {pass2_time:.2f}s — "
|
| 699 |
+
f"{total_tables} TableFormer tables extracted"
|
| 700 |
+
)
|
| 701 |
+
|
| 702 |
+
except Exception as e:
|
| 703 |
+
pass2_time = time.time() - pass2_start
|
| 704 |
+
logger.warning(
|
| 705 |
+
f"[{request_id}] TableFormer pass failed ({e}), using VLM tables only"
|
| 706 |
+
)
|
| 707 |
+
|
| 708 |
+
# --- MERGE: VLM text + TableFormer tables ---
|
| 709 |
+
md_parts: list[str] = []
|
| 710 |
+
pages_seen: set[int] = set()
|
| 711 |
|
| 712 |
for page_no in sorted(vlm_page_texts.keys()):
|
| 713 |
pages_seen.add(page_no)
|
|
|
|
| 716 |
vlm_text = vlm_page_texts[page_no]
|
| 717 |
|
| 718 |
if vlm_text is None:
|
| 719 |
+
md_parts.append(f"<!-- VLM failed on this page -->\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
else:
|
|
|
|
| 721 |
page_tables = tables_by_page.get(page_no, [])
|
| 722 |
+
if page_tables:
|
| 723 |
+
merged = _merge_vlm_with_tables(vlm_text, page_tables)
|
| 724 |
+
md_parts.append(merged)
|
| 725 |
+
else:
|
| 726 |
+
md_parts.append(vlm_text)
|
| 727 |
+
|
| 728 |
+
markdown_content = "".join(md_parts)
|
| 729 |
+
pages_processed = len(pages_seen)
|
| 730 |
+
|
| 731 |
+
total_time = time.time() - total_start
|
| 732 |
+
logger.info(
|
| 733 |
+
f"[{request_id}] VLM-first conversion complete: {pages_processed} pages — "
|
| 734 |
+
f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
|
| 735 |
+
f"TableFormer {pass2_time:.1f}s = {total_time:.2f}s total"
|
| 736 |
+
)
|
| 737 |
+
if pages_processed > 0:
|
| 738 |
+
logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
|
| 739 |
+
|
| 740 |
+
return markdown_content, None, pages_processed, image_count
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
def _convert_document_full_docling(
|
| 744 |
+
input_path: Path,
|
| 745 |
+
output_dir: Path,
|
| 746 |
+
images_scale: float,
|
| 747 |
+
include_images: bool,
|
| 748 |
+
request_id: str,
|
| 749 |
+
) -> tuple:
|
| 750 |
+
"""Fallback: Full Docling pipeline when page images are unavailable."""
|
| 751 |
+
logger.info(f"[{request_id}] Running full Docling pipeline (fallback mode)")
|
| 752 |
+
|
| 753 |
+
converter = _get_converter()
|
| 754 |
+
start_time = time.time()
|
| 755 |
+
result = converter.convert(input_path)
|
| 756 |
+
doc = result.document
|
| 757 |
|
| 758 |
+
if doc is None:
|
| 759 |
+
raise ValueError("Docling failed to parse document")
|
| 760 |
+
|
| 761 |
+
elapsed = time.time() - start_time
|
| 762 |
+
logger.info(f"[{request_id}] Docling completed in {elapsed:.2f}s")
|
| 763 |
+
|
| 764 |
+
markdown_content = doc.export_to_markdown()
|
| 765 |
+
pages_processed = len(
|
| 766 |
+
set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
|
| 767 |
+
)
|
| 768 |
+
|
| 769 |
+
image_count = 0
|
| 770 |
if include_images:
|
| 771 |
+
image_dir = output_dir / "images"
|
| 772 |
+
image_dir.mkdir(parents=True, exist_ok=True)
|
| 773 |
for element, _ in doc.iterate_items():
|
| 774 |
if isinstance(element, PictureItem):
|
| 775 |
if element.image and element.image.pil_image:
|
|
|
|
| 781 |
try:
|
| 782 |
element.image.pil_image.save(image_path, format="PNG")
|
| 783 |
image_count += 1
|
| 784 |
+
except Exception:
|
| 785 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
|
| 787 |
return markdown_content, None, pages_processed, image_count
|
| 788 |
|
|
|
|
| 826 |
async def lifespan(app: FastAPI):
|
| 827 |
"""Startup: initialize Docling converter and check vLLM."""
|
| 828 |
logger.info("=" * 60)
|
| 829 |
+
logger.info("Starting Docling VLM Parser API v3.0.0...")
|
| 830 |
|
| 831 |
device = _get_device()
|
| 832 |
logger.info(f"Device: {device}")
|
|
|
|
| 863 |
logger.warning(f"Failed to pre-load Docling models: {e}")
|
| 864 |
|
| 865 |
logger.info("=" * 60)
|
| 866 |
+
logger.info("Docling VLM Parser API ready (VLM-first: Qwen3-VL + targeted TableFormer)")
|
| 867 |
logger.info("=" * 60)
|
| 868 |
yield
|
| 869 |
logger.info("Shutting down Docling VLM Parser API...")
|
|
|
|
| 875 |
|
| 876 |
app = FastAPI(
|
| 877 |
title="Docling VLM Parser API",
|
| 878 |
+
description="VLM-first hybrid parser: Qwen3-VL OCR (GPU) + targeted TableFormer (CPU)",
|
| 879 |
+
version="3.0.0",
|
| 880 |
lifespan=lifespan,
|
| 881 |
)
|
| 882 |
|
|
|
|
| 905 |
|
| 906 |
return HealthResponse(
|
| 907 |
status="healthy",
|
| 908 |
+
version="3.0.0",
|
| 909 |
device=device,
|
| 910 |
gpu_name=None, # Don't leak GPU details on unauthenticated endpoint
|
| 911 |
vlm_model="active", # Confirm VLM is configured without leaking model name
|
|
|
|
| 927 |
"""
|
| 928 |
Parse a document file (PDF or image) and return extracted content.
|
| 929 |
|
| 930 |
+
Uses a VLM-first hybrid approach:
|
| 931 |
+
Pass 1 (GPU): Qwen3-VL via vLLM for OCR on all pages (concurrent)
|
| 932 |
+
Detect: Identify pages with tables from VLM output
|
| 933 |
+
Pass 2 (CPU): Docling TableFormer only on table pages
|
| 934 |
+
Merge: VLM text + TableFormer tables
|
| 935 |
|
| 936 |
Supports:
|
| 937 |
- PDF files (.pdf)
|
|
|
|
| 949 |
if output_format not in ("markdown",):
|
| 950 |
raise HTTPException(
|
| 951 |
status_code=400,
|
| 952 |
+
detail="Only 'markdown' output_format is supported in v3.0.0",
|
| 953 |
)
|
| 954 |
|
| 955 |
# Validate file size
|
|
|
|
| 1073 |
if request.output_format not in ("markdown",):
|
| 1074 |
raise HTTPException(
|
| 1075 |
status_code=400,
|
| 1076 |
+
detail="Only 'markdown' output_format is supported in v3.0.0",
|
| 1077 |
)
|
| 1078 |
|
| 1079 |
# Validate URL
|
requirements.txt
CHANGED
|
@@ -26,5 +26,8 @@ onnxruntime>=1.19.0
|
|
| 26 |
# PDF to image conversion for VLM OCR pass
|
| 27 |
pdf2image>=1.17.0
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
# HuggingFace Hub for model downloads
|
| 30 |
huggingface-hub>=0.25.0
|
|
|
|
| 26 |
# PDF to image conversion for VLM OCR pass
|
| 27 |
pdf2image>=1.17.0
|
| 28 |
|
| 29 |
+
# PDF page extraction (for creating mini-PDFs with only table pages)
|
| 30 |
+
pypdf>=4.0.0
|
| 31 |
+
|
| 32 |
# HuggingFace Hub for model downloads
|
| 33 |
huggingface-hub>=0.25.0
|