Spaces:

outcomelabs
/

docling-parser

Running on T4

App Files Files Community

sidoutcome committed on Mar 13

Commit

c67903b

1 Parent(s): 3f46c5e

feat: v3.0.0 VLM-first hybrid architecture — GPU VLM on all pages, Docling TableFormer only on table pages

Browse files

Files changed (2) hide show

app.py +237 -98
requirements.txt +3 -0

app.py CHANGED Viewed

@@ -1,18 +1,23 @@
 """
-Docling VLM Parser API v2.0.0
-A FastAPI service that uses a HYBRID two-pass approach for document parsing:
-  Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR) for document structure
-  Pass 2: Qwen3-VL-30B-A3B via vLLM for enhanced text recognition
-  Merge:  TableFormer tables preserved, VLM text replaces RapidOCR text
 Features:
-- GPU-accelerated parsing with CUDA support
-- TableFormer ACCURATE for table structure detection
-- Qwen3-VL via vLLM for superior OCR accuracy
-- OpenCV image preprocessing (deskew, denoise, CLAHE)
 - Image extraction with configurable resolution
-- Automatic page chunking for large PDFs
 """
 import asyncio
@@ -386,6 +391,60 @@ def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list) -> str:
     return result
 # ---------------------------------------------------------------------------
 # PDF to Page Images
 # ---------------------------------------------------------------------------
@@ -503,56 +562,43 @@ def _convert_document(
     end_page: Optional[int] = None,
 ) -> tuple:
     """
-    Hybrid conversion: TableFormer for tables + Qwen3-VL for text.
-    Pass 1: Docling Standard Pipeline -> document structure + tables
-    Pass 2: VLM OCR -> enhanced text recognition per page
-    Merge: TableFormer tables + VLM text
     Returns: (markdown_content, json_content, pages_processed, image_count)
     """
-    # PASS 1: Docling Standard Pipeline (structure + tables)
-    logger.info(
-        f"[{request_id}] Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR)"
-    )
-    converter = _get_converter()
-    start_time = time.time()
-    result = converter.convert(input_path)
-    doc = result.document
-    if doc is None:
-        raise ValueError(
-            f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
-        )
-    pass1_time = time.time() - start_time
-    logger.info(f"[{request_id}] Pass 1 completed in {pass1_time:.2f}s")
-    # Extract TableFormer tables (keyed by page number)
-    tables_by_page = _extract_table_markdowns(doc)
-    total_tables = sum(len(v) for v in tables_by_page.values())
-    logger.info(f"[{request_id}] TableFormer detected {total_tables} tables")
-    # PASS 2: VLM OCR (enhanced text per page)
-    logger.info(f"[{request_id}] Pass 2: VLM OCR via Qwen3-VL ({VLM_MODEL})")
-    # Get page images for VLM
     page_images = _pdf_to_page_images(input_path, start_page, end_page)
     if not page_images:
-        # Fallback: use Docling's markdown directly if no page images
-        logger.warning(f"[{request_id}] No page images available, using Docling output only")
-        markdown_content = doc.export_to_markdown()
-        pages_processed = len(
-            set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
         )
-        return markdown_content, None, pages_processed, 0
     vlm_page_texts: dict[int, Optional[str]] = {}
     vlm_start = time.time()
-    # Process pages concurrently — vLLM supports batching via --max-num-seqs
     max_workers = min(2, len(page_images))
-    logger.info(f"[{request_id}] Sending {len(page_images)} pages to VLM ({max_workers} concurrent)")
     with ThreadPoolExecutor(max_workers=max_workers) as pool:
         futures = {
@@ -569,32 +615,99 @@ def _convert_document(
                 )
             except Exception as e:
                 logger.warning(
-                    f"[{request_id}] VLM failed on page {page_no + 1}: {e}, using Docling text"
                 )
                 vlm_page_texts[page_no] = None
     vlm_time = time.time() - vlm_start
     logger.info(
-        f"[{request_id}] Pass 2 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)"
     )
-    # MERGE: VLM text + TableFormer tables
-    logger.info(f"[{request_id}] Merging VLM text with TableFormer tables")
-    md_parts: list[str] = []
-    pages_seen: set[int] = set()
     image_count = 0
     image_dir = output_dir / "images"
-    if include_images:
-        image_dir.mkdir(parents=True, exist_ok=True)
-    # Pre-build page-to-elements index (avoids O(N^2) on VLM fallback)
-    elements_by_page: dict[int, list] = {}
-    for element, _ in doc.iterate_items():
-        if element.prov:
-            pg = element.prov[0].page_no
-            elements_by_page.setdefault(pg, []).append(element)
     for page_no in sorted(vlm_page_texts.keys()):
         pages_seen.add(page_no)
@@ -603,22 +716,60 @@ def _convert_document(
         vlm_text = vlm_page_texts[page_no]
         if vlm_text is None:
-            # VLM failed -- fallback to Docling's text for this page
-            for element in elements_by_page.get(page_no, []):
-                try:
-                    md_parts.append(element.export_to_markdown(doc=doc))
-                except Exception:
-                    text = getattr(element, "text", "").strip()
-                    if text:
-                        md_parts.append(text + "\n\n")
         else:
-            # Merge VLM text with TableFormer tables for this page
             page_tables = tables_by_page.get(page_no, [])
-            merged = _merge_vlm_with_tables(vlm_text, page_tables)
-            md_parts.append(merged)
-    # Handle images from Docling if requested
     if include_images:
         for element, _ in doc.iterate_items():
             if isinstance(element, PictureItem):
                 if element.image and element.image.pil_image:
@@ -630,21 +781,8 @@ def _convert_document(
                     try:
                         element.image.pil_image.save(image_path, format="PNG")
                         image_count += 1
-                    except Exception as e:
-                        logger.warning(
-                            f"[{request_id}] Failed to save image {image_name}: {e}"
-                        )
-    markdown_content = "".join(md_parts)
-    pages_processed = len(pages_seen)
-    total_time = pass1_time + vlm_time
-    logger.info(
-        f"[{request_id}] Hybrid conversion complete: {pages_processed} pages, "
-        f"{total_tables} tables, {total_time:.2f}s total"
-    )
-    if pages_processed > 0:
-        logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
     return markdown_content, None, pages_processed, image_count
@@ -688,7 +826,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
 async def lifespan(app: FastAPI):
     """Startup: initialize Docling converter and check vLLM."""
     logger.info("=" * 60)
-    logger.info("Starting Docling VLM Parser API v2.0.0...")
     device = _get_device()
     logger.info(f"Device: {device}")
@@ -725,7 +863,7 @@ async def lifespan(app: FastAPI):
         logger.warning(f"Failed to pre-load Docling models: {e}")
     logger.info("=" * 60)
-    logger.info("Docling VLM Parser API ready (Hybrid: TableFormer + Qwen3-VL)")
     logger.info("=" * 60)
     yield
     logger.info("Shutting down Docling VLM Parser API...")
@@ -737,8 +875,8 @@ async def lifespan(app: FastAPI):
 app = FastAPI(
     title="Docling VLM Parser API",
-    description="Hybrid document parser: TableFormer tables + Qwen3-VL OCR via vLLM",
-    version="2.0.0",
     lifespan=lifespan,
 )
@@ -767,7 +905,7 @@ async def health_check() -> HealthResponse:
     return HealthResponse(
         status="healthy",
-        version="2.0.0",
         device=device,
         gpu_name=None,  # Don't leak GPU details on unauthenticated endpoint
         vlm_model="active",  # Confirm VLM is configured without leaking model name
@@ -789,10 +927,11 @@ async def parse_document(
     """
     Parse a document file (PDF or image) and return extracted content.
-    Uses a hybrid two-pass approach:
-      Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR)
-      Pass 2: Qwen3-VL via vLLM for enhanced text recognition
-      Merge: TableFormer tables preserved, VLM text replaces RapidOCR text
     Supports:
     - PDF files (.pdf)
@@ -810,7 +949,7 @@ async def parse_document(
     if output_format not in ("markdown",):
         raise HTTPException(
             status_code=400,
-            detail="Only 'markdown' output_format is supported in v2.0.0",
         )
     # Validate file size
@@ -934,7 +1073,7 @@ async def parse_document_from_url(
     if request.output_format not in ("markdown",):
         raise HTTPException(
             status_code=400,
-            detail="Only 'markdown' output_format is supported in v2.0.0",
         )
     # Validate URL

 """
+Docling VLM Parser API v3.0.0
+A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
+  Pass 1 (GPU):  Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
+  Detect:        Identify pages with tables from VLM markdown output
+  Pass 2 (CPU):  Docling TableFormer ONLY on table pages (targeted, minimal)
+  Merge:         VLM text for all pages + TableFormer tables where detected
+Key insight: the previous architecture ran Docling's full CPU pipeline (DocLayNet +
+TableFormer + RapidOCR) on ALL pages, taking 60-565s. Most of that time was wasted
+on non-table pages. Now we run the fast GPU VLM first, detect which pages have tables,
+and only send those pages (as a mini-PDF) to Docling for table structure extraction.
 Features:
+- VLM-first: GPU-accelerated OCR on all pages via Qwen3-VL (concurrent)
+- Targeted TableFormer: CPU pipeline runs only on pages with tables
+- pypdf mini-PDF extraction for page-level Docling targeting
+- OpenCV image preprocessing (denoise, CLAHE contrast enhancement)
 - Image extraction with configurable resolution
 """
 import asyncio
     return result
+# ---------------------------------------------------------------------------
+# Table Detection from VLM Output
+# ---------------------------------------------------------------------------
+def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
+    """Detect pages containing tables from VLM markdown output.
+
+    A page counts as a table page when any of its lines is a markdown table
+    delimiter row such as ``| --- | :---: |``: every cell holds at least one
+    dash (optionally flanked by alignment colons) and the line contains at
+    least one pipe. Leading indentation, a missing leading/trailing pipe and
+    CRLF line endings are tolerated.
+
+    Args:
+        vlm_page_texts: Mapping of 0-indexed page number to the VLM markdown
+            for that page, or None when the VLM failed on the page.
+
+    Returns:
+        Set of 0-indexed page numbers whose text contains a delimiter row.
+    """
+    # Each cell must contain a dash so that rows of empty cells ("|   |   |")
+    # or stray pipes are not mistaken for a delimiter row. Only spaces/tabs
+    # are allowed as padding: "\s" would also match newlines and let a single
+    # match run across several lines.
+    cell = r"[ \t]*:?-+:?[ \t]*"
+    separator_pattern = re.compile(
+        # The lookahead demands a pipe somewhere on the line, which keeps a
+        # plain horizontal rule ("---") from being treated as a table.
+        rf"^[ \t]*(?=[^\n|]*\|)\|?{cell}(?:\|{cell})*\|?[ \t]*\r?$",
+        re.MULTILINE,
+    )
+    return {
+        page_no
+        for page_no, text in vlm_page_texts.items()
+        if text and separator_pattern.search(text)
+    }
+def _extract_pages_to_pdf(
+    input_path: Path, page_numbers: list[int], request_id: str
+) -> tuple[Path, dict[int, int]]:
+    """Extract specific pages from a PDF into a mini-PDF.
+
+    Page numbers that fall outside the source document (negative, or past the
+    last page) are skipped and reported in a warning log entry.
+
+    Args:
+        input_path: Path to the original PDF
+        page_numbers: 0-indexed page numbers to extract, in the order they
+            should appear in the mini-PDF
+        request_id: Used in log messages and in the mini-PDF file name
+
+    Returns:
+        (mini_pdf_path, page_map) where page_map maps Docling 1-indexed pages
+        in the mini-PDF back to 0-indexed original page numbers. The mini-PDF
+        is written next to input_path and is not deleted by this function.
+    """
+    from pypdf import PdfReader, PdfWriter
+
+    reader = PdfReader(str(input_path))
+    writer = PdfWriter()
+    total_pages = len(reader.pages)
+    # page_map: {docling_1indexed_mini_page: original_0indexed_page}
+    page_map: dict[int, int] = {}
+    skipped: list[int] = []
+    for orig_page in page_numbers:
+        # Reject negatives explicitly: reader.pages[-1] would silently wrap
+        # around to the last page instead of failing.
+        if not 0 <= orig_page < total_pages:
+            skipped.append(orig_page)
+            continue
+        writer.add_page(reader.pages[orig_page])
+        # Key by the number of pages actually written, not by the position in
+        # page_numbers, so a skipped entry cannot shift later mappings.
+        page_map[len(page_map) + 1] = orig_page  # Docling uses 1-indexed pages
+    if skipped:
+        logger.warning(
+            f"[{request_id}] Skipped out-of-range pages {skipped} "
+            f"(document has {total_pages} pages)"
+        )
+    mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
+    with open(mini_pdf_path, "wb") as f:
+        writer.write(f)
+    logger.info(f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original")
+    return mini_pdf_path, page_map
 # ---------------------------------------------------------------------------
 # PDF to Page Images
 # ---------------------------------------------------------------------------
     end_page: Optional[int] = None,
 ) -> tuple:
     """
+    VLM-first hybrid conversion: Qwen3-VL for text + targeted TableFormer for tables.
+    Pass 1 (GPU):  VLM OCR on ALL pages — fast concurrent processing
+    Detect:        Identify pages with tables from VLM markdown output
+    Pass 2 (CPU):  Docling TableFormer ONLY on table pages — minimal CPU work
+    Merge:         VLM text + TableFormer tables
     Returns: (markdown_content, json_content, pages_processed, image_count)
     """
+    total_start = time.time()
+    # --- RENDER: Convert PDF pages to images ---
+    render_start = time.time()
     page_images = _pdf_to_page_images(input_path, start_page, end_page)
+    render_time = time.time() - render_start
+    logger.info(
+        f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s"
+    )
     if not page_images:
+        logger.warning(
+            f"[{request_id}] No page images available, falling back to full Docling pipeline"
+        )
+        return _convert_document_full_docling(
+            input_path, output_dir, images_scale, include_images, request_id
         )
+    # --- PASS 1 (GPU): VLM OCR on all pages ---
+    logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
     vlm_page_texts: dict[int, Optional[str]] = {}
     vlm_start = time.time()
     max_workers = min(2, len(page_images))
+    logger.info(
+        f"[{request_id}] Sending {len(page_images)} pages to VLM ({max_workers} concurrent)"
+    )
     with ThreadPoolExecutor(max_workers=max_workers) as pool:
         futures = {
                 )
             except Exception as e:
                 logger.warning(
+                    f"[{request_id}] VLM failed on page {page_no + 1}: {e}"
                 )
                 vlm_page_texts[page_no] = None
     vlm_time = time.time() - vlm_start
     logger.info(
+        f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)"
     )
+    # --- DETECT: Find pages with tables in VLM output ---
+    table_pages = _detect_table_pages(vlm_page_texts)
+    if table_pages:
+        logger.info(
+            f"[{request_id}] Tables detected on {len(table_pages)} pages: "
+            f"{sorted(p + 1 for p in table_pages)}"
+        )
+    else:
+        logger.info(f"[{request_id}] No tables detected — skipping Docling entirely")
+    # --- PASS 2 (CPU): Docling TableFormer ONLY on table pages ---
+    tables_by_page: dict[int, list[str]] = {}
+    pass2_time = 0.0
     image_count = 0
     image_dir = output_dir / "images"
+    if table_pages:
+        pass2_start = time.time()
+        logger.info(
+            f"[{request_id}] Pass 2: Docling TableFormer on {len(table_pages)} table pages"
+        )
+        try:
+            # Create mini-PDF containing only table pages
+            mini_pdf_path, page_map = _extract_pages_to_pdf(
+                input_path, sorted(table_pages), request_id
+            )
+            # Run Docling on mini-PDF (full pipeline for accurate table cell text)
+            converter = _get_converter()
+            result = converter.convert(mini_pdf_path)
+            doc = result.document
+            if doc:
+                # Extract tables, mapping mini-PDF pages back to original page numbers
+                for element, _ in doc.iterate_items():
+                    if isinstance(element, TableItem):
+                        mini_page = element.prov[0].page_no if element.prov else -1
+                        orig_page = page_map.get(mini_page, mini_page)
+                        table_md = element.export_to_markdown(doc=doc)
+                        tables_by_page.setdefault(orig_page, []).append(table_md)
+                # Extract images from Docling if requested
+                if include_images:
+                    image_dir.mkdir(parents=True, exist_ok=True)
+                    for element, _ in doc.iterate_items():
+                        if isinstance(element, PictureItem):
+                            if element.image and element.image.pil_image:
+                                pg = element.prov[0].page_no if element.prov else 0
+                                orig_pg = page_map.get(pg, pg)
+                                image_id = element.self_ref.split("/")[-1]
+                                image_name = f"page_{orig_pg + 1}_{image_id}.png"
+                                image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
+                                image_path = image_dir / image_name
+                                try:
+                                    element.image.pil_image.save(image_path, format="PNG")
+                                    image_count += 1
+                                except Exception as e:
+                                    logger.warning(
+                                        f"[{request_id}] Failed to save image: {e}"
+                                    )
+            # Clean up mini-PDF
+            try:
+                os.unlink(mini_pdf_path)
+            except OSError:
+                pass
+            pass2_time = time.time() - pass2_start
+            total_tables = sum(len(v) for v in tables_by_page.values())
+            logger.info(
+                f"[{request_id}] Pass 2 completed in {pass2_time:.2f}s — "
+                f"{total_tables} TableFormer tables extracted"
+            )
+        except Exception as e:
+            pass2_time = time.time() - pass2_start
+            logger.warning(
+                f"[{request_id}] TableFormer pass failed ({e}), using VLM tables only"
+            )
+    # --- MERGE: VLM text + TableFormer tables ---
+    md_parts: list[str] = []
+    pages_seen: set[int] = set()
     for page_no in sorted(vlm_page_texts.keys()):
         pages_seen.add(page_no)
         vlm_text = vlm_page_texts[page_no]
         if vlm_text is None:
+            md_parts.append(f"<!-- VLM failed on this page -->\n")
         else:
             page_tables = tables_by_page.get(page_no, [])
+            if page_tables:
+                merged = _merge_vlm_with_tables(vlm_text, page_tables)
+                md_parts.append(merged)
+            else:
+                md_parts.append(vlm_text)
+    markdown_content = "".join(md_parts)
+    pages_processed = len(pages_seen)
+    total_time = time.time() - total_start
+    logger.info(
+        f"[{request_id}] VLM-first conversion complete: {pages_processed} pages — "
+        f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
+        f"TableFormer {pass2_time:.1f}s = {total_time:.2f}s total"
+    )
+    if pages_processed > 0:
+        logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
+    return markdown_content, None, pages_processed, image_count
+def _convert_document_full_docling(
+    input_path: Path,
+    output_dir: Path,
+    images_scale: float,
+    include_images: bool,
+    request_id: str,
+) -> tuple:
+    """Fallback: Full Docling pipeline when page images are unavailable."""
+    logger.info(f"[{request_id}] Running full Docling pipeline (fallback mode)")
+    converter = _get_converter()
+    start_time = time.time()
+    result = converter.convert(input_path)
+    doc = result.document
+    if doc is None:
+        raise ValueError("Docling failed to parse document")
+    elapsed = time.time() - start_time
+    logger.info(f"[{request_id}] Docling completed in {elapsed:.2f}s")
+    markdown_content = doc.export_to_markdown()
+    pages_processed = len(
+        set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
+    )
+    image_count = 0
     if include_images:
+        image_dir = output_dir / "images"
+        image_dir.mkdir(parents=True, exist_ok=True)
         for element, _ in doc.iterate_items():
             if isinstance(element, PictureItem):
                 if element.image and element.image.pil_image:
                     try:
                         element.image.pil_image.save(image_path, format="PNG")
                         image_count += 1
+                    except Exception:
+                        pass
     return markdown_content, None, pages_processed, image_count
 async def lifespan(app: FastAPI):
     """Startup: initialize Docling converter and check vLLM."""
     logger.info("=" * 60)
+    logger.info("Starting Docling VLM Parser API v3.0.0...")
     device = _get_device()
     logger.info(f"Device: {device}")
         logger.warning(f"Failed to pre-load Docling models: {e}")
     logger.info("=" * 60)
+    logger.info("Docling VLM Parser API ready (VLM-first: Qwen3-VL + targeted TableFormer)")
     logger.info("=" * 60)
     yield
     logger.info("Shutting down Docling VLM Parser API...")
 app = FastAPI(
     title="Docling VLM Parser API",
+    description="VLM-first hybrid parser: Qwen3-VL OCR (GPU) + targeted TableFormer (CPU)",
+    version="3.0.0",
     lifespan=lifespan,
 )
     return HealthResponse(
         status="healthy",
+        version="3.0.0",
         device=device,
         gpu_name=None,  # Don't leak GPU details on unauthenticated endpoint
         vlm_model="active",  # Confirm VLM is configured without leaking model name
     """
     Parse a document file (PDF or image) and return extracted content.
+    Uses a VLM-first hybrid approach:
+      Pass 1 (GPU): Qwen3-VL via vLLM for OCR on all pages (concurrent)
+      Detect: Identify pages with tables from VLM output
+      Pass 2 (CPU): Docling TableFormer only on table pages
+      Merge: VLM text + TableFormer tables
     Supports:
     - PDF files (.pdf)
     if output_format not in ("markdown",):
         raise HTTPException(
             status_code=400,
+            detail="Only 'markdown' output_format is supported in v3.0.0",
         )
     # Validate file size
     if request.output_format not in ("markdown",):
         raise HTTPException(
             status_code=400,
+            detail="Only 'markdown' output_format is supported in v3.0.0",
         )
     # Validate URL

requirements.txt CHANGED Viewed

@@ -26,5 +26,8 @@ onnxruntime>=1.19.0
 # PDF to image conversion for VLM OCR pass
 pdf2image>=1.17.0
 # HuggingFace Hub for model downloads
 huggingface-hub>=0.25.0

 # PDF to image conversion for VLM OCR pass
 pdf2image>=1.17.0
+# PDF page extraction (for creating mini-PDFs with only table pages)
+pypdf>=4.0.0
 # HuggingFace Hub for model downloads
 huggingface-hub>=0.25.0