sidoutcome committed on
Commit
c67903b
·
1 Parent(s): 3f46c5e

feat: v3.0.0 VLM-first hybrid architecture — GPU VLM on all pages, Docling TableFormer only on table pages

Browse files
Files changed (2) hide show
  1. app.py +237 -98
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,18 +1,23 @@
1
  """
2
- Docling VLM Parser API v2.0.0
3
 
4
- A FastAPI service that uses a HYBRID two-pass approach for document parsing:
5
- Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR) for document structure
6
- Pass 2: Qwen3-VL-30B-A3B via vLLM for enhanced text recognition
7
- Merge: TableFormer tables preserved, VLM text replaces RapidOCR text
 
 
 
 
 
 
8
 
9
  Features:
10
- - GPU-accelerated parsing with CUDA support
11
- - TableFormer ACCURATE for table structure detection
12
- - Qwen3-VL via vLLM for superior OCR accuracy
13
- - OpenCV image preprocessing (deskew, denoise, CLAHE)
14
  - Image extraction with configurable resolution
15
- - Automatic page chunking for large PDFs
16
  """
17
 
18
  import asyncio
@@ -386,6 +391,60 @@ def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list) -> str:
386
  return result
387
 
388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  # ---------------------------------------------------------------------------
390
  # PDF to Page Images
391
  # ---------------------------------------------------------------------------
@@ -503,56 +562,43 @@ def _convert_document(
503
  end_page: Optional[int] = None,
504
  ) -> tuple:
505
  """
506
- Hybrid conversion: TableFormer for tables + Qwen3-VL for text.
507
 
508
- Pass 1: Docling Standard Pipeline -> document structure + tables
509
- Pass 2: VLM OCR -> enhanced text recognition per page
510
- Merge: TableFormer tables + VLM text
 
511
 
512
  Returns: (markdown_content, json_content, pages_processed, image_count)
513
  """
514
- # PASS 1: Docling Standard Pipeline (structure + tables)
515
- logger.info(
516
- f"[{request_id}] Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR)"
517
- )
518
- converter = _get_converter()
519
-
520
- start_time = time.time()
521
- result = converter.convert(input_path)
522
- doc = result.document
523
- if doc is None:
524
- raise ValueError(
525
- f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
526
- )
527
- pass1_time = time.time() - start_time
528
- logger.info(f"[{request_id}] Pass 1 completed in {pass1_time:.2f}s")
529
 
530
- # Extract TableFormer tables (keyed by page number)
531
- tables_by_page = _extract_table_markdowns(doc)
532
- total_tables = sum(len(v) for v in tables_by_page.values())
533
- logger.info(f"[{request_id}] TableFormer detected {total_tables} tables")
534
-
535
- # PASS 2: VLM OCR (enhanced text per page)
536
- logger.info(f"[{request_id}] Pass 2: VLM OCR via Qwen3-VL ({VLM_MODEL})")
537
-
538
- # Get page images for VLM
539
  page_images = _pdf_to_page_images(input_path, start_page, end_page)
 
 
 
 
540
 
541
  if not page_images:
542
- # Fallback: use Docling's markdown directly if no page images
543
- logger.warning(f"[{request_id}] No page images available, using Docling output only")
544
- markdown_content = doc.export_to_markdown()
545
- pages_processed = len(
546
- set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
547
  )
548
- return markdown_content, None, pages_processed, 0
 
 
549
 
550
  vlm_page_texts: dict[int, Optional[str]] = {}
551
  vlm_start = time.time()
552
 
553
- # Process pages concurrently — vLLM supports batching via --max-num-seqs
554
  max_workers = min(2, len(page_images))
555
- logger.info(f"[{request_id}] Sending {len(page_images)} pages to VLM ({max_workers} concurrent)")
 
 
556
 
557
  with ThreadPoolExecutor(max_workers=max_workers) as pool:
558
  futures = {
@@ -569,32 +615,99 @@ def _convert_document(
569
  )
570
  except Exception as e:
571
  logger.warning(
572
- f"[{request_id}] VLM failed on page {page_no + 1}: {e}, using Docling text"
573
  )
574
  vlm_page_texts[page_no] = None
575
 
576
  vlm_time = time.time() - vlm_start
577
  logger.info(
578
- f"[{request_id}] Pass 2 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)"
579
  )
580
 
581
- # MERGE: VLM text + TableFormer tables
582
- logger.info(f"[{request_id}] Merging VLM text with TableFormer tables")
 
 
 
 
 
 
 
583
 
584
- md_parts: list[str] = []
585
- pages_seen: set[int] = set()
 
586
  image_count = 0
587
  image_dir = output_dir / "images"
588
 
589
- if include_images:
590
- image_dir.mkdir(parents=True, exist_ok=True)
 
 
 
591
 
592
- # Pre-build page-to-elements index (avoids O(N^2) on VLM fallback)
593
- elements_by_page: dict[int, list] = {}
594
- for element, _ in doc.iterate_items():
595
- if element.prov:
596
- pg = element.prov[0].page_no
597
- elements_by_page.setdefault(pg, []).append(element)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
 
599
  for page_no in sorted(vlm_page_texts.keys()):
600
  pages_seen.add(page_no)
@@ -603,22 +716,60 @@ def _convert_document(
603
  vlm_text = vlm_page_texts[page_no]
604
 
605
  if vlm_text is None:
606
- # VLM failed -- fallback to Docling's text for this page
607
- for element in elements_by_page.get(page_no, []):
608
- try:
609
- md_parts.append(element.export_to_markdown(doc=doc))
610
- except Exception:
611
- text = getattr(element, "text", "").strip()
612
- if text:
613
- md_parts.append(text + "\n\n")
614
  else:
615
- # Merge VLM text with TableFormer tables for this page
616
  page_tables = tables_by_page.get(page_no, [])
617
- merged = _merge_vlm_with_tables(vlm_text, page_tables)
618
- md_parts.append(merged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
 
620
- # Handle images from Docling if requested
 
 
 
 
 
 
 
 
 
 
 
621
  if include_images:
 
 
622
  for element, _ in doc.iterate_items():
623
  if isinstance(element, PictureItem):
624
  if element.image and element.image.pil_image:
@@ -630,21 +781,8 @@ def _convert_document(
630
  try:
631
  element.image.pil_image.save(image_path, format="PNG")
632
  image_count += 1
633
- except Exception as e:
634
- logger.warning(
635
- f"[{request_id}] Failed to save image {image_name}: {e}"
636
- )
637
-
638
- markdown_content = "".join(md_parts)
639
- pages_processed = len(pages_seen)
640
-
641
- total_time = pass1_time + vlm_time
642
- logger.info(
643
- f"[{request_id}] Hybrid conversion complete: {pages_processed} pages, "
644
- f"{total_tables} tables, {total_time:.2f}s total"
645
- )
646
- if pages_processed > 0:
647
- logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
648
 
649
  return markdown_content, None, pages_processed, image_count
650
 
@@ -688,7 +826,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
688
  async def lifespan(app: FastAPI):
689
  """Startup: initialize Docling converter and check vLLM."""
690
  logger.info("=" * 60)
691
- logger.info("Starting Docling VLM Parser API v2.0.0...")
692
 
693
  device = _get_device()
694
  logger.info(f"Device: {device}")
@@ -725,7 +863,7 @@ async def lifespan(app: FastAPI):
725
  logger.warning(f"Failed to pre-load Docling models: {e}")
726
 
727
  logger.info("=" * 60)
728
- logger.info("Docling VLM Parser API ready (Hybrid: TableFormer + Qwen3-VL)")
729
  logger.info("=" * 60)
730
  yield
731
  logger.info("Shutting down Docling VLM Parser API...")
@@ -737,8 +875,8 @@ async def lifespan(app: FastAPI):
737
 
738
  app = FastAPI(
739
  title="Docling VLM Parser API",
740
- description="Hybrid document parser: TableFormer tables + Qwen3-VL OCR via vLLM",
741
- version="2.0.0",
742
  lifespan=lifespan,
743
  )
744
 
@@ -767,7 +905,7 @@ async def health_check() -> HealthResponse:
767
 
768
  return HealthResponse(
769
  status="healthy",
770
- version="2.0.0",
771
  device=device,
772
  gpu_name=None, # Don't leak GPU details on unauthenticated endpoint
773
  vlm_model="active", # Confirm VLM is configured without leaking model name
@@ -789,10 +927,11 @@ async def parse_document(
789
  """
790
  Parse a document file (PDF or image) and return extracted content.
791
 
792
- Uses a hybrid two-pass approach:
793
- Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR)
794
- Pass 2: Qwen3-VL via vLLM for enhanced text recognition
795
- Merge: TableFormer tables preserved, VLM text replaces RapidOCR text
 
796
 
797
  Supports:
798
  - PDF files (.pdf)
@@ -810,7 +949,7 @@ async def parse_document(
810
  if output_format not in ("markdown",):
811
  raise HTTPException(
812
  status_code=400,
813
- detail="Only 'markdown' output_format is supported in v2.0.0",
814
  )
815
 
816
  # Validate file size
@@ -934,7 +1073,7 @@ async def parse_document_from_url(
934
  if request.output_format not in ("markdown",):
935
  raise HTTPException(
936
  status_code=400,
937
- detail="Only 'markdown' output_format is supported in v2.0.0",
938
  )
939
 
940
  # Validate URL
 
1
  """
2
+ Docling VLM Parser API v3.0.0
3
 
4
+ A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
5
+ Pass 1 (GPU): Qwen3-VL via vLLM concurrent OCR on ALL pages (fast)
6
+ Detect: Identify pages with tables from VLM markdown output
7
+ Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
8
+ Merge: VLM text for all pages + TableFormer tables where detected
9
+
10
+ Key insight: the previous architecture ran Docling's full CPU pipeline (DocLayNet +
11
+ TableFormer + RapidOCR) on ALL pages, taking 60-565s. Most of that time was wasted
12
+ on non-table pages. Now we run the fast GPU VLM first, detect which pages have tables,
13
+ and only send those pages (as a mini-PDF) to Docling for table structure extraction.
14
 
15
  Features:
16
+ - VLM-first: GPU-accelerated OCR on all pages via Qwen3-VL (concurrent)
17
+ - Targeted TableFormer: CPU pipeline runs only on pages with tables
18
+ - pypdf mini-PDF extraction for page-level Docling targeting
19
+ - OpenCV image preprocessing (denoise, CLAHE contrast enhancement)
20
  - Image extraction with configurable resolution
 
21
  """
22
 
23
  import asyncio
 
391
  return result
392
 
393
 
394
+ # ---------------------------------------------------------------------------
395
+ # Table Detection from VLM Output
396
+ # ---------------------------------------------------------------------------
397
+
398
+
399
+ def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
400
+ """Detect pages containing tables from VLM markdown output.
401
+
402
+ Looks for markdown table separator rows (e.g., | --- | --- |) which are
403
+ a reliable signal of table content. Returns set of 0-indexed page numbers.
404
+ """
405
+ # Markdown table separator: | --- | --- | (with optional colons for alignment)
406
+ separator_pattern = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE)
407
+ table_pages: set[int] = set()
408
+ for page_no, text in vlm_page_texts.items():
409
+ if text and separator_pattern.search(text):
410
+ table_pages.add(page_no)
411
+ return table_pages
412
+
413
+
414
def _extract_pages_to_pdf(
    input_path: Path, page_numbers: list[int], request_id: str
) -> tuple[Path, dict[int, int]]:
    """Extract specific pages from a PDF into a mini-PDF.

    The mini-PDF is written next to ``input_path`` as
    ``table_pages_<request_id>.pdf``; the caller is responsible for deleting it.

    Args:
        input_path: Path to the original PDF
        page_numbers: 0-indexed page numbers to extract, in the order they
            should appear in the mini-PDF. Numbers outside the document are
            skipped with a warning.
        request_id: For logging and for naming the mini-PDF

    Returns:
        (mini_pdf_path, page_map) where page_map maps Docling 1-indexed pages
        in the mini-PDF back to 0-indexed original page numbers.
    """
    from pypdf import PdfReader, PdfWriter

    reader = PdfReader(str(input_path))
    writer = PdfWriter()
    total_pages = len(reader.pages)

    # page_map: {docling_1indexed_mini_page: original_0indexed_page}
    page_map: dict[int, int] = {}
    for orig_page in page_numbers:
        # Reject negatives as well: reader.pages[-1] would silently pick a
        # page from the end of the document.
        if not 0 <= orig_page < total_pages:
            logger.warning(
                f"[{request_id}] Skipping out-of-range page {orig_page} "
                f"(document has {total_pages} pages)"
            )
            continue
        writer.add_page(reader.pages[orig_page])
        # Key by the number of pages actually written, not by the position in
        # page_numbers, so a skipped page cannot shift later mappings.
        page_map[len(page_map) + 1] = orig_page  # Docling uses 1-indexed pages

    mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
    with open(mini_pdf_path, "wb") as f:
        writer.write(f)

    logger.info(f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original")
    return mini_pdf_path, page_map
446
+
447
+
448
  # ---------------------------------------------------------------------------
449
  # PDF to Page Images
450
  # ---------------------------------------------------------------------------
 
562
  end_page: Optional[int] = None,
563
  ) -> tuple:
564
  """
565
+ VLM-first hybrid conversion: Qwen3-VL for text + targeted TableFormer for tables.
566
 
567
+ Pass 1 (GPU): VLM OCR on ALL pages — fast concurrent processing
568
+ Detect: Identify pages with tables from VLM markdown output
569
+ Pass 2 (CPU): Docling TableFormer ONLY on table pages — minimal CPU work
570
+ Merge: VLM text + TableFormer tables
571
 
572
  Returns: (markdown_content, json_content, pages_processed, image_count)
573
  """
574
+ total_start = time.time()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
+ # --- RENDER: Convert PDF pages to images ---
577
+ render_start = time.time()
 
 
 
 
 
 
 
578
  page_images = _pdf_to_page_images(input_path, start_page, end_page)
579
+ render_time = time.time() - render_start
580
+ logger.info(
581
+ f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s"
582
+ )
583
 
584
  if not page_images:
585
+ logger.warning(
586
+ f"[{request_id}] No page images available, falling back to full Docling pipeline"
587
+ )
588
+ return _convert_document_full_docling(
589
+ input_path, output_dir, images_scale, include_images, request_id
590
  )
591
+
592
+ # --- PASS 1 (GPU): VLM OCR on all pages ---
593
+ logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
594
 
595
  vlm_page_texts: dict[int, Optional[str]] = {}
596
  vlm_start = time.time()
597
 
 
598
  max_workers = min(2, len(page_images))
599
+ logger.info(
600
+ f"[{request_id}] Sending {len(page_images)} pages to VLM ({max_workers} concurrent)"
601
+ )
602
 
603
  with ThreadPoolExecutor(max_workers=max_workers) as pool:
604
  futures = {
 
615
  )
616
  except Exception as e:
617
  logger.warning(
618
+ f"[{request_id}] VLM failed on page {page_no + 1}: {e}"
619
  )
620
  vlm_page_texts[page_no] = None
621
 
622
  vlm_time = time.time() - vlm_start
623
  logger.info(
624
+ f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)"
625
  )
626
 
627
+ # --- DETECT: Find pages with tables in VLM output ---
628
+ table_pages = _detect_table_pages(vlm_page_texts)
629
+ if table_pages:
630
+ logger.info(
631
+ f"[{request_id}] Tables detected on {len(table_pages)} pages: "
632
+ f"{sorted(p + 1 for p in table_pages)}"
633
+ )
634
+ else:
635
+ logger.info(f"[{request_id}] No tables detected — skipping Docling entirely")
636
 
637
+ # --- PASS 2 (CPU): Docling TableFormer ONLY on table pages ---
638
+ tables_by_page: dict[int, list[str]] = {}
639
+ pass2_time = 0.0
640
  image_count = 0
641
  image_dir = output_dir / "images"
642
 
643
+ if table_pages:
644
+ pass2_start = time.time()
645
+ logger.info(
646
+ f"[{request_id}] Pass 2: Docling TableFormer on {len(table_pages)} table pages"
647
+ )
648
 
649
+ try:
650
+ # Create mini-PDF containing only table pages
651
+ mini_pdf_path, page_map = _extract_pages_to_pdf(
652
+ input_path, sorted(table_pages), request_id
653
+ )
654
+
655
+ # Run Docling on mini-PDF (full pipeline for accurate table cell text)
656
+ converter = _get_converter()
657
+ result = converter.convert(mini_pdf_path)
658
+ doc = result.document
659
+
660
+ if doc:
661
+ # Extract tables, mapping mini-PDF pages back to original page numbers
662
+ for element, _ in doc.iterate_items():
663
+ if isinstance(element, TableItem):
664
+ mini_page = element.prov[0].page_no if element.prov else -1
665
+ orig_page = page_map.get(mini_page, mini_page)
666
+ table_md = element.export_to_markdown(doc=doc)
667
+ tables_by_page.setdefault(orig_page, []).append(table_md)
668
+
669
+ # Extract images from Docling if requested
670
+ if include_images:
671
+ image_dir.mkdir(parents=True, exist_ok=True)
672
+ for element, _ in doc.iterate_items():
673
+ if isinstance(element, PictureItem):
674
+ if element.image and element.image.pil_image:
675
+ pg = element.prov[0].page_no if element.prov else 0
676
+ orig_pg = page_map.get(pg, pg)
677
+ image_id = element.self_ref.split("/")[-1]
678
+ image_name = f"page_{orig_pg + 1}_{image_id}.png"
679
+ image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
680
+ image_path = image_dir / image_name
681
+ try:
682
+ element.image.pil_image.save(image_path, format="PNG")
683
+ image_count += 1
684
+ except Exception as e:
685
+ logger.warning(
686
+ f"[{request_id}] Failed to save image: {e}"
687
+ )
688
+
689
+ # Clean up mini-PDF
690
+ try:
691
+ os.unlink(mini_pdf_path)
692
+ except OSError:
693
+ pass
694
+
695
+ pass2_time = time.time() - pass2_start
696
+ total_tables = sum(len(v) for v in tables_by_page.values())
697
+ logger.info(
698
+ f"[{request_id}] Pass 2 completed in {pass2_time:.2f}s — "
699
+ f"{total_tables} TableFormer tables extracted"
700
+ )
701
+
702
+ except Exception as e:
703
+ pass2_time = time.time() - pass2_start
704
+ logger.warning(
705
+ f"[{request_id}] TableFormer pass failed ({e}), using VLM tables only"
706
+ )
707
+
708
+ # --- MERGE: VLM text + TableFormer tables ---
709
+ md_parts: list[str] = []
710
+ pages_seen: set[int] = set()
711
 
712
  for page_no in sorted(vlm_page_texts.keys()):
713
  pages_seen.add(page_no)
 
716
  vlm_text = vlm_page_texts[page_no]
717
 
718
  if vlm_text is None:
719
+ md_parts.append(f"<!-- VLM failed on this page -->\n")
 
 
 
 
 
 
 
720
  else:
 
721
  page_tables = tables_by_page.get(page_no, [])
722
+ if page_tables:
723
+ merged = _merge_vlm_with_tables(vlm_text, page_tables)
724
+ md_parts.append(merged)
725
+ else:
726
+ md_parts.append(vlm_text)
727
+
728
+ markdown_content = "".join(md_parts)
729
+ pages_processed = len(pages_seen)
730
+
731
+ total_time = time.time() - total_start
732
+ logger.info(
733
+ f"[{request_id}] VLM-first conversion complete: {pages_processed} pages — "
734
+ f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
735
+ f"TableFormer {pass2_time:.1f}s = {total_time:.2f}s total"
736
+ )
737
+ if pages_processed > 0:
738
+ logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
739
+
740
+ return markdown_content, None, pages_processed, image_count
741
+
742
+
743
+ def _convert_document_full_docling(
744
+ input_path: Path,
745
+ output_dir: Path,
746
+ images_scale: float,
747
+ include_images: bool,
748
+ request_id: str,
749
+ ) -> tuple:
750
+ """Fallback: Full Docling pipeline when page images are unavailable."""
751
+ logger.info(f"[{request_id}] Running full Docling pipeline (fallback mode)")
752
+
753
+ converter = _get_converter()
754
+ start_time = time.time()
755
+ result = converter.convert(input_path)
756
+ doc = result.document
757
 
758
+ if doc is None:
759
+ raise ValueError("Docling failed to parse document")
760
+
761
+ elapsed = time.time() - start_time
762
+ logger.info(f"[{request_id}] Docling completed in {elapsed:.2f}s")
763
+
764
+ markdown_content = doc.export_to_markdown()
765
+ pages_processed = len(
766
+ set(e.prov[0].page_no for e, _ in doc.iterate_items() if e.prov)
767
+ )
768
+
769
+ image_count = 0
770
  if include_images:
771
+ image_dir = output_dir / "images"
772
+ image_dir.mkdir(parents=True, exist_ok=True)
773
  for element, _ in doc.iterate_items():
774
  if isinstance(element, PictureItem):
775
  if element.image and element.image.pil_image:
 
781
  try:
782
  element.image.pil_image.save(image_path, format="PNG")
783
  image_count += 1
784
+ except Exception:
785
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
786
 
787
  return markdown_content, None, pages_processed, image_count
788
 
 
826
  async def lifespan(app: FastAPI):
827
  """Startup: initialize Docling converter and check vLLM."""
828
  logger.info("=" * 60)
829
+ logger.info("Starting Docling VLM Parser API v3.0.0...")
830
 
831
  device = _get_device()
832
  logger.info(f"Device: {device}")
 
863
  logger.warning(f"Failed to pre-load Docling models: {e}")
864
 
865
  logger.info("=" * 60)
866
+ logger.info("Docling VLM Parser API ready (VLM-first: Qwen3-VL + targeted TableFormer)")
867
  logger.info("=" * 60)
868
  yield
869
  logger.info("Shutting down Docling VLM Parser API...")
 
875
 
876
  app = FastAPI(
877
  title="Docling VLM Parser API",
878
+ description="VLM-first hybrid parser: Qwen3-VL OCR (GPU) + targeted TableFormer (CPU)",
879
+ version="3.0.0",
880
  lifespan=lifespan,
881
  )
882
 
 
905
 
906
  return HealthResponse(
907
  status="healthy",
908
+ version="3.0.0",
909
  device=device,
910
  gpu_name=None, # Don't leak GPU details on unauthenticated endpoint
911
  vlm_model="active", # Confirm VLM is configured without leaking model name
 
927
  """
928
  Parse a document file (PDF or image) and return extracted content.
929
 
930
+ Uses a VLM-first hybrid approach:
931
+ Pass 1 (GPU): Qwen3-VL via vLLM for OCR on all pages (concurrent)
932
+ Detect: Identify pages with tables from VLM output
933
+ Pass 2 (CPU): Docling TableFormer only on table pages
934
+ Merge: VLM text + TableFormer tables
935
 
936
  Supports:
937
  - PDF files (.pdf)
 
949
  if output_format not in ("markdown",):
950
  raise HTTPException(
951
  status_code=400,
952
+ detail="Only 'markdown' output_format is supported in v3.0.0",
953
  )
954
 
955
  # Validate file size
 
1073
  if request.output_format not in ("markdown",):
1074
  raise HTTPException(
1075
  status_code=400,
1076
+ detail="Only 'markdown' output_format is supported in v3.0.0",
1077
  )
1078
 
1079
  # Validate URL
requirements.txt CHANGED
@@ -26,5 +26,8 @@ onnxruntime>=1.19.0
26
  # PDF to image conversion for VLM OCR pass
27
  pdf2image>=1.17.0
28
 
 
 
 
29
  # HuggingFace Hub for model downloads
30
  huggingface-hub>=0.25.0
 
26
  # PDF to image conversion for VLM OCR pass
27
  pdf2image>=1.17.0
28
 
29
+ # PDF page extraction (for creating mini-PDFs with only table pages)
30
+ pypdf>=4.0.0
31
+
32
  # HuggingFace Hub for model downloads
33
  huggingface-hub>=0.25.0