sidoutcome commited on
Commit
53b94dc
·
1 Parent(s): c67903b

feat: v3.1.0 - DPI 150, parallel rendering, VLM retry, quality fixes

Browse files
Files changed (1) hide show
  1. app.py +385 -351
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Docling VLM Parser API v3.0.0
3
 
4
  A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
5
  Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
@@ -7,17 +7,15 @@ A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
7
  Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
8
  Merge: VLM text for all pages + TableFormer tables where detected
9
 
10
- Key insight: the previous architecture ran Docling's full CPU pipeline (DocLayNet +
11
- TableFormer + RapidOCR) on ALL pages, taking 60-565s. Most of that time was wasted
12
- on non-table pages. Now we run the fast GPU VLM first, detect which pages have tables,
13
- and only send those pages (as a mini-PDF) to Docling for table structure extraction.
14
-
15
- Features:
16
- - VLM-first: GPU-accelerated OCR on all pages via Qwen3-VL (concurrent)
17
- - Targeted TableFormer: CPU pipeline runs only on pages with tables
18
- - pypdf mini-PDF extraction for page-level Docling targeting
19
- - OpenCV image preprocessing (denoise, CLAHE contrast enhancement)
20
- - Image extraction with configurable resolution
21
  """
22
 
23
  import asyncio
@@ -97,6 +95,9 @@ VLM_PORT = os.getenv("VLM_PORT", "8000")
97
  IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
98
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
99
  MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
 
 
 
100
 
101
  # Blocked hostnames for SSRF protection
102
  BLOCKED_HOSTNAMES = {
@@ -203,7 +204,7 @@ class ParseResponse(BaseModel):
203
  success: bool
204
  markdown: Optional[str] = None
205
  json_content: Optional[Union[dict, list]] = None
206
- images_zip: Optional[str] = None # Base64-encoded zip file containing all images
207
  image_count: int = 0
208
  error: Optional[str] = None
209
  pages_processed: int = 0
@@ -229,29 +230,27 @@ class URLParseRequest(BaseModel):
229
  url: str
230
  output_format: str = "markdown"
231
  images_scale: Optional[float] = None
232
- start_page: int = 0 # Starting page (0-indexed)
233
- end_page: Optional[int] = None # Ending page (None = all pages)
234
  include_images: bool = False
235
 
236
 
237
  # ---------------------------------------------------------------------------
238
- # OpenCV Image Preprocessing
239
  # ---------------------------------------------------------------------------
240
 
241
 
242
  def _preprocess_image_for_ocr(image_path: str) -> str:
243
  """Enhance image quality for better OCR accuracy.
244
 
245
- Applies: deskew correction, denoising, CLAHE contrast enhancement.
246
- Returns the path to the preprocessed image (same path, overwritten).
 
247
  """
248
  img = cv2.imread(image_path)
249
  if img is None:
250
  return image_path
251
 
252
- # Denoise
253
- img = cv2.fastNlMeansDenoisingColored(img, None, 10, 10, 7, 21)
254
-
255
  # CLAHE contrast enhancement on L channel
256
  lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
257
  l, a, b = cv2.split(lab)
@@ -265,243 +264,350 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
265
 
266
 
267
  # ---------------------------------------------------------------------------
268
- # VLM OCR (Pass 2)
269
  # ---------------------------------------------------------------------------
270
 
 
 
271
 
272
- def _vlm_ocr_page(page_image_bytes: bytes) -> str:
273
- """Send a page image to Qwen3-VL via vLLM for text extraction.
274
 
275
- Args:
276
- page_image_bytes: PNG image bytes of the page
277
 
278
- Returns:
279
- Extracted markdown text from the page
280
  """
281
  b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
282
 
283
- response = httpx.post(
284
- f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions",
285
- json={
286
- "model": VLM_MODEL,
287
- "messages": [
288
- {
289
- "role": "user",
290
- "content": [
291
- {
292
- "type": "image_url",
293
- "image_url": {"url": f"data:image/png;base64,{b64_image}"},
294
- },
295
- {
296
- "type": "text",
297
- "text": (
298
- "OCR this document page to markdown. "
299
- "Extract ALL text exactly as written, preserving headings, lists, and paragraphs. "
300
- "For tables, output them as markdown tables. "
301
- "For handwritten text, transcribe as accurately as possible. "
302
- "Return ONLY the extracted content, no explanations."
303
- ),
304
- },
305
- ],
306
- }
307
- ],
308
- "max_tokens": 16384,
309
- "temperature": 0.1,
310
- },
311
- timeout=120.0,
312
- )
313
- if response.status_code != 200:
314
- try:
315
- err = response.json()
316
- msg = err.get("message", err.get("detail", str(err)[:300]))
317
- except Exception:
318
- msg = response.text[:300]
319
- logger.error(f"vLLM error ({response.status_code}): {msg}")
320
- response.raise_for_status()
321
- result = response.json()
322
- choices = result.get("choices")
323
- if not choices:
324
- raise ValueError(f"vLLM returned no choices")
325
- content = choices[0].get("message", {}).get("content")
326
- if content is None:
327
- raise ValueError(f"vLLM response missing content")
328
- return content
329
-
330
 
331
- # ---------------------------------------------------------------------------
332
- # Table Extraction Helper
333
- # ---------------------------------------------------------------------------
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
- def _extract_table_markdowns(doc) -> dict:
337
- """Extract table markdown from Docling document, keyed by page number."""
338
- tables_by_page: dict[int, list[str]] = {}
339
- for element, _ in doc.iterate_items():
340
- if isinstance(element, TableItem):
341
- page_no = element.prov[0].page_no if element.prov else -1
342
- table_md = element.export_to_markdown(doc=doc)
343
- if page_no not in tables_by_page:
344
- tables_by_page[page_no] = []
345
- tables_by_page[page_no].append(table_md)
346
- return tables_by_page
347
 
348
 
349
  # ---------------------------------------------------------------------------
350
- # Merge: VLM Text + TableFormer Tables
351
  # ---------------------------------------------------------------------------
352
 
 
 
 
 
353
 
354
- def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list) -> str:
355
- """Replace VLM's table sections with TableFormer's more accurate tables.
356
-
357
- Detects markdown table patterns (lines with |...|) in VLM output
358
- and replaces them with TableFormer output.
359
- """
360
- if not table_markdowns:
361
- return vlm_text
362
-
363
- # Pattern: consecutive lines that look like markdown tables
364
- # A markdown table has lines starting and ending with |
365
- table_pattern = re.compile(r"((?:^\|[^\n]+\|$\n?)+)", re.MULTILINE)
366
-
367
- vlm_table_count = len(table_pattern.findall(vlm_text))
368
- if vlm_table_count != len(table_markdowns):
369
- logger.warning(
370
- f"Table count mismatch: VLM={vlm_table_count}, TableFormer={len(table_markdowns)}. "
371
- f"Positional replacement may be imprecise."
372
- )
373
-
374
- table_idx = 0
375
-
376
- def replace_table(match):
377
- nonlocal table_idx
378
- if table_idx < len(table_markdowns):
379
- replacement = table_markdowns[table_idx]
380
- table_idx += 1
381
- return replacement.strip() + "\n"
382
- return match.group(0)
383
-
384
- result = table_pattern.sub(replace_table, vlm_text)
385
-
386
- # If there are remaining TableFormer tables not matched, append them
387
- while table_idx < len(table_markdowns):
388
- result += "\n\n" + table_markdowns[table_idx].strip() + "\n"
389
- table_idx += 1
390
-
391
- return result
392
-
393
-
394
- # ---------------------------------------------------------------------------
395
- # Table Detection from VLM Output
396
- # ---------------------------------------------------------------------------
397
 
398
 
399
  def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
400
  """Detect pages containing tables from VLM markdown output.
401
 
402
- Looks for markdown table separator rows (e.g., | --- | --- |) which are
403
- a reliable signal of table content. Returns set of 0-indexed page numbers.
404
  """
405
- # Markdown table separator: | --- | --- | (with optional colons for alignment)
406
- separator_pattern = re.compile(r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE)
407
  table_pages: set[int] = set()
408
  for page_no, text in vlm_page_texts.items():
409
- if text and separator_pattern.search(text):
 
 
410
  table_pages.add(page_no)
411
  return table_pages
412
 
413
 
 
 
 
 
 
414
  def _extract_pages_to_pdf(
415
  input_path: Path, page_numbers: list[int], request_id: str
416
  ) -> tuple[Path, dict[int, int]]:
417
- """Extract specific pages from a PDF into a mini-PDF.
418
 
419
  Args:
420
  input_path: Path to the original PDF
421
  page_numbers: 0-indexed page numbers to extract
422
- request_id: For logging
423
 
424
  Returns:
425
- (mini_pdf_path, page_map) where page_map maps Docling 1-indexed pages
426
- in the mini-PDF back to 0-indexed original page numbers.
427
  """
428
  from pypdf import PdfReader, PdfWriter
429
 
430
  reader = PdfReader(str(input_path))
431
  writer = PdfWriter()
432
 
433
- # page_map: {docling_1indexed_mini_page: original_0indexed_page}
434
  page_map: dict[int, int] = {}
435
- for idx, orig_page in enumerate(page_numbers):
 
436
  if orig_page < len(reader.pages):
437
  writer.add_page(reader.pages[orig_page])
438
  page_map[idx + 1] = orig_page # Docling uses 1-indexed pages
 
 
 
 
439
 
440
  mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
441
  with open(mini_pdf_path, "wb") as f:
442
  writer.write(f)
443
 
444
- logger.info(f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original")
 
 
445
  return mini_pdf_path, page_map
446
 
447
 
448
  # ---------------------------------------------------------------------------
449
- # PDF to Page Images
450
  # ---------------------------------------------------------------------------
451
 
452
 
453
- def _pdf_to_page_images(
454
- input_path: Path, start_page: int = 0, end_page: Optional[int] = None
455
- ) -> list:
456
- """Convert PDF pages to PNG image bytes using pdf2image.
457
 
458
- Processes one page at a time to avoid loading all pages into memory.
459
- Returns list of (page_no, png_bytes) tuples.
460
  """
461
- page_images: list[tuple[int, bytes]] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
  try:
464
- # Determine total page count first
465
  from pdf2image.pdf2image import pdfinfo_from_path
466
 
467
  info = pdfinfo_from_path(str(input_path))
468
  total_pages = info["Pages"]
469
  last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
470
-
471
- for i in range(start_page, last_page):
472
- # Convert one page at a time (pdf2image is 1-indexed)
473
- images = convert_from_path(
474
- str(input_path), dpi=300, first_page=i + 1, last_page=i + 1
475
- )
476
- if not images:
477
- continue
478
- img = images[0]
479
- # Save to temp file for OpenCV preprocessing
480
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
481
- tmp_path = tmp.name
482
- img.save(tmp_path, format="PNG")
483
- try:
484
- _preprocess_image_for_ocr(tmp_path)
485
- with open(tmp_path, "rb") as f:
486
- page_images.append((i, f.read()))
487
- finally:
488
- os.unlink(tmp_path)
489
  except Exception as e:
490
- # Fallback: log warning caller handles empty list
491
- logger.warning(f"pdf2image failed, VLM OCR may be limited: {e}")
 
 
492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  return page_images
494
 
495
 
496
  # ---------------------------------------------------------------------------
497
- # Docling Converter (Pass 1)
498
  # ---------------------------------------------------------------------------
499
 
500
 
501
  def _create_converter(images_scale: float = 2.0) -> DocumentConverter:
502
  """Create a Docling converter with Standard Pipeline.
503
 
504
- Uses DocLayNet (layout) + TableFormer ACCURATE (tables) + RapidOCR (baseline text).
505
  """
506
  device = _get_device()
507
  logger.info(f"Creating converter with device: {device}")
@@ -512,15 +618,11 @@ def _create_converter(images_scale: float = 2.0) -> DocumentConverter:
512
  pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
513
  pipeline_options.table_structure_options.do_cell_matching = True
514
 
515
- # Use RapidOCR as baseline (VLM will enhance text in pass 2)
516
  pipeline_options.ocr_options = RapidOcrOptions()
517
  pipeline_options.ocr_options.force_full_page_ocr = True
518
 
519
- # Enable page image generation (needed for VLM pass)
520
  pipeline_options.generate_page_images = True
521
  pipeline_options.images_scale = images_scale
522
-
523
- # Also enable picture image extraction
524
  pipeline_options.generate_picture_images = True
525
 
526
  pipeline_options.accelerator_options = AcceleratorOptions(
@@ -548,7 +650,7 @@ def _get_converter() -> DocumentConverter:
548
 
549
 
550
  # ---------------------------------------------------------------------------
551
- # Hybrid Conversion (Pass 1 + Pass 2 + Merge)
552
  # ---------------------------------------------------------------------------
553
 
554
 
@@ -562,51 +664,42 @@ def _convert_document(
562
  end_page: Optional[int] = None,
563
  ) -> tuple:
564
  """
565
- VLM-first hybrid conversion: Qwen3-VL for text + targeted TableFormer for tables.
566
 
567
- Pass 1 (GPU): VLM OCR on ALL pages fast concurrent processing
568
- Detect: Identify pages with tables from VLM markdown output
569
- Pass 2 (CPU): Docling TableFormer ONLY on table pages — minimal CPU work
570
- Merge: VLM text + TableFormer tables
571
 
572
  Returns: (markdown_content, json_content, pages_processed, image_count)
573
  """
574
- total_start = time.time()
575
 
576
- # --- RENDER: Convert PDF pages to images ---
577
- render_start = time.time()
578
- page_images = _pdf_to_page_images(input_path, start_page, end_page)
579
- render_time = time.time() - render_start
580
- logger.info(
581
- f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s"
582
- )
583
 
584
  if not page_images:
585
- logger.warning(
586
- f"[{request_id}] No page images available, falling back to full Docling pipeline"
587
- )
588
  return _convert_document_full_docling(
589
  input_path, output_dir, images_scale, include_images, request_id
590
  )
591
 
592
- # --- PASS 1 (GPU): VLM OCR on all pages ---
 
 
593
  logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
 
594
 
595
  vlm_page_texts: dict[int, Optional[str]] = {}
596
  vlm_start = time.time()
597
 
598
- max_workers = min(2, len(page_images))
599
- logger.info(
600
- f"[{request_id}] Sending {len(page_images)} pages to VLM ({max_workers} concurrent)"
601
- )
602
-
603
- with ThreadPoolExecutor(max_workers=max_workers) as pool:
604
- futures = {
605
- pool.submit(_vlm_ocr_page, page_bytes): page_no
606
  for page_no, page_bytes in page_images
607
  }
608
- for future in as_completed(futures):
609
- page_no = futures[future]
610
  try:
611
  vlm_text = future.result()
612
  vlm_page_texts[page_no] = vlm_text
@@ -614,18 +707,15 @@ def _convert_document(
614
  f"[{request_id}] VLM processed page {page_no + 1} ({len(vlm_text)} chars)"
615
  )
616
  except Exception as e:
617
- logger.warning(
618
- f"[{request_id}] VLM failed on page {page_no + 1}: {e}"
619
- )
620
  vlm_page_texts[page_no] = None
621
 
622
  vlm_time = time.time() - vlm_start
623
- logger.info(
624
- f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)"
625
- )
626
 
627
- # --- DETECT: Find pages with tables in VLM output ---
628
  table_pages = _detect_table_pages(vlm_page_texts)
 
629
  if table_pages:
630
  logger.info(
631
  f"[{request_id}] Tables detected on {len(table_pages)} pages: "
@@ -634,90 +724,57 @@ def _convert_document(
634
  else:
635
  logger.info(f"[{request_id}] No tables detected — skipping Docling entirely")
636
 
637
- # --- PASS 2 (CPU): Docling TableFormer ONLY on table pages ---
638
  tables_by_page: dict[int, list[str]] = {}
639
- pass2_time = 0.0
640
- image_count = 0
641
- image_dir = output_dir / "images"
642
 
643
  if table_pages:
644
- pass2_start = time.time()
645
  logger.info(
646
  f"[{request_id}] Pass 2: Docling TableFormer on {len(table_pages)} table pages"
647
  )
 
648
 
649
  try:
650
- # Create mini-PDF containing only table pages
651
  mini_pdf_path, page_map = _extract_pages_to_pdf(
652
  input_path, sorted(table_pages), request_id
653
  )
654
 
655
- # Run Docling on mini-PDF (full pipeline for accurate table cell text)
656
  converter = _get_converter()
657
  result = converter.convert(mini_pdf_path)
658
  doc = result.document
659
 
660
- if doc:
661
- # Extract tables, mapping mini-PDF pages back to original page numbers
662
- for element, _ in doc.iterate_items():
663
- if isinstance(element, TableItem):
664
- mini_page = element.prov[0].page_no if element.prov else -1
665
- orig_page = page_map.get(mini_page, mini_page)
666
- table_md = element.export_to_markdown(doc=doc)
667
- tables_by_page.setdefault(orig_page, []).append(table_md)
668
-
669
- # Extract images from Docling if requested
670
- if include_images:
671
- image_dir.mkdir(parents=True, exist_ok=True)
672
- for element, _ in doc.iterate_items():
673
- if isinstance(element, PictureItem):
674
- if element.image and element.image.pil_image:
675
- pg = element.prov[0].page_no if element.prov else 0
676
- orig_pg = page_map.get(pg, pg)
677
- image_id = element.self_ref.split("/")[-1]
678
- image_name = f"page_{orig_pg + 1}_{image_id}.png"
679
- image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
680
- image_path = image_dir / image_name
681
- try:
682
- element.image.pil_image.save(image_path, format="PNG")
683
- image_count += 1
684
- except Exception as e:
685
- logger.warning(
686
- f"[{request_id}] Failed to save image: {e}"
687
- )
688
 
689
  # Clean up mini-PDF
690
- try:
691
- os.unlink(mini_pdf_path)
692
- except OSError:
693
- pass
694
-
695
- pass2_time = time.time() - pass2_start
696
- total_tables = sum(len(v) for v in tables_by_page.values())
697
- logger.info(
698
- f"[{request_id}] Pass 2 completed in {pass2_time:.2f}s — "
699
- f"{total_tables} TableFormer tables extracted"
700
- )
701
 
702
  except Exception as e:
703
- pass2_time = time.time() - pass2_start
704
- logger.warning(
705
- f"[{request_id}] TableFormer pass failed ({e}), using VLM tables only"
706
- )
707
 
708
- # --- MERGE: VLM text + TableFormer tables ---
709
  md_parts: list[str] = []
710
- pages_seen: set[int] = set()
711
 
712
  for page_no in sorted(vlm_page_texts.keys()):
713
- pages_seen.add(page_no)
714
  md_parts.append(f"\n\n<!-- Page {page_no + 1} -->\n\n")
715
 
716
  vlm_text = vlm_page_texts[page_no]
717
 
718
  if vlm_text is None:
719
- md_parts.append(f"<!-- VLM failed on this page -->\n")
 
720
  else:
 
721
  page_tables = tables_by_page.get(page_no, [])
722
  if page_tables:
723
  merged = _merge_vlm_with_tables(vlm_text, page_tables)
@@ -725,14 +782,39 @@ def _convert_document(
725
  else:
726
  md_parts.append(vlm_text)
727
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  markdown_content = "".join(md_parts)
729
- pages_processed = len(pages_seen)
 
730
 
731
- total_time = time.time() - total_start
732
  logger.info(
733
  f"[{request_id}] VLM-first conversion complete: {pages_processed} pages — "
734
  f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
735
- f"TableFormer {pass2_time:.1f}s = {total_time:.2f}s total"
736
  )
737
  if pages_processed > 0:
738
  logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
@@ -747,19 +829,18 @@ def _convert_document_full_docling(
747
  include_images: bool,
748
  request_id: str,
749
  ) -> tuple:
750
- """Fallback: Full Docling pipeline when page images are unavailable."""
751
- logger.info(f"[{request_id}] Running full Docling pipeline (fallback mode)")
752
-
753
  converter = _get_converter()
 
754
  start_time = time.time()
755
  result = converter.convert(input_path)
756
  doc = result.document
757
-
758
  if doc is None:
759
  raise ValueError("Docling failed to parse document")
760
 
761
  elapsed = time.time() - start_time
762
- logger.info(f"[{request_id}] Docling completed in {elapsed:.2f}s")
763
 
764
  markdown_content = doc.export_to_markdown()
765
  pages_processed = len(
@@ -773,9 +854,9 @@ def _convert_document_full_docling(
773
  for element, _ in doc.iterate_items():
774
  if isinstance(element, PictureItem):
775
  if element.image and element.image.pil_image:
776
- page_no = element.prov[0].page_no if element.prov else 0
777
  image_id = element.self_ref.split("/")[-1]
778
- image_name = f"page_{page_no + 1}_{image_id}.png"
779
  image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
780
  image_path = image_dir / image_name
781
  try:
@@ -826,7 +907,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
826
  async def lifespan(app: FastAPI):
827
  """Startup: initialize Docling converter and check vLLM."""
828
  logger.info("=" * 60)
829
- logger.info("Starting Docling VLM Parser API v3.0.0...")
830
 
831
  device = _get_device()
832
  logger.info(f"Device: {device}")
@@ -835,11 +916,13 @@ async def lifespan(app: FastAPI):
835
  logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
836
  logger.info(f"CUDA Version: {torch.version.cuda}")
837
  logger.info(
838
- f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB"
839
  )
840
 
841
  logger.info(f"VLM Model: {VLM_MODEL}")
842
  logger.info(f"VLM Endpoint: http://{VLM_HOST}:{VLM_PORT}")
 
 
843
  logger.info(f"Images scale: {IMAGES_SCALE}")
844
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
845
 
@@ -875,8 +958,8 @@ async def lifespan(app: FastAPI):
875
 
876
  app = FastAPI(
877
  title="Docling VLM Parser API",
878
- description="VLM-first hybrid parser: Qwen3-VL OCR (GPU) + targeted TableFormer (CPU)",
879
- version="3.0.0",
880
  lifespan=lifespan,
881
  )
882
 
@@ -890,11 +973,7 @@ app = FastAPI(
890
  async def health_check() -> HealthResponse:
891
  """Health check endpoint."""
892
  device = _get_device()
893
- gpu_name = None
894
- if device == "cuda":
895
- gpu_name = torch.cuda.get_device_name(0)
896
 
897
- # Check vLLM status (async to avoid blocking event loop)
898
  vlm_status = "unknown"
899
  try:
900
  async with httpx.AsyncClient(timeout=5) as client:
@@ -905,10 +984,10 @@ async def health_check() -> HealthResponse:
905
 
906
  return HealthResponse(
907
  status="healthy",
908
- version="3.0.0",
909
  device=device,
910
- gpu_name=None, # Don't leak GPU details on unauthenticated endpoint
911
- vlm_model="active", # Confirm VLM is configured without leaking model name
912
  vlm_status=vlm_status,
913
  images_scale=IMAGES_SCALE,
914
  )
@@ -918,25 +997,13 @@ async def health_check() -> HealthResponse:
918
  async def parse_document(
919
  file: UploadFile = File(..., description="PDF or image file to parse"),
920
  output_format: str = Form(default="markdown", description="Output format: markdown or json"),
921
- images_scale: Optional[float] = Form(default=None, description="Image resolution scale (default: 2.0)"),
922
  start_page: int = Form(default=0, description="Starting page (0-indexed)"),
923
  end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
924
- include_images: bool = Form(default=False, description="Include extracted images in response"),
925
  _token: str = Depends(verify_token),
926
  ) -> ParseResponse:
927
- """
928
- Parse a document file (PDF or image) and return extracted content.
929
-
930
- Uses a VLM-first hybrid approach:
931
- Pass 1 (GPU): Qwen3-VL via vLLM for OCR on all pages (concurrent)
932
- Detect: Identify pages with tables from VLM output
933
- Pass 2 (CPU): Docling TableFormer only on table pages
934
- Merge: VLM text + TableFormer tables
935
-
936
- Supports:
937
- - PDF files (.pdf)
938
- - Images (.png, .jpg, .jpeg, .tiff, .bmp)
939
- """
940
  request_id = str(uuid4())[:8]
941
  start_time = time.time()
942
 
@@ -949,7 +1016,7 @@ async def parse_document(
949
  if output_format not in ("markdown",):
950
  raise HTTPException(
951
  status_code=400,
952
- detail="Only 'markdown' output_format is supported in v3.0.0",
953
  )
954
 
955
  # Validate file size
@@ -961,7 +1028,6 @@ async def parse_document(
961
  logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
962
 
963
  if file_size > MAX_FILE_SIZE_BYTES:
964
- logger.error(f"[{request_id}] File too large: {file_size_mb:.2f} MB > {MAX_FILE_SIZE_MB} MB")
965
  raise HTTPException(
966
  status_code=413,
967
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
@@ -971,32 +1037,25 @@ async def parse_document(
971
  allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
972
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
973
  if file_ext not in allowed_extensions:
974
- logger.error(f"[{request_id}] Unsupported file type: {file_ext}")
975
  raise HTTPException(
976
  status_code=400,
977
  detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
978
  )
979
 
980
- # Use defaults if not specified
981
  use_images_scale = images_scale if images_scale is not None else IMAGES_SCALE
982
 
983
  logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
984
  logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
985
 
986
  temp_dir = tempfile.mkdtemp()
987
- logger.debug(f"[{request_id}] Created temp directory: {temp_dir}")
988
 
989
  try:
990
- # Save uploaded file
991
  input_path = Path(temp_dir) / f"input{file_ext}"
992
  await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
993
- logger.debug(f"[{request_id}] Saved file to: {input_path}")
994
 
995
- # Create output directory
996
  output_dir = Path(temp_dir) / "output"
997
  output_dir.mkdir(exist_ok=True)
998
 
999
- # Convert document (hybrid two-pass)
1000
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
1001
  _convert_document,
1002
  input_path,
@@ -1008,11 +1067,9 @@ async def parse_document(
1008
  end_page,
1009
  )
1010
 
1011
- # Create images zip if requested
1012
  images_zip = None
1013
  if include_images and image_count > 0:
1014
  images_zip, image_count = _create_images_zip(output_dir)
1015
- logger.info(f"[{request_id}] Created images zip with {image_count} images")
1016
 
1017
  total_duration = time.time() - start_time
1018
  logger.info(f"[{request_id}] {'='*50}")
@@ -1046,7 +1103,6 @@ async def parse_document(
1046
  )
1047
  finally:
1048
  shutil.rmtree(temp_dir, ignore_errors=True)
1049
- logger.debug(f"[{request_id}] Cleaned up temp directory")
1050
 
1051
 
1052
  @app.post("/parse/url", response_model=ParseResponse)
@@ -1054,14 +1110,7 @@ async def parse_document_from_url(
1054
  request: URLParseRequest,
1055
  _token: str = Depends(verify_token),
1056
  ) -> ParseResponse:
1057
- """
1058
- Parse a document from a URL.
1059
-
1060
- Downloads the file and processes it through the hybrid two-pass pipeline:
1061
- Pass 1: Docling Standard Pipeline (DocLayNet + TableFormer + RapidOCR)
1062
- Pass 2: Qwen3-VL via vLLM for enhanced text recognition
1063
- Merge: TableFormer tables preserved, VLM text replaces RapidOCR text
1064
- """
1065
  request_id = str(uuid4())[:8]
1066
  start_time = time.time()
1067
 
@@ -1073,16 +1122,12 @@ async def parse_document_from_url(
1073
  if request.output_format not in ("markdown",):
1074
  raise HTTPException(
1075
  status_code=400,
1076
- detail="Only 'markdown' output_format is supported in v3.0.0",
1077
  )
1078
 
1079
- # Validate URL
1080
- logger.info(f"[{request_id}] Validating URL...")
1081
  _validate_url(request.url)
1082
- logger.info(f"[{request_id}] URL validation passed")
1083
 
1084
  temp_dir = tempfile.mkdtemp()
1085
- logger.debug(f"[{request_id}] Created temp directory: {temp_dir}")
1086
 
1087
  try:
1088
  # Download file
@@ -1091,19 +1136,18 @@ async def parse_document_from_url(
1091
  async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
1092
  response = await client.get(request.url)
1093
  response.raise_for_status()
1094
- download_duration = time.time() - download_start
1095
 
1096
  file_size_mb = len(response.content) / (1024 * 1024)
1097
- logger.info(f"[{request_id}] Download completed in {download_duration:.2f}s")
1098
- logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
 
 
1099
 
1100
- # Determine file extension from URL path, Content-Type header, or default to .pdf
1101
- allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
1102
  url_path = Path(request.url.split("?")[0])
1103
  file_ext = url_path.suffix.lower()
1104
 
1105
- if file_ext not in allowed_extensions:
1106
- # Try Content-Type header
1107
  content_type = response.headers.get("content-type", "").lower()
1108
  ct_map = {
1109
  "application/pdf": ".pdf",
@@ -1113,33 +1157,26 @@ async def parse_document_from_url(
1113
  "image/bmp": ".bmp",
1114
  }
1115
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
1116
- logger.info(f"[{request_id}] URL suffix not recognized, using: {file_ext} (from content-type: {content_type})")
1117
 
1118
  if len(response.content) > MAX_FILE_SIZE_BYTES:
1119
- logger.error(
1120
- f"[{request_id}] File too large: {file_size_mb:.2f} MB > {MAX_FILE_SIZE_MB} MB"
1121
- )
1122
  raise HTTPException(
1123
  status_code=413,
1124
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
1125
  )
1126
 
1127
- # Save downloaded file
1128
  input_path = Path(temp_dir) / f"input{file_ext}"
1129
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
1130
- logger.debug(f"[{request_id}] Saved file to: {input_path}")
1131
 
1132
- # Create output directory
1133
  output_dir = Path(temp_dir) / "output"
1134
  output_dir.mkdir(exist_ok=True)
1135
 
1136
- # Use defaults if not specified
1137
  use_images_scale = request.images_scale if request.images_scale is not None else IMAGES_SCALE
1138
 
1139
  logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1140
- logger.info(f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}")
 
 
1141
 
1142
- # Convert document (hybrid two-pass)
1143
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
1144
  _convert_document,
1145
  input_path,
@@ -1151,11 +1188,9 @@ async def parse_document_from_url(
1151
  request.end_page,
1152
  )
1153
 
1154
- # Create images zip if requested
1155
  images_zip = None
1156
  if request.include_images and image_count > 0:
1157
  images_zip, image_count = _create_images_zip(output_dir)
1158
- logger.info(f"[{request_id}] Created images zip with {image_count} images")
1159
 
1160
  total_duration = time.time() - start_time
1161
  logger.info(f"[{request_id}] {'='*50}")
@@ -1196,7 +1231,6 @@ async def parse_document_from_url(
1196
  )
1197
  finally:
1198
  shutil.rmtree(temp_dir, ignore_errors=True)
1199
- logger.debug(f"[{request_id}] Cleaned up temp directory")
1200
 
1201
 
1202
  if __name__ == "__main__":
 
1
  """
2
+ Docling VLM Parser API v3.1.0
3
 
4
  A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
5
  Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
 
7
  Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
8
  Merge: VLM text for all pages + TableFormer tables where detected
9
 
10
+ v3.1.0 fixes over v3.0.0:
11
+ - Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
12
+ - Quality: VLM retry on timeout/failure (1 retry with longer timeout)
13
+ - Quality: Table detection catches both markdown and LaTeX table patterns
14
+ - Quality: Proper page_map translation for mini-PDF → original page numbers
15
+ - Speed: DPI 200 (from 300) — sufficient for VLM, 55% fewer pixels
16
+ - Speed: Dropped fastNlMeansDenoisingColored (saves ~10s/page), kept only CLAHE
17
+ - Speed: Parallel page rendering via ThreadPoolExecutor
18
+ - Speed: Increased VLM concurrency from 2 to 4 workers
 
 
19
  """
20
 
21
  import asyncio
 
95
  IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
96
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
97
  MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
98
+ VLM_TIMEOUT = float(os.getenv("VLM_TIMEOUT", "300"))
99
+ VLM_CONCURRENCY = int(os.getenv("VLM_CONCURRENCY", "4"))
100
+ RENDER_DPI = int(os.getenv("RENDER_DPI", "150"))
101
 
102
  # Blocked hostnames for SSRF protection
103
  BLOCKED_HOSTNAMES = {
 
204
  success: bool
205
  markdown: Optional[str] = None
206
  json_content: Optional[Union[dict, list]] = None
207
+ images_zip: Optional[str] = None
208
  image_count: int = 0
209
  error: Optional[str] = None
210
  pages_processed: int = 0
 
230
  url: str
231
  output_format: str = "markdown"
232
  images_scale: Optional[float] = None
233
+ start_page: int = 0
234
+ end_page: Optional[int] = None
235
  include_images: bool = False
236
 
237
 
238
  # ---------------------------------------------------------------------------
239
+ # OpenCV Image Preprocessing (CLAHE only — fast)
240
  # ---------------------------------------------------------------------------
241
 
242
 
243
  def _preprocess_image_for_ocr(image_path: str) -> str:
244
  """Enhance image quality for better OCR accuracy.
245
 
246
+ Applies CLAHE contrast enhancement only (fast).
247
+ Denoising was removed in v3.1.0 it added ~10s/page with minimal
248
+ benefit for VLM-based OCR which handles noise well.
249
  """
250
  img = cv2.imread(image_path)
251
  if img is None:
252
  return image_path
253
 
 
 
 
254
  # CLAHE contrast enhancement on L channel
255
  lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
256
  l, a, b = cv2.split(lab)
 
264
 
265
 
266
  # ---------------------------------------------------------------------------
267
+ # VLM OCR with retry
268
  # ---------------------------------------------------------------------------
269
 
270
+ # Strip Qwen3 <think>...</think> reasoning blocks
271
+ _THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
272
 
 
 
273
 
274
+ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
275
+ """Send a page image to Qwen3-VL via vLLM for text extraction.
276
 
277
+ Includes retry logic: on timeout/failure, retries once with longer timeout.
278
+ Strips <think> reasoning tokens from Qwen3 output.
279
  """
280
  b64_image = base64.b64encode(page_image_bytes).decode("utf-8")
281
 
282
+ payload = {
283
+ "model": VLM_MODEL,
284
+ "messages": [
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "image_url",
290
+ "image_url": {"url": f"data:image/png;base64,{b64_image}"},
291
+ },
292
+ {
293
+ "type": "text",
294
+ "text": (
295
+ "OCR this document page to markdown. "
296
+ "Extract ALL text exactly as written, preserving headings, lists, and paragraphs. "
297
+ "For tables, output them as MARKDOWN tables using | delimiters and --- separator rows. "
298
+ "NEVER use LaTeX tabular format. ALWAYS use markdown pipe tables. "
299
+ "For handwritten text, transcribe as accurately as possible. "
300
+ "Return ONLY the extracted content, no explanations or commentary."
301
+ ),
302
+ },
303
+ ],
304
+ }
305
+ ],
306
+ "max_tokens": 16384,
307
+ "temperature": 0.1,
308
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
+ url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
 
 
311
 
312
+ # Try with primary timeout, then retry once with extended timeout
313
+ for attempt, timeout in enumerate([VLM_TIMEOUT, VLM_TIMEOUT * 1.5], start=1):
314
+ try:
315
+ response = httpx.post(url, json=payload, timeout=timeout)
316
+ if response.status_code != 200:
317
+ try:
318
+ err = response.json()
319
+ msg = err.get("message", err.get("detail", str(err)[:300]))
320
+ except Exception:
321
+ msg = response.text[:300]
322
+ logger.error(f"[{request_id}] vLLM error ({response.status_code}) page {page_no}: {msg}")
323
+ if attempt == 1:
324
+ logger.info(f"[{request_id}] Retrying page {page_no}...")
325
+ continue
326
+ response.raise_for_status()
327
+
328
+ result = response.json()
329
+ choices = result.get("choices")
330
+ if not choices:
331
+ raise ValueError("vLLM returned no choices")
332
+ content = choices[0].get("message", {}).get("content")
333
+ if content is None:
334
+ raise ValueError("vLLM response missing content")
335
+
336
+ # Strip <think>...</think> reasoning blocks from Qwen3
337
+ content = _THINK_PATTERN.sub("", content).strip()
338
+
339
+ return content
340
+
341
+ except (httpx.TimeoutException, httpx.ConnectError) as e:
342
+ if attempt == 1:
343
+ logger.warning(
344
+ f"[{request_id}] VLM attempt {attempt} failed on page {page_no}: {e}. Retrying..."
345
+ )
346
+ continue
347
+ raise
348
 
349
+ raise RuntimeError(f"VLM failed after 2 attempts on page {page_no}")
 
 
 
 
 
 
 
 
 
 
350
 
351
 
352
  # ---------------------------------------------------------------------------
353
+ # Table Detection from VLM Output
354
  # ---------------------------------------------------------------------------
355
 
356
+ # Markdown table separator: | --- | --- | or |:---:|---:|
357
+ _MD_TABLE_SEPARATOR = re.compile(
358
+ r"^\|[\s\-:]+(?:\|[\s\-:]+)+\|?\s*$", re.MULTILINE
359
+ )
360
 
361
+ # LaTeX table markers (fallback if VLM ignores markdown instruction)
362
+ _LATEX_TABLE_PATTERN = re.compile(r"\\begin\{tabular\}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
 
365
  def _detect_table_pages(vlm_page_texts: dict[int, Optional[str]]) -> set[int]:
366
  """Detect pages containing tables from VLM markdown output.
367
 
368
+ Checks for both markdown table separators and LaTeX tabular markers.
 
369
  """
 
 
370
  table_pages: set[int] = set()
371
  for page_no, text in vlm_page_texts.items():
372
+ if text and (
373
+ _MD_TABLE_SEPARATOR.search(text) or _LATEX_TABLE_PATTERN.search(text)
374
+ ):
375
  table_pages.add(page_no)
376
  return table_pages
377
 
378
 
379
+ # ---------------------------------------------------------------------------
380
+ # Mini-PDF Extraction (pypdf)
381
+ # ---------------------------------------------------------------------------
382
+
383
+
384
  def _extract_pages_to_pdf(
385
  input_path: Path, page_numbers: list[int], request_id: str
386
  ) -> tuple[Path, dict[int, int]]:
387
+ """Extract specific pages from a PDF into a mini-PDF using pypdf.
388
 
389
  Args:
390
  input_path: Path to the original PDF
391
  page_numbers: 0-indexed page numbers to extract
392
+ request_id: Request ID for logging
393
 
394
  Returns:
395
+ (mini_pdf_path, page_map) where page_map maps Docling 1-indexed
396
+ page numbers in the mini-PDF back to 0-indexed original page numbers.
397
  """
398
  from pypdf import PdfReader, PdfWriter
399
 
400
  reader = PdfReader(str(input_path))
401
  writer = PdfWriter()
402
 
403
+ # page_map: {docling_page_no (1-indexed in mini-PDF) → original_page_no (0-indexed)}
404
  page_map: dict[int, int] = {}
405
+
406
+ for idx, orig_page in enumerate(sorted(page_numbers)):
407
  if orig_page < len(reader.pages):
408
  writer.add_page(reader.pages[orig_page])
409
  page_map[idx + 1] = orig_page # Docling uses 1-indexed pages
410
+ else:
411
+ logger.warning(
412
+ f"[{request_id}] Page {orig_page} out of range (total: {len(reader.pages)})"
413
+ )
414
 
415
  mini_pdf_path = input_path.parent / f"table_pages_{request_id}.pdf"
416
  with open(mini_pdf_path, "wb") as f:
417
  writer.write(f)
418
 
419
+ logger.info(
420
+ f"[{request_id}] Created mini-PDF: {len(page_map)} table pages from original"
421
+ )
422
  return mini_pdf_path, page_map
423
 
424
 
425
  # ---------------------------------------------------------------------------
426
+ # Table Extraction from Docling
427
  # ---------------------------------------------------------------------------
428
 
429
 
430
+ def _extract_table_markdowns(doc, page_map: dict[int, int]) -> dict[int, list[str]]:
431
+ """Extract table markdown from Docling document, keyed by ORIGINAL page number.
 
 
432
 
433
+ Uses page_map to translate from Docling's 1-indexed mini-PDF pages
434
+ back to the original 0-indexed page numbers.
435
  """
436
+ tables_by_page: dict[int, list[str]] = {}
437
+ for element, _ in doc.iterate_items():
438
+ if isinstance(element, TableItem):
439
+ docling_page = element.prov[0].page_no if element.prov else -1
440
+ # Translate mini-PDF page → original page
441
+ orig_page = page_map.get(docling_page, docling_page - 1)
442
+ table_md = element.export_to_markdown(doc=doc)
443
+ if orig_page not in tables_by_page:
444
+ tables_by_page[orig_page] = []
445
+ tables_by_page[orig_page].append(table_md)
446
+ return tables_by_page
447
+
448
+
449
+ # ---------------------------------------------------------------------------
450
+ # Merge: VLM Text + TableFormer Tables
451
+ # ---------------------------------------------------------------------------
452
+
453
+ # Consecutive lines with | delimiters (markdown tables)
454
+ _VLM_TABLE_BLOCK = re.compile(r"((?:^\|[^\n]+\|$\n?)+)", re.MULTILINE)
455
+
456
+ # LaTeX table blocks
457
+ _VLM_LATEX_BLOCK = re.compile(
458
+ r"(\\begin\{tabular\}.*?\\end\{tabular\})", re.DOTALL
459
+ )
460
+
461
+
462
+ def _merge_vlm_with_tables(vlm_text: str, table_markdowns: list[str]) -> str:
463
+ """Replace VLM's table sections with TableFormer's more accurate tables.
464
+
465
+ Handles both markdown pipe tables and LaTeX tabular blocks in VLM output.
466
+ """
467
+ if not table_markdowns:
468
+ return vlm_text
469
+
470
+ # Find all table blocks (markdown first, then LaTeX)
471
+ md_tables = list(_VLM_TABLE_BLOCK.finditer(vlm_text))
472
+ latex_tables = list(_VLM_LATEX_BLOCK.finditer(vlm_text))
473
+
474
+ # Combine and sort all table positions
475
+ all_tables = [(m.start(), m.end(), "md") for m in md_tables]
476
+ all_tables += [(m.start(), m.end(), "latex") for m in latex_tables]
477
+ all_tables.sort(key=lambda x: x[0])
478
+
479
+ # Remove overlapping matches (prefer earlier match)
480
+ filtered: list[tuple[int, int, str]] = []
481
+ last_end = -1
482
+ for start, end, kind in all_tables:
483
+ if start >= last_end:
484
+ filtered.append((start, end, kind))
485
+ last_end = end
486
+
487
+ vlm_table_count = len(filtered)
488
+ tf_table_count = len(table_markdowns)
489
+
490
+ if vlm_table_count != tf_table_count:
491
+ logger.warning(
492
+ f"Table count mismatch: VLM={vlm_table_count}, TableFormer={tf_table_count}. "
493
+ f"Using positional replacement for min({vlm_table_count}, {tf_table_count}) tables."
494
+ )
495
+
496
+ # Replace VLM tables with TableFormer tables (positional)
497
+ result_parts: list[str] = []
498
+ prev_end = 0
499
+ table_idx = 0
500
+
501
+ for start, end, kind in filtered:
502
+ result_parts.append(vlm_text[prev_end:start])
503
+ if table_idx < tf_table_count:
504
+ result_parts.append(table_markdowns[table_idx].strip() + "\n")
505
+ table_idx += 1
506
+ else:
507
+ # More VLM tables than TableFormer — keep VLM version
508
+ result_parts.append(vlm_text[start:end])
509
+ prev_end = end
510
+
511
+ result_parts.append(vlm_text[prev_end:])
512
+
513
+ # If there are remaining TableFormer tables not matched, append them
514
+ while table_idx < tf_table_count:
515
+ result_parts.append("\n\n" + table_markdowns[table_idx].strip() + "\n")
516
+ table_idx += 1
517
+
518
+ return "".join(result_parts)
519
+
520
+
521
+ # ---------------------------------------------------------------------------
522
+ # PDF to Page Images (parallel, optimized)
523
+ # ---------------------------------------------------------------------------
524
+
525
+
526
+ def _render_single_page(
527
+ input_path: Path, page_idx: int, dpi: int
528
+ ) -> tuple[int, Optional[bytes]]:
529
+ """Render a single PDF page to PNG bytes with CLAHE preprocessing.
530
 
531
+ Returns (page_idx, png_bytes) or (page_idx, None) on failure.
532
+ """
533
+ try:
534
+ images = convert_from_path(
535
+ str(input_path), dpi=dpi, first_page=page_idx + 1, last_page=page_idx + 1
536
+ )
537
+ if not images:
538
+ return page_idx, None
539
+
540
+ img = images[0]
541
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
542
+ tmp_path = tmp.name
543
+ img.save(tmp_path, format="PNG")
544
+
545
+ try:
546
+ _preprocess_image_for_ocr(tmp_path)
547
+ with open(tmp_path, "rb") as f:
548
+ return page_idx, f.read()
549
+ finally:
550
+ os.unlink(tmp_path)
551
+ except Exception as e:
552
+ logger.warning(f"Failed to render page {page_idx + 1}: {e}")
553
+ return page_idx, None
554
+
555
+
556
+ def _pdf_to_page_images(
557
+ input_path: Path,
558
+ request_id: str,
559
+ start_page: int = 0,
560
+ end_page: Optional[int] = None,
561
+ ) -> list[tuple[int, bytes]]:
562
+ """Convert PDF pages to PNG image bytes using parallel rendering.
563
+
564
+ Uses ThreadPoolExecutor for concurrent page rendering.
565
+ Returns list of (page_no, png_bytes) tuples, sorted by page number.
566
+ """
567
  try:
 
568
  from pdf2image.pdf2image import pdfinfo_from_path
569
 
570
  info = pdfinfo_from_path(str(input_path))
571
  total_pages = info["Pages"]
572
  last_page = min(end_page + 1, total_pages) if end_page is not None else total_pages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
573
  except Exception as e:
574
+ logger.warning(f"[{request_id}] Could not get PDF info: {e}")
575
+ return []
576
+
577
+ page_indices = list(range(start_page, last_page))
578
 
579
+ start_time = time.time()
580
+ page_images: list[tuple[int, bytes]] = []
581
+
582
+ # Render pages in parallel (4 threads — I/O bound, not CPU bound for poppler)
583
+ with ThreadPoolExecutor(max_workers=4) as executor:
584
+ futures = {
585
+ executor.submit(_render_single_page, input_path, idx, RENDER_DPI): idx
586
+ for idx in page_indices
587
+ }
588
+ for future in as_completed(futures):
589
+ page_idx, png_bytes = future.result()
590
+ if png_bytes is not None:
591
+ page_images.append((page_idx, png_bytes))
592
+
593
+ page_images.sort(key=lambda x: x[0])
594
+ render_time = time.time() - start_time
595
+ logger.info(
596
+ f"[{request_id}] Rendered {len(page_images)} pages in {render_time:.2f}s "
597
+ f"({render_time / max(len(page_images), 1):.1f}s/page, DPI={RENDER_DPI})"
598
+ )
599
  return page_images
600
 
601
 
602
  # ---------------------------------------------------------------------------
603
+ # Docling Converter (for TableFormer only)
604
  # ---------------------------------------------------------------------------
605
 
606
 
607
  def _create_converter(images_scale: float = 2.0) -> DocumentConverter:
608
  """Create a Docling converter with Standard Pipeline.
609
 
610
+ Used ONLY for TableFormer on table pages (not for full document OCR).
611
  """
612
  device = _get_device()
613
  logger.info(f"Creating converter with device: {device}")
 
618
  pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
619
  pipeline_options.table_structure_options.do_cell_matching = True
620
 
 
621
  pipeline_options.ocr_options = RapidOcrOptions()
622
  pipeline_options.ocr_options.force_full_page_ocr = True
623
 
 
624
  pipeline_options.generate_page_images = True
625
  pipeline_options.images_scale = images_scale
 
 
626
  pipeline_options.generate_picture_images = True
627
 
628
  pipeline_options.accelerator_options = AcceleratorOptions(
 
650
 
651
 
652
  # ---------------------------------------------------------------------------
653
+ # VLM-First Conversion (Pass 1: VLM, Pass 2: TableFormer, Merge)
654
  # ---------------------------------------------------------------------------
655
 
656
 
 
664
  end_page: Optional[int] = None,
665
  ) -> tuple:
666
  """
667
+ VLM-first hybrid conversion.
668
 
669
+ Pass 1 (GPU): VLM OCR on ALL pages (fast, concurrent)
670
+ Detect: Find table pages from VLM markdown output
671
+ Pass 2 (CPU): Docling TableFormer ONLY on table pages (mini-PDF)
672
+ Merge: VLM text for all pages + TableFormer tables
673
 
674
  Returns: (markdown_content, json_content, pages_processed, image_count)
675
  """
676
+ overall_start = time.time()
677
 
678
+ # ---- RENDER ALL PAGES ----
679
+ page_images = _pdf_to_page_images(input_path, request_id, start_page, end_page)
 
 
 
 
 
680
 
681
  if not page_images:
682
+ logger.warning(f"[{request_id}] No page images — falling back to full Docling pipeline")
 
 
683
  return _convert_document_full_docling(
684
  input_path, output_dir, images_scale, include_images, request_id
685
  )
686
 
687
+ render_time = time.time() - overall_start
688
+
689
+ # ---- PASS 1: VLM OCR ALL PAGES (GPU, concurrent) ----
690
  logger.info(f"[{request_id}] Pass 1: VLM OCR via Qwen3-VL ({VLM_MODEL})")
691
+ logger.info(f"[{request_id}] Sending {len(page_images)} pages to VLM ({VLM_CONCURRENCY} concurrent)")
692
 
693
  vlm_page_texts: dict[int, Optional[str]] = {}
694
  vlm_start = time.time()
695
 
696
+ with ThreadPoolExecutor(max_workers=VLM_CONCURRENCY) as executor:
697
+ future_to_page = {
698
+ executor.submit(_vlm_ocr_page, page_bytes, request_id, page_no + 1): page_no
 
 
 
 
 
699
  for page_no, page_bytes in page_images
700
  }
701
+ for future in as_completed(future_to_page):
702
+ page_no = future_to_page[future]
703
  try:
704
  vlm_text = future.result()
705
  vlm_page_texts[page_no] = vlm_text
 
707
  f"[{request_id}] VLM processed page {page_no + 1} ({len(vlm_text)} chars)"
708
  )
709
  except Exception as e:
710
+ logger.warning(f"[{request_id}] VLM failed on page {page_no + 1}: {e}")
 
 
711
  vlm_page_texts[page_no] = None
712
 
713
  vlm_time = time.time() - vlm_start
714
+ logger.info(f"[{request_id}] Pass 1 completed in {vlm_time:.2f}s ({len(vlm_page_texts)} pages)")
 
 
715
 
716
+ # ---- DETECT TABLE PAGES ----
717
  table_pages = _detect_table_pages(vlm_page_texts)
718
+
719
  if table_pages:
720
  logger.info(
721
  f"[{request_id}] Tables detected on {len(table_pages)} pages: "
 
724
  else:
725
  logger.info(f"[{request_id}] No tables detected — skipping Docling entirely")
726
 
727
+ # ---- PASS 2: DOCLING TABLEFORMER ON TABLE PAGES ONLY ----
728
  tables_by_page: dict[int, list[str]] = {}
729
+ tableformer_time = 0.0
 
 
730
 
731
  if table_pages:
 
732
  logger.info(
733
  f"[{request_id}] Pass 2: Docling TableFormer on {len(table_pages)} table pages"
734
  )
735
+ tf_start = time.time()
736
 
737
  try:
 
738
  mini_pdf_path, page_map = _extract_pages_to_pdf(
739
  input_path, sorted(table_pages), request_id
740
  )
741
 
 
742
  converter = _get_converter()
743
  result = converter.convert(mini_pdf_path)
744
  doc = result.document
745
 
746
+ if doc is not None:
747
+ tables_by_page = _extract_table_markdowns(doc, page_map)
748
+ total_tables = sum(len(v) for v in tables_by_page.values())
749
+ logger.info(
750
+ f"[{request_id}] Pass 2 completed in {time.time() - tf_start:.2f}s "
751
+ f"{total_tables} TableFormer tables extracted"
752
+ )
753
+ else:
754
+ logger.warning(f"[{request_id}] Docling returned None document for table pages")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
755
 
756
  # Clean up mini-PDF
757
+ mini_pdf_path.unlink(missing_ok=True)
 
 
 
 
 
 
 
 
 
 
758
 
759
  except Exception as e:
760
+ logger.error(f"[{request_id}] TableFormer pass failed: {e}")
761
+
762
+ tableformer_time = time.time() - tf_start
 
763
 
764
+ # ---- MERGE: VLM TEXT + TABLEFORMER TABLES ----
765
  md_parts: list[str] = []
766
+ image_count = 0
767
 
768
  for page_no in sorted(vlm_page_texts.keys()):
 
769
  md_parts.append(f"\n\n<!-- Page {page_no + 1} -->\n\n")
770
 
771
  vlm_text = vlm_page_texts[page_no]
772
 
773
  if vlm_text is None:
774
+ # VLM failed note the gap
775
+ md_parts.append(f"[Page {page_no + 1}: VLM extraction failed]\n\n")
776
  else:
777
+ # Merge VLM text with TableFormer tables for this page (if any)
778
  page_tables = tables_by_page.get(page_no, [])
779
  if page_tables:
780
  merged = _merge_vlm_with_tables(vlm_text, page_tables)
 
782
  else:
783
  md_parts.append(vlm_text)
784
 
785
+ # ---- IMAGES (from Docling if requested and tables were processed) ----
786
+ if include_images and table_pages:
787
+ image_dir = output_dir / "images"
788
+ image_dir.mkdir(parents=True, exist_ok=True)
789
+ try:
790
+ converter = _get_converter()
791
+ result = converter.convert(input_path)
792
+ doc = result.document
793
+ if doc:
794
+ for element, _ in doc.iterate_items():
795
+ if isinstance(element, PictureItem):
796
+ if element.image and element.image.pil_image:
797
+ pg = element.prov[0].page_no if element.prov else 0
798
+ image_id = element.self_ref.split("/")[-1]
799
+ image_name = f"page_{pg}_{image_id}.png"
800
+ image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
801
+ image_path = image_dir / image_name
802
+ try:
803
+ element.image.pil_image.save(image_path, format="PNG")
804
+ image_count += 1
805
+ except Exception as e:
806
+ logger.warning(f"[{request_id}] Failed to save image: {e}")
807
+ except Exception as e:
808
+ logger.warning(f"[{request_id}] Image extraction failed: {e}")
809
+
810
  markdown_content = "".join(md_parts)
811
+ pages_processed = len(vlm_page_texts)
812
+ total_time = time.time() - overall_start
813
 
 
814
  logger.info(
815
  f"[{request_id}] VLM-first conversion complete: {pages_processed} pages — "
816
  f"render {render_time:.1f}s + VLM {vlm_time:.1f}s + "
817
+ f"TableFormer {tableformer_time:.1f}s = {total_time:.2f}s total"
818
  )
819
  if pages_processed > 0:
820
  logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
 
829
  include_images: bool,
830
  request_id: str,
831
  ) -> tuple:
832
+ """Fallback: full Docling pipeline when page images are unavailable."""
833
+ logger.info(f"[{request_id}] Fallback: running full Docling pipeline")
 
834
  converter = _get_converter()
835
+
836
  start_time = time.time()
837
  result = converter.convert(input_path)
838
  doc = result.document
 
839
  if doc is None:
840
  raise ValueError("Docling failed to parse document")
841
 
842
  elapsed = time.time() - start_time
843
+ logger.info(f"[{request_id}] Full Docling pipeline completed in {elapsed:.2f}s")
844
 
845
  markdown_content = doc.export_to_markdown()
846
  pages_processed = len(
 
854
  for element, _ in doc.iterate_items():
855
  if isinstance(element, PictureItem):
856
  if element.image and element.image.pil_image:
857
+ pg = element.prov[0].page_no if element.prov else 0
858
  image_id = element.self_ref.split("/")[-1]
859
+ image_name = f"page_{pg}_{image_id}.png"
860
  image_name = re.sub(r'[\\/*?:"<>|]', "", image_name)
861
  image_path = image_dir / image_name
862
  try:
 
907
  async def lifespan(app: FastAPI):
908
  """Startup: initialize Docling converter and check vLLM."""
909
  logger.info("=" * 60)
910
+ logger.info("Starting Docling VLM Parser API v3.1.0...")
911
 
912
  device = _get_device()
913
  logger.info(f"Device: {device}")
 
916
  logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
917
  logger.info(f"CUDA Version: {torch.version.cuda}")
918
  logger.info(
919
+ f"GPU Memory: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB"
920
  )
921
 
922
  logger.info(f"VLM Model: {VLM_MODEL}")
923
  logger.info(f"VLM Endpoint: http://{VLM_HOST}:{VLM_PORT}")
924
+ logger.info(f"VLM Timeout: {VLM_TIMEOUT}s, Concurrency: {VLM_CONCURRENCY}")
925
+ logger.info(f"Render DPI: {RENDER_DPI}")
926
  logger.info(f"Images scale: {IMAGES_SCALE}")
927
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
928
 
 
958
 
959
  app = FastAPI(
960
  title="Docling VLM Parser API",
961
+ description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
962
+ version="3.1.0",
963
  lifespan=lifespan,
964
  )
965
 
 
973
  async def health_check() -> HealthResponse:
974
  """Health check endpoint."""
975
  device = _get_device()
 
 
 
976
 
 
977
  vlm_status = "unknown"
978
  try:
979
  async with httpx.AsyncClient(timeout=5) as client:
 
984
 
985
  return HealthResponse(
986
  status="healthy",
987
+ version="3.1.0",
988
  device=device,
989
+ gpu_name=None,
990
+ vlm_model="active",
991
  vlm_status=vlm_status,
992
  images_scale=IMAGES_SCALE,
993
  )
 
997
  async def parse_document(
998
  file: UploadFile = File(..., description="PDF or image file to parse"),
999
  output_format: str = Form(default="markdown", description="Output format: markdown or json"),
1000
+ images_scale: Optional[float] = Form(default=None, description="Image resolution scale"),
1001
  start_page: int = Form(default=0, description="Starting page (0-indexed)"),
1002
  end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
1003
+ include_images: bool = Form(default=False, description="Include extracted images"),
1004
  _token: str = Depends(verify_token),
1005
  ) -> ParseResponse:
1006
+ """Parse a document file using VLM-first hybrid pipeline."""
 
 
 
 
 
 
 
 
 
 
 
 
1007
  request_id = str(uuid4())[:8]
1008
  start_time = time.time()
1009
 
 
1016
  if output_format not in ("markdown",):
1017
  raise HTTPException(
1018
  status_code=400,
1019
+ detail="Only 'markdown' output_format is supported",
1020
  )
1021
 
1022
  # Validate file size
 
1028
  logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
1029
 
1030
  if file_size > MAX_FILE_SIZE_BYTES:
 
1031
  raise HTTPException(
1032
  status_code=413,
1033
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
 
1037
  allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
1038
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
1039
  if file_ext not in allowed_extensions:
 
1040
  raise HTTPException(
1041
  status_code=400,
1042
  detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
1043
  )
1044
 
 
1045
  use_images_scale = images_scale if images_scale is not None else IMAGES_SCALE
1046
 
1047
  logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1048
  logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
1049
 
1050
  temp_dir = tempfile.mkdtemp()
 
1051
 
1052
  try:
 
1053
  input_path = Path(temp_dir) / f"input{file_ext}"
1054
  await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
 
1055
 
 
1056
  output_dir = Path(temp_dir) / "output"
1057
  output_dir.mkdir(exist_ok=True)
1058
 
 
1059
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
1060
  _convert_document,
1061
  input_path,
 
1067
  end_page,
1068
  )
1069
 
 
1070
  images_zip = None
1071
  if include_images and image_count > 0:
1072
  images_zip, image_count = _create_images_zip(output_dir)
 
1073
 
1074
  total_duration = time.time() - start_time
1075
  logger.info(f"[{request_id}] {'='*50}")
 
1103
  )
1104
  finally:
1105
  shutil.rmtree(temp_dir, ignore_errors=True)
 
1106
 
1107
 
1108
  @app.post("/parse/url", response_model=ParseResponse)
 
1110
  request: URLParseRequest,
1111
  _token: str = Depends(verify_token),
1112
  ) -> ParseResponse:
1113
+ """Parse a document from a URL using VLM-first hybrid pipeline."""
 
 
 
 
 
 
 
1114
  request_id = str(uuid4())[:8]
1115
  start_time = time.time()
1116
 
 
1122
  if request.output_format not in ("markdown",):
1123
  raise HTTPException(
1124
  status_code=400,
1125
+ detail="Only 'markdown' output_format is supported",
1126
  )
1127
 
 
 
1128
  _validate_url(request.url)
 
1129
 
1130
  temp_dir = tempfile.mkdtemp()
 
1131
 
1132
  try:
1133
  # Download file
 
1136
  async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
1137
  response = await client.get(request.url)
1138
  response.raise_for_status()
 
1139
 
1140
  file_size_mb = len(response.content) / (1024 * 1024)
1141
+ logger.info(
1142
+ f"[{request_id}] Download completed in {time.time() - download_start:.2f}s "
1143
+ f"({file_size_mb:.2f} MB)"
1144
+ )
1145
 
1146
+ # Determine file extension (with Content-Type fallback)
 
1147
  url_path = Path(request.url.split("?")[0])
1148
  file_ext = url_path.suffix.lower()
1149
 
1150
+ if not file_ext or file_ext not in {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
 
1151
  content_type = response.headers.get("content-type", "").lower()
1152
  ct_map = {
1153
  "application/pdf": ".pdf",
 
1157
  "image/bmp": ".bmp",
1158
  }
1159
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
 
1160
 
1161
  if len(response.content) > MAX_FILE_SIZE_BYTES:
 
 
 
1162
  raise HTTPException(
1163
  status_code=413,
1164
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
1165
  )
1166
 
 
1167
  input_path = Path(temp_dir) / f"input{file_ext}"
1168
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
 
1169
 
 
1170
  output_dir = Path(temp_dir) / "output"
1171
  output_dir.mkdir(exist_ok=True)
1172
 
 
1173
  use_images_scale = request.images_scale if request.images_scale is not None else IMAGES_SCALE
1174
 
1175
  logger.info(f"[{request_id}] Images scale: {use_images_scale}, VLM: {VLM_MODEL}")
1176
+ logger.info(
1177
+ f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
1178
+ )
1179
 
 
1180
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
1181
  _convert_document,
1182
  input_path,
 
1188
  request.end_page,
1189
  )
1190
 
 
1191
  images_zip = None
1192
  if request.include_images and image_count > 0:
1193
  images_zip, image_count = _create_images_zip(output_dir)
 
1194
 
1195
  total_duration = time.time() - start_time
1196
  logger.info(f"[{request_id}] {'='*50}")
 
1231
  )
1232
  finally:
1233
  shutil.rmtree(temp_dir, ignore_errors=True)
 
1234
 
1235
 
1236
  if __name__ == "__main__":