Ibad ur Rehman committed on
Commit
c28aa68
·
1 Parent(s): b5db7b1

fix: update docling gemini parser

Browse files
Files changed (3) hide show
  1. app.py +47 -21
  2. pipeline.py +469 -33
  3. requirements.txt +2 -2
app.py CHANGED
@@ -1,8 +1,11 @@
1
  """
2
- Docling OCR Parser API v7.1.0
3
 
4
- A FastAPI service using Docling's standard PDF pipeline with EasyOCR
5
- for PDF parsing.
 
 
 
6
  """
7
 
8
  import asyncio
@@ -20,11 +23,18 @@ from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
20
 
21
  from auth import _validate_url, verify_token
22
  from config import (
 
23
  DOCLING_DEVICE,
24
  DOCLING_NUM_THREADS,
 
 
 
 
25
  IMAGES_SCALE,
26
  MAX_FILE_SIZE_BYTES,
27
  MAX_FILE_SIZE_MB,
 
 
28
  logger,
29
  )
30
  from models import HealthResponse, ParseResponse, URLParseRequest
@@ -41,27 +51,34 @@ from pipeline import (
41
  async def lifespan(app: FastAPI):
42
  """Startup: initialize Docling converter."""
43
  logger.info("=" * 60)
44
- logger.info("Starting Docling OCR Parser API v7.1.0...")
45
- logger.info("Initializing Docling EasyOCR converter...")
46
  _get_converter()
47
- logger.info("Docling EasyOCR converter ready")
48
 
 
49
  logger.info(f"Images scale: {IMAGES_SCALE}")
50
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
51
  logger.info(f"Docling device: {DOCLING_DEVICE}")
52
  logger.info(f"Docling threads: {DOCLING_NUM_THREADS}")
 
 
 
 
 
 
53
 
54
  logger.info("=" * 60)
55
- logger.info("Docling OCR Parser API ready (Docling + EasyOCR)")
56
  logger.info("=" * 60)
57
  yield
58
  logger.info("Shutting down Docling VLM Parser API...")
59
 
60
 
61
  app = FastAPI(
62
- title="Docling OCR Parser API",
63
- description="Docling parser with EasyOCR",
64
- version="7.1.0",
65
  lifespan=lifespan,
66
  )
67
 
@@ -71,9 +88,9 @@ async def health_check() -> HealthResponse:
71
  """Health check endpoint."""
72
  return HealthResponse(
73
  status="healthy",
74
- version="7.1.0",
75
- model="Docling + EasyOCR",
76
- gemini_status="disabled",
77
  images_scale=IMAGES_SCALE,
78
  )
79
 
@@ -88,7 +105,7 @@ async def parse_document(
88
  include_images: bool = Form(default=False, description="Include extracted images"),
89
  _token: str = Depends(verify_token),
90
  ) -> ParseResponse:
91
- """Parse a document file using Docling with EasyOCR."""
92
  request_id = str(uuid4())[:8]
93
  start_time = time.time()
94
 
@@ -159,9 +176,9 @@ async def parse_document(
159
  images_zip=images_zip,
160
  image_count=image_count,
161
  pages_processed=pages_processed,
162
- device_used=DOCLING_DEVICE,
163
- vlm_model="Docling + EasyOCR",
164
- gemini_page_count=0,
165
  gemini_pages=gemini_pages,
166
  )
167
  except Exception as e:
@@ -177,7 +194,7 @@ async def parse_document_from_url(
177
  request: URLParseRequest,
178
  _token: str = Depends(verify_token),
179
  ) -> ParseResponse:
180
- """Parse a document from URL using Docling with EasyOCR."""
181
  request_id = str(uuid4())[:8]
182
  start_time = time.time()
183
 
@@ -245,14 +262,23 @@ async def parse_document_from_url(
245
  images_zip=images_zip,
246
  image_count=image_count,
247
  pages_processed=pages_processed,
248
- device_used=DOCLING_DEVICE,
249
- vlm_model="Docling + EasyOCR",
250
- gemini_page_count=0,
251
  gemini_pages=gemini_pages,
252
  )
 
 
 
253
  except Exception as e:
254
  total_duration = time.time() - start_time
255
  logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
256
  return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
257
  finally:
258
  shutil.rmtree(temp_dir, ignore_errors=True)
 
 
 
 
 
 
 
1
  """
2
+ Docling VLM Parser API v6.0.0
3
 
4
+ A FastAPI service using a Docling-first + Gemini hybrid architecture for
5
+ document parsing:
6
+ Pass 1: Docling on a PDF slice or full input (no OCR)
7
+ Pass 2 (API): Gemini on table pages and weak-text pages
8
+ Post: Cross-page artifact removal, table cleanup, deduplication
9
  """
10
 
11
  import asyncio
 
23
 
24
  from auth import _validate_url, verify_token
25
  from config import (
26
+ BITMAP_AREA_THRESHOLD,
27
  DOCLING_DEVICE,
28
  DOCLING_NUM_THREADS,
29
+ GEMINI_API_KEY,
30
+ GEMINI_CONCURRENCY,
31
+ GEMINI_MODEL,
32
+ IMAGE_DOMINANT_THRESHOLD,
33
  IMAGES_SCALE,
34
  MAX_FILE_SIZE_BYTES,
35
  MAX_FILE_SIZE_MB,
36
+ RENDER_DPI,
37
+ SPARSE_TEXT_THRESHOLD,
38
  logger,
39
  )
40
  from models import HealthResponse, ParseResponse, URLParseRequest
 
51
  async def lifespan(app: FastAPI):
52
  """Startup: initialize Docling converter."""
53
  logger.info("=" * 60)
54
+ logger.info("Starting Docling VLM Parser API v6.0.0...")
55
+ logger.info("Initializing Docling converter...")
56
  _get_converter()
57
+ logger.info("Docling converter ready")
58
 
59
+ logger.info(f"Render DPI: {RENDER_DPI}")
60
  logger.info(f"Images scale: {IMAGES_SCALE}")
61
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
62
  logger.info(f"Docling device: {DOCLING_DEVICE}")
63
  logger.info(f"Docling threads: {DOCLING_NUM_THREADS}")
64
+ logger.info(f"Bitmap area threshold: {BITMAP_AREA_THRESHOLD}")
65
+ logger.info(f"Sparse text threshold: {SPARSE_TEXT_THRESHOLD}")
66
+ logger.info(f"Image dominant threshold: {IMAGE_DOMINANT_THRESHOLD}")
67
+ logger.info(f"Gemini Model: {GEMINI_MODEL}")
68
+ logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
69
+ logger.info(f"Gemini Concurrency: {GEMINI_CONCURRENCY}")
70
 
71
  logger.info("=" * 60)
72
+ logger.info("Docling VLM Parser API ready (Docling-first + Gemini hybrid)")
73
  logger.info("=" * 60)
74
  yield
75
  logger.info("Shutting down Docling VLM Parser API...")
76
 
77
 
78
  app = FastAPI(
79
+ title="Docling VLM Parser API",
80
+ description="Docling-first + Gemini hybrid parser",
81
+ version="6.0.0",
82
  lifespan=lifespan,
83
  )
84
 
 
88
  """Health check endpoint."""
89
  return HealthResponse(
90
  status="healthy",
91
+ version="6.0.0",
92
+ model="Docling + Gemini",
93
+ gemini_status="configured" if GEMINI_API_KEY else "not set",
94
  images_scale=IMAGES_SCALE,
95
  )
96
 
 
105
  include_images: bool = Form(default=False, description="Include extracted images"),
106
  _token: str = Depends(verify_token),
107
  ) -> ParseResponse:
108
+ """Parse a document file using the hybrid parser."""
109
  request_id = str(uuid4())[:8]
110
  start_time = time.time()
111
 
 
176
  images_zip=images_zip,
177
  image_count=image_count,
178
  pages_processed=pages_processed,
179
+ device_used="cpu",
180
+ vlm_model="Docling + Gemini",
181
+ gemini_page_count=len(gemini_pages),
182
  gemini_pages=gemini_pages,
183
  )
184
  except Exception as e:
 
194
  request: URLParseRequest,
195
  _token: str = Depends(verify_token),
196
  ) -> ParseResponse:
197
+ """Parse a document from URL using the hybrid parser."""
198
  request_id = str(uuid4())[:8]
199
  start_time = time.time()
200
 
 
262
  images_zip=images_zip,
263
  image_count=image_count,
264
  pages_processed=pages_processed,
265
+ device_used="cpu",
266
+ vlm_model="Docling + Gemini",
267
+ gemini_page_count=len(gemini_pages),
268
  gemini_pages=gemini_pages,
269
  )
270
+ except httpx.HTTPError as e:
271
+ logger.error(f"[{request_id}] Download failed: {e}")
272
+ return ParseResponse(success=False, error=f"Failed to download file from URL (ref: {request_id})")
273
  except Exception as e:
274
  total_duration = time.time() - start_time
275
  logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
276
  return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
277
  finally:
278
  shutil.rmtree(temp_dir, ignore_errors=True)
279
+
280
+
281
+ if __name__ == "__main__":
282
+ import uvicorn
283
+
284
+ uvicorn.run(app, host="0.0.0.0", port=7860)
pipeline.py CHANGED
@@ -1,4 +1,4 @@
1
- """Docling EasyOCR pipeline and file helpers."""
2
 
3
  import base64
4
  import io
@@ -6,39 +6,64 @@ import re
6
  import shutil
7
  import time
8
  import zipfile
 
9
  from pathlib import Path
10
  from typing import BinaryIO, Optional
11
 
 
12
  from docling.datamodel.base_models import InputFormat
13
- from docling.datamodel.document import PictureItem
14
  from docling.datamodel.pipeline_options import (
15
  AcceleratorOptions,
16
- EasyOcrOptions,
17
  PdfPipelineOptions,
18
  TableFormerMode,
19
- TableStructureOptions,
20
  )
21
  from docling.document_converter import DocumentConverter, PdfFormatOption
22
  from pypdf import PdfReader, PdfWriter
23
 
24
- from config import DOCLING_DEVICE, DOCLING_NUM_THREADS, IMAGES_SCALE, logger
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  _converter = None
 
27
  _INLINE_DATA_IMAGE = re.compile(r"!\[[^\]]*\]\(data:image/[^)]+\)", re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
28
 
29
 
30
  def _get_converter():
31
- """Get or create the global Docling converter instance with EasyOCR."""
32
  global _converter
33
  if _converter is None:
34
  pipeline_options = PdfPipelineOptions()
35
- pipeline_options.do_ocr = True
36
- pipeline_options.ocr_options = EasyOcrOptions(lang=["en"])
37
  pipeline_options.do_table_structure = True
38
- pipeline_options.table_structure_options = TableStructureOptions(
39
- do_cell_matching=True,
40
- mode=TableFormerMode.ACCURATE,
41
- )
42
  pipeline_options.images_scale = IMAGES_SCALE
43
  pipeline_options.accelerator_options = AcceleratorOptions(
44
  device=DOCLING_DEVICE,
@@ -49,6 +74,7 @@ def _get_converter():
49
  format_options={
50
  InputFormat.PDF: PdfFormatOption(
51
  pipeline_options=pipeline_options,
 
52
  )
53
  }
54
  )
@@ -60,13 +86,11 @@ def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
60
  shutil.copyfileobj(file_obj, f)
61
 
62
 
63
-
64
  def _save_downloaded_content(input_path: Path, content: bytes) -> None:
65
  with open(input_path, "wb") as f:
66
  f.write(content)
67
 
68
 
69
-
70
  def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
71
  """Create a zip file from extracted images."""
72
  image_dir = output_dir / "images"
@@ -90,13 +114,62 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
90
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
91
 
92
 
93
-
94
  def _resolve_pdf_page_count(input_path: Path) -> int:
95
  from pdf2image.pdf2image import pdfinfo_from_path
96
 
97
  return int(pdfinfo_from_path(str(input_path))["Pages"])
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def _prepare_input_slice(
102
  input_path: Path,
@@ -136,6 +209,265 @@ def _prepare_input_slice(
136
  return slice_path, start_page, last_page, requested_pages
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
141
  """Save Docling picture images to output dir."""
@@ -160,14 +492,6 @@ def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
160
  return image_count
161
 
162
 
163
-
164
- def _clean_markdown(markdown: str) -> str:
165
- markdown = _INLINE_DATA_IMAGE.sub("", markdown)
166
- markdown = re.sub(r"\n{3,}", "\n\n", markdown)
167
- return markdown.strip()
168
-
169
-
170
-
171
  def _convert_document(
172
  input_path: Path,
173
  output_dir: Path,
@@ -176,34 +500,146 @@ def _convert_document(
176
  start_page: int = 0,
177
  end_page: Optional[int] = None,
178
  ) -> tuple:
179
- """Convert a PDF slice using Docling with EasyOCR."""
 
 
 
 
 
 
 
180
  overall_start = time.time()
181
  working_input, page_offset, resolved_end_page, requested_pages = _prepare_input_slice(
182
  input_path, output_dir, request_id, start_page, end_page
183
  )
184
 
185
  converter = _get_converter()
186
- convert_start = time.time()
187
  result = converter.convert(working_input)
188
  doc = result.document
189
  if doc is None:
190
  raise ValueError(
191
  f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
192
  )
193
- convert_time = time.time() - convert_start
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- markdown_content = _clean_markdown(doc.export_to_markdown())
 
 
196
 
197
  image_count = 0
198
  if include_images:
199
  image_count = _save_docling_images(doc, output_dir, request_id)
200
 
201
- total_duration = time.time() - overall_start
 
202
  logger.info(
203
- f"[{request_id}] Docling EasyOCR conversion complete: {requested_pages} pages; "
204
- f"Docling {convert_time:.2f}s total"
 
205
  )
206
- if total_duration > 0:
207
- logger.info(f"[{request_id}] Speed: {requested_pages / total_duration:.2f} pages/sec")
208
 
209
- return markdown_content, None, requested_pages, image_count, []
 
1
+ """Docling-first pipeline, Gemini routing, and file helpers."""
2
 
3
  import base64
4
  import io
 
6
  import shutil
7
  import time
8
  import zipfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
  from pathlib import Path
11
  from typing import BinaryIO, Optional
12
 
13
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
  from docling.datamodel.base_models import InputFormat
15
+ from docling.datamodel.document import PictureItem, TableItem
16
  from docling.datamodel.pipeline_options import (
17
  AcceleratorOptions,
 
18
  PdfPipelineOptions,
19
  TableFormerMode,
 
20
  )
21
  from docling.document_converter import DocumentConverter, PdfFormatOption
22
  from pypdf import PdfReader, PdfWriter
23
 
24
+ from config import (
25
+ BITMAP_AREA_THRESHOLD,
26
+ DOCLING_DEVICE,
27
+ DOCLING_NUM_THREADS,
28
+ GEMINI_API_KEY,
29
+ GEMINI_CONCURRENCY,
30
+ GEMINI_MODEL,
31
+ IMAGE_DOMINANT_THRESHOLD,
32
+ IMAGES_SCALE,
33
+ SPARSE_TEXT_THRESHOLD,
34
+ logger,
35
+ )
36
+ from gemini import _gemini_extract_page
37
+ from postprocess import _post_process_merged_markdown
38
+ from rendering import _pdf_to_page_images
39
 
40
  _converter = None
41
+ _PAGE_MARKER = re.compile(r"^\s*---\s*Page\s+\d+\s*---\s*$", re.MULTILINE)
42
  _INLINE_DATA_IMAGE = re.compile(r"!\[[^\]]*\]\(data:image/[^)]+\)", re.IGNORECASE)
43
+ _ABOUT_BLANK_LINE = re.compile(r"^\s*about:blank(?:\s+\d+/\d+)?\s*$", re.IGNORECASE)
44
+ _BROWSER_PRINT_LINE = re.compile(
45
+ r"^\s*\d{1,2}/\d{1,2}/\d{2,4},\s+\d{1,2}:\d{2}\s*(?:AM|PM)\b.*$",
46
+ re.IGNORECASE,
47
+ )
48
+ _PAGE_COUNTER_LINE = re.compile(r"^\s*\d+\s*/\s*\d+\s*$")
49
+ _URLISH_LINE = re.compile(r"^\s*(?:https?://|www\.)\S+\s*$", re.IGNORECASE)
50
+ _SHORT_ARTIFACT_LINE = re.compile(
51
+ r"^\s*(?:printed from|generated by|page \d+ of \d+|page \d+)\s*$",
52
+ re.IGNORECASE,
53
+ )
54
 
55
 
56
  def _get_converter():
57
+ """Get or create the global Docling converter instance."""
58
  global _converter
59
  if _converter is None:
60
  pipeline_options = PdfPipelineOptions()
61
+ pipeline_options.do_ocr = False
 
62
  pipeline_options.do_table_structure = True
63
+ pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
64
+ pipeline_options.table_structure_options.do_cell_matching = True
65
+ pipeline_options.generate_page_images = False
66
+ pipeline_options.generate_picture_images = True
67
  pipeline_options.images_scale = IMAGES_SCALE
68
  pipeline_options.accelerator_options = AcceleratorOptions(
69
  device=DOCLING_DEVICE,
 
74
  format_options={
75
  InputFormat.PDF: PdfFormatOption(
76
  pipeline_options=pipeline_options,
77
+ backend=DoclingParseDocumentBackend,
78
  )
79
  }
80
  )
 
86
  shutil.copyfileobj(file_obj, f)
87
 
88
 
 
89
  def _save_downloaded_content(input_path: Path, content: bytes) -> None:
90
  with open(input_path, "wb") as f:
91
  f.write(content)
92
 
93
 
 
94
  def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
95
  """Create a zip file from extracted images."""
96
  image_dir = output_dir / "images"
 
114
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
115
 
116
 
 
117
  def _resolve_pdf_page_count(input_path: Path) -> int:
118
  from pdf2image.pdf2image import pdfinfo_from_path
119
 
120
  return int(pdfinfo_from_path(str(input_path))["Pages"])
121
 
122
 
123
+ def _extract_pdf_text_by_page(input_path: Path) -> dict[int, str]:
124
+ """Extract native PDF text per page using pypdf as a routing fallback."""
125
+ if input_path.suffix.lower() != ".pdf":
126
+ return {}
127
+
128
+ text_by_page: dict[int, str] = {}
129
+ try:
130
+ reader = PdfReader(str(input_path))
131
+ for page_no, page in enumerate(reader.pages):
132
+ try:
133
+ text_by_page[page_no] = (page.extract_text() or "").strip()
134
+ except Exception:
135
+ text_by_page[page_no] = ""
136
+ except Exception:
137
+ return {}
138
+ return text_by_page
139
+
140
+
141
+ def _extract_pdf_page_signals(input_path: Path) -> dict[int, dict[str, bool]]:
142
+ """Inspect PDF structure per page for native text and image-backed pages."""
143
+ if input_path.suffix.lower() != ".pdf":
144
+ return {}
145
+
146
+ page_signals: dict[int, dict[str, bool]] = {}
147
+ try:
148
+ reader = PdfReader(str(input_path))
149
+ for page_no, page in enumerate(reader.pages):
150
+ has_fonts = False
151
+ image_count = 0
152
+ try:
153
+ resources = page.get("/Resources") or {}
154
+ font_resources = resources.get("/Font")
155
+ has_fonts = bool(font_resources)
156
+ xobjects = resources.get("/XObject") or {}
157
+ for xobj in xobjects.values():
158
+ subtype = xobj.get("/Subtype")
159
+ if subtype == "/Image":
160
+ image_count += 1
161
+ except Exception:
162
+ pass
163
+ page_signals[page_no] = {
164
+ "has_fonts": has_fonts,
165
+ "has_images": image_count > 0,
166
+ "image_count": image_count,
167
+ "image_only_pdf_page": image_count > 0 and not has_fonts,
168
+ }
169
+ except Exception:
170
+ return {}
171
+ return page_signals
172
+
173
 
174
  def _prepare_input_slice(
175
  input_path: Path,
 
209
  return slice_path, start_page, last_page, requested_pages
210
 
211
 
212
+ def _bitmap_coverage(page) -> float:
213
+ """Compute bitmap coverage ratio for a Docling page."""
214
+ try:
215
+ if page._backend is None or page.size is None:
216
+ return 0.0
217
+ bitmap_rects = page._backend.get_bitmap_rects()
218
+ if not bitmap_rects:
219
+ return 0.0
220
+ page_area = page.size.width * page.size.height
221
+ if page_area <= 0:
222
+ return 0.0
223
+ bitmap_area = sum(max(0.0, rect.area()) for rect in bitmap_rects)
224
+ return min(1.0, bitmap_area / page_area)
225
+ except Exception:
226
+ return 0.0
227
+
228
+
229
+ def _page_has_native_text(page) -> bool:
230
+ """Check whether Docling extracted meaningful native text on a page."""
231
+ try:
232
+ return any(
233
+ getattr(cell, "text", "").strip() and not getattr(cell, "from_ocr", False)
234
+ for cell in page.cells
235
+ )
236
+ except Exception:
237
+ return False
238
+
239
+
240
def _artifact_line_count(text: str) -> int:
    """Count non-blank lines that match any known print/browser artifact pattern."""
    patterns = (
        _ABOUT_BLANK_LINE,
        _BROWSER_PRINT_LINE,
        _PAGE_COUNTER_LINE,
        _URLISH_LINE,
        _SHORT_ARTIFACT_LINE,
    )
    total = 0
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if candidate and any(p.match(candidate) for p in patterns):
            total += 1
    return total
255
+
256
+
257
def _clean_extracted_text(text: str) -> str:
    """Strip each line, drop blanks and artifact lines, and rejoin the rest."""
    artifact_patterns = (
        _ABOUT_BLANK_LINE,
        _BROWSER_PRINT_LINE,
        _PAGE_COUNTER_LINE,
        _URLISH_LINE,
        _SHORT_ARTIFACT_LINE,
    )
    kept = [
        stripped
        for stripped in (line.strip() for line in text.splitlines())
        if stripped and not any(p.match(stripped) for p in artifact_patterns)
    ]
    return "\n".join(kept).strip()
273
+
274
+
275
+ def _repetition_ratio(text: str) -> float:
276
+ lines = [line.strip().lower() for line in text.splitlines() if line.strip()]
277
+ if not lines:
278
+ return 0.0
279
+ unique = len(set(lines))
280
+ return 1.0 - (unique / len(lines))
281
+
282
+
283
+ def _looks_meaningful_text(cleaned_text: str) -> bool:
284
+ compact = re.sub(r"\s+", "", cleaned_text)
285
+ words = re.findall(r"[A-Za-z][A-Za-z0-9'&/-]*", cleaned_text)
286
+ return len(compact) >= 120 or len(set(w.lower() for w in words)) >= 20
287
+
288
+
289
def _build_page_markdown(doc, page_no: int, elements_by_page: dict[int, list]) -> str:
    """Render one page's Docling elements (pictures excluded) as markdown."""
    fragments: list[str] = []
    for item in elements_by_page.get(page_no, []):
        if isinstance(item, PictureItem):
            # Pictures are exported separately; skip them here.
            continue
        try:
            fragments.append(item.export_to_markdown(doc=doc))
        except Exception:
            # Fall back to the element's raw text when markdown export fails.
            fallback = getattr(item, "text", "").strip()
            if fallback:
                fragments.append(fallback)
    merged = "\n\n".join(f for f in fragments if f and f.strip())
    merged = _INLINE_DATA_IMAGE.sub("", merged)
    return _PAGE_MARKER.sub("", merged).strip()
304
+
305
+
306
def _extract_page_markdowns(doc, page_count: int, elements_by_page: dict[int, list]) -> dict[int, str]:
    """Build per-page markdown strictly from Docling provenance."""
    markdowns: dict[int, str] = {}
    for page_no in range(page_count):
        markdowns[page_no] = _build_page_markdown(doc, page_no, elements_by_page)
    return markdowns
312
+
313
+
314
+ def _normalize_docling_page_no(page) -> int:
315
+ """Normalize Docling page numbering to a zero-based index."""
316
+ raw_page_no = int(page.page_no)
317
+ return raw_page_no - 1 if raw_page_no > 0 else raw_page_no
318
+
319
+
320
def _table_pages(doc) -> set[int]:
    """Zero-based page indices on which Docling detected a table."""
    found: set[int] = set()
    for item, _level in doc.iterate_items():
        if not isinstance(item, TableItem):
            continue
        if not item.prov:
            continue
        raw = int(item.prov[0].page_no)
        found.add(raw - 1 if raw > 0 else raw)
    return found
328
+
329
+
330
def _picture_pages(doc) -> set[int]:
    """Zero-based page indices that contain Docling picture items."""
    found: set[int] = set()
    for item, _level in doc.iterate_items():
        if not isinstance(item, PictureItem):
            continue
        if not item.prov:
            continue
        raw = int(item.prov[0].page_no)
        found.add(raw - 1 if raw > 0 else raw)
    return found
338
+
339
+
340
+ def _build_elements_by_page(doc) -> dict[int, list]:
341
+ """Index Docling elements by page number."""
342
+ elements_by_page: dict[int, list] = {}
343
+ for element, _ in doc.iterate_items():
344
+ if element.prov:
345
+ prov_page_no = int(element.prov[0].page_no)
346
+ page_no = prov_page_no - 1 if prov_page_no > 0 else prov_page_no
347
+ elements_by_page.setdefault(page_no, []).append(element)
348
+ return elements_by_page
349
+
350
+
351
def _routing_decision(
    page_no: int,
    page_markdown: str,
    pdf_text: str,
    page_pdf_signals: dict[str, bool],
    page,
    table_pages: set[int],
    picture_pages: set[int],
) -> tuple[str, list[str], dict]:
    """Decide whether a page should use Docling or Gemini.

    Args:
        page_no: Zero-based page index within the processed slice.
        page_markdown: Docling's per-page markdown (may be empty).
        pdf_text: Native text extracted by pypdf for the same page.
        page_pdf_signals: Structural signals from pypdf (fonts/images);
            note values include an int ``image_count``, not only bools.
        page: Docling page object (used for bitmap coverage and cells).
        table_pages: Pages where Docling emitted a TableItem.
        picture_pages: Pages containing Docling picture items.

    Returns:
        ``(route, reasons, metrics)`` where route is "gemini" or "docling",
        reasons lists every triggered heuristic, and metrics is a dict of
        the intermediate values for logging/debugging.
    """
    # --- Raw signals -----------------------------------------------------
    bitmap_coverage = _bitmap_coverage(page)
    image_only_pdf_page = page_pdf_signals.get("image_only_pdf_page", False)
    image_count = int(page_pdf_signals.get("image_count", 0) or 0)
    has_images = bool(page_pdf_signals.get("has_images", False))
    # An image-only PDF page is treated as fully covered even if Docling's
    # bitmap rectangles missed it.
    effective_bitmap_coverage = max(bitmap_coverage, 1.0 if image_only_pdf_page else 0.0)
    image_dominant = bitmap_coverage >= IMAGE_DOMINANT_THRESHOLD
    pdf_text_len = len(re.sub(r"\s+", "", pdf_text))
    docling_native_text = _page_has_native_text(page)
    cleaned_docling_text = _clean_extracted_text(page_markdown)
    cleaned_pdf_text = _clean_extracted_text(pdf_text)
    cleaned_docling_text_len = len(re.sub(r"\s+", "", cleaned_docling_text))
    cleaned_pdf_text_len = len(re.sub(r"\s+", "", cleaned_pdf_text))
    # Prefer Docling markdown for routing; fall back to pypdf text.
    raw_routing_text = page_markdown if page_markdown.strip() else pdf_text
    routing_text = raw_routing_text
    artifact_lines = _artifact_line_count(routing_text)
    # max(1, ...) guards the division below for fully blank pages.
    nonempty_lines = max(1, len([line for line in routing_text.splitlines() if line.strip()]))
    artifact_line_ratio = artifact_lines / nonempty_lines
    repetition_ratio = _repetition_ratio(routing_text)
    meaningful_text_present = (
        _looks_meaningful_text(cleaned_docling_text)
        or _looks_meaningful_text(cleaned_pdf_text)
    )
    native_text_present = docling_native_text or meaningful_text_present
    docling_text_len = len(re.sub(r"\s+", "", routing_text))
    page_empty = docling_text_len == 0
    table_like_page = _looks_table_like(routing_text)
    picture_page = page_no in picture_pages
    # --- Derived quality flags ------------------------------------------
    # Mostly artifacts or heavily repeated lines -> output looks like junk.
    junk_text_heavy = artifact_line_ratio >= 0.5 or repetition_ratio >= 0.45
    image_heavy_page = (
        image_only_pdf_page
        or image_dominant
        or effective_bitmap_coverage > BITMAP_AREA_THRESHOLD
        or image_count >= 3
        or (has_images and not meaningful_text_present)
    )
    low_quality_docling_output = (
        page_empty
        or junk_text_heavy
        or (docling_text_len < SPARSE_TEXT_THRESHOLD and has_images and not meaningful_text_present)
    )

    # --- Routing reasons: any hit sends the page to Gemini ---------------
    reasons: list[str] = []
    if page_no in table_pages:
        reasons.append("table_page")
    if table_like_page:
        reasons.append("table_like_page")
    if picture_page and not meaningful_text_present:
        reasons.append("picture_without_native_text")
    if page_empty:
        reasons.append("docling_empty")
    if image_heavy_page and not meaningful_text_present:
        reasons.append("image_heavy_weak_text")
    if junk_text_heavy and has_images:
        reasons.append("junk_text_with_images")
    if low_quality_docling_output and has_images:
        reasons.append("low_quality_docling_output")

    route = "gemini" if reasons else "docling"
    # Metrics are rounded for compact structured logging only.
    metrics = {
        "bitmap_coverage": round(bitmap_coverage, 4),
        "effective_bitmap_coverage": round(effective_bitmap_coverage, 4),
        "image_count": image_count,
        "docling_text_len": docling_text_len,
        "cleaned_docling_text_len": cleaned_docling_text_len,
        "pdf_text_len": pdf_text_len,
        "cleaned_pdf_text_len": cleaned_pdf_text_len,
        "native_text_present": native_text_present,
        "meaningful_text_present": meaningful_text_present,
        "artifact_line_ratio": round(artifact_line_ratio, 4),
        "repetition_ratio": round(repetition_ratio, 4),
        "junk_text_heavy": junk_text_heavy,
        "image_dominant": image_dominant,
        "image_heavy_page": image_heavy_page,
        "image_only_pdf_page": image_only_pdf_page,
        "picture_page": picture_page,
        "table_page": page_no in table_pages,
        "table_like_page": table_like_page,
        "page_empty": page_empty,
    }
    return route, reasons, metrics
441
+
442
+
443
+ def _looks_table_like(page_markdown: str) -> bool:
444
+ """Heuristic detector for tabular pages when Docling doesn't emit TableItem."""
445
+ lines = [line.strip() for line in page_markdown.splitlines() if line.strip()]
446
+ if len(lines) < 4:
447
+ return False
448
+
449
+ header_terms = {
450
+ "code", "name", "avg", "sq", "sq.", "sqft", "rent", "units", "occupied",
451
+ "vacant", "notice", "leased", "model", "admin", "trend", "availability",
452
+ "date", "rate", "%", "unit", "floor", "suite",
453
+ }
454
+ header_hits = 0
455
+ for line in lines[:6]:
456
+ tokens = re.findall(r"[A-Za-z%\.]+", line.lower())
457
+ if len(header_terms.intersection(tokens)) >= 4:
458
+ header_hits += 1
459
+
460
+ numeric_dense_lines = 0
461
+ for line in lines:
462
+ numbers = re.findall(r"\d+(?:[.,]\d+)?%?", line)
463
+ words = re.findall(r"[A-Za-z][A-Za-z\-\/&]*", line)
464
+ if len(numbers) >= 4 and len(words) >= 2:
465
+ numeric_dense_lines += 1
466
+
467
+ long_single_block = any(len(line) > 250 and len(re.findall(r"\d+(?:[.,]\d+)?", line)) >= 10 for line in lines)
468
+
469
+ return header_hits >= 1 and (numeric_dense_lines >= 2 or long_single_block)
470
+
471
 
472
  def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
473
  """Save Docling picture images to output dir."""
 
492
  return image_count
493
 
494
 
 
 
 
 
 
 
 
 
495
  def _convert_document(
496
  input_path: Path,
497
  output_dir: Path,
 
500
  start_page: int = 0,
501
  end_page: Optional[int] = None,
502
  ) -> tuple:
503
+ """
504
+ Docling-first + Gemini hybrid conversion.
505
+
506
+ Flow:
507
+ - Docling parses the requested PDF slice with OCR disabled.
508
+ - Pages route to Gemini when they are table pages or Docling output is too weak.
509
+ - One parser wins per page; post-processing happens only after merge.
510
+ """
511
  overall_start = time.time()
512
  working_input, page_offset, resolved_end_page, requested_pages = _prepare_input_slice(
513
  input_path, output_dir, request_id, start_page, end_page
514
  )
515
 
516
  converter = _get_converter()
517
+ docling_start = time.time()
518
  result = converter.convert(working_input)
519
  doc = result.document
520
  if doc is None:
521
  raise ValueError(
522
  f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
523
  )
524
+ docling_time = time.time() - docling_start
525
+
526
+ elements_by_page = _build_elements_by_page(doc)
527
+ table_pages = _table_pages(doc)
528
+ picture_pages = _picture_pages(doc)
529
+
530
+ page_markdowns = _extract_page_markdowns(doc, len(result.pages), elements_by_page)
531
+ pdf_text_by_page = _extract_pdf_text_by_page(working_input)
532
+ pdf_page_signals = _extract_pdf_page_signals(working_input)
533
+ routes: dict[int, tuple[str, list[str], dict]] = {}
534
+ for page in result.pages:
535
+ page_no = _normalize_docling_page_no(page)
536
+ page_md = page_markdowns.get(page_no, "")
537
+ pdf_text = pdf_text_by_page.get(page_no, "")
538
+ page_pdf_signals = pdf_page_signals.get(page_no, {})
539
+ if not page_md and pdf_text:
540
+ page_markdowns[page_no] = pdf_text
541
+ page_md = pdf_text
542
+ routes[page_no] = _routing_decision(
543
+ page_no,
544
+ page_md,
545
+ pdf_text,
546
+ page_pdf_signals,
547
+ page,
548
+ table_pages,
549
+ picture_pages,
550
+ )
551
+
552
+ gemini_targets = [page_no for page_no, (route, _, _) in routes.items() if route == "gemini"]
553
+ gemini_target_pages = [page_offset + page_no + 1 for page_no in gemini_targets]
554
+
555
+ logger.info(
556
+ f"[{request_id}] Pass 1: Docling processed {len(page_markdowns)} pages in {docling_time:.2f}s; "
557
+ f"table pages: {len(table_pages)}; gemini targets: {len(gemini_targets)}; "
558
+ f"requested range: {page_offset}-{resolved_end_page if resolved_end_page is not None else 'end'}"
559
+ )
560
+
561
+ for page_no in sorted(routes):
562
+ route, reasons, metrics = routes[page_no]
563
+ logger.info(
564
+ f"[{request_id}] Route page {page_offset + page_no + 1}: {route}; "
565
+ f"reasons={reasons or ['docling_default']}; metrics={metrics}"
566
+ )
567
+
568
+ gemini_page_texts: dict[int, str] = {}
569
+ render_time = 0.0
570
+ gemini_time = 0.0
571
+
572
+ if gemini_targets and GEMINI_API_KEY:
573
+ render_start = time.time()
574
+ page_images = _pdf_to_page_images(working_input, request_id, 0, None)
575
+ render_time = time.time() - render_start
576
+ page_image_map = {pno: pbytes for pno, pbytes in page_images}
577
+
578
+ logger.info(
579
+ f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(gemini_targets)} routed pages "
580
+ f"({GEMINI_CONCURRENCY} concurrent)"
581
+ )
582
+
583
+ gemini_start = time.time()
584
+ with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
585
+ futures = {
586
+ executor.submit(
587
+ _gemini_extract_page,
588
+ page_image_map[pno],
589
+ request_id,
590
+ page_offset + pno,
591
+ ): pno
592
+ for pno in gemini_targets
593
+ if pno in page_image_map
594
+ }
595
+ for future in as_completed(futures):
596
+ pno = futures[future]
597
+ try:
598
+ text = future.result()
599
+ if text:
600
+ gemini_page_texts[pno] = text.strip()
601
+ logger.info(
602
+ f"[{request_id}] Gemini processed page {page_offset + pno + 1} "
603
+ f"({len(text)} chars)"
604
+ )
605
+ except Exception as e:
606
+ logger.warning(
607
+ f"[{request_id}] Gemini failed page {page_offset + pno + 1}: {e}; "
608
+ "falling back to Docling page output"
609
+ )
610
+ gemini_time = time.time() - gemini_start
611
+ elif gemini_targets and not GEMINI_API_KEY:
612
+ logger.warning(
613
+ f"[{request_id}] {len(gemini_targets)} pages routed to Gemini but GEMINI_API_KEY is not set; "
614
+ "falling back to Docling output"
615
+ )
616
+
617
+ merged_pages: list[str] = []
618
+ for page_no in sorted(page_markdowns):
619
+ page_label = page_offset + page_no + 1
620
+ content = gemini_page_texts.get(page_no, page_markdowns[page_no]).strip()
621
+ merged_pages.append(f"--- Page {page_label} ---")
622
+ if content:
623
+ merged_pages.append(content)
624
+
625
+ markdown_content = "\n\n".join(merged_pages).strip()
626
 
627
+ post_start = time.time()
628
+ markdown_content = _post_process_merged_markdown(markdown_content)
629
+ post_time = time.time() - post_start
630
 
631
  image_count = 0
632
  if include_images:
633
  image_count = _save_docling_images(doc, output_dir, request_id)
634
 
635
+ pages_processed = len(page_markdowns) or requested_pages
636
+ total_time = time.time() - overall_start
637
  logger.info(
638
+ f"[{request_id}] Docling+Gemini conversion complete: {pages_processed} pages; "
639
+ f"Docling {docling_time:.1f}s + Render {render_time:.1f}s + Gemini {gemini_time:.1f}s "
640
+ f"+ Post {post_time:.1f}s = {total_time:.2f}s total"
641
  )
642
+ if pages_processed > 0:
643
+ logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
644
 
645
+ return markdown_content, None, pages_processed, image_count, gemini_target_pages
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
- # Docling EasyOCR Parser API Dependencies
2
 
3
  docling>=2.15.0
4
- easyocr>=1.7.2
5
  fastapi>=0.115.0
6
  uvicorn[standard]>=0.32.0
7
  python-multipart>=0.0.9
@@ -11,3 +10,4 @@ opencv-python-headless>=4.10.0
11
  pdf2image>=1.17.0
12
  huggingface-hub>=0.25.0
13
  pypdf>=5.1.0
 
 
1
+ # Docling-first + Gemini Hybrid Parser API Dependencies
2
 
3
  docling>=2.15.0
 
4
  fastapi>=0.115.0
5
  uvicorn[standard]>=0.32.0
6
  python-multipart>=0.0.9
 
10
  pdf2image>=1.17.0
11
  huggingface-hub>=0.25.0
12
  pypdf>=5.1.0
13
+ onnxruntime>=1.19.0