Ibad ur Rehman committed on
Commit
c28aa68
·
1 Parent(s): b5db7b1

fix: update docling gemini parser

Browse files
Files changed (3) hide show
  1. app.py +47 -21
  2. pipeline.py +469 -33
  3. requirements.txt +2 -2
app.py CHANGED
@@ -1,8 +1,11 @@
1
  """
2
- Docling OCR Parser API v7.1.0
3
 
4
- A FastAPI service using Docling's standard PDF pipeline with EasyOCR
5
- for PDF parsing.
 
 
 
6
  """
7
 
8
  import asyncio
@@ -20,11 +23,18 @@ from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
20
 
21
  from auth import _validate_url, verify_token
22
  from config import (
 
23
  DOCLING_DEVICE,
24
  DOCLING_NUM_THREADS,
 
 
 
 
25
  IMAGES_SCALE,
26
  MAX_FILE_SIZE_BYTES,
27
  MAX_FILE_SIZE_MB,
 
 
28
  logger,
29
  )
30
  from models import HealthResponse, ParseResponse, URLParseRequest
@@ -41,27 +51,34 @@ from pipeline import (
41
  async def lifespan(app: FastAPI):
42
  """Startup: initialize Docling converter."""
43
  logger.info("=" * 60)
44
- logger.info("Starting Docling OCR Parser API v7.1.0...")
45
- logger.info("Initializing Docling EasyOCR converter...")
46
  _get_converter()
47
- logger.info("Docling EasyOCR converter ready")
48
 
 
49
  logger.info(f"Images scale: {IMAGES_SCALE}")
50
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
51
  logger.info(f"Docling device: {DOCLING_DEVICE}")
52
  logger.info(f"Docling threads: {DOCLING_NUM_THREADS}")
 
 
 
 
 
 
53
 
54
  logger.info("=" * 60)
55
- logger.info("Docling OCR Parser API ready (Docling + EasyOCR)")
56
  logger.info("=" * 60)
57
  yield
58
  logger.info("Shutting down Docling VLM Parser API...")
59
 
60
 
61
  app = FastAPI(
62
- title="Docling OCR Parser API",
63
- description="Docling parser with EasyOCR",
64
- version="7.1.0",
65
  lifespan=lifespan,
66
  )
67
 
@@ -71,9 +88,9 @@ async def health_check() -> HealthResponse:
71
  """Health check endpoint."""
72
  return HealthResponse(
73
  status="healthy",
74
- version="7.1.0",
75
- model="Docling + EasyOCR",
76
- gemini_status="disabled",
77
  images_scale=IMAGES_SCALE,
78
  )
79
 
@@ -88,7 +105,7 @@ async def parse_document(
88
  include_images: bool = Form(default=False, description="Include extracted images"),
89
  _token: str = Depends(verify_token),
90
  ) -> ParseResponse:
91
- """Parse a document file using Docling with EasyOCR."""
92
  request_id = str(uuid4())[:8]
93
  start_time = time.time()
94
 
@@ -159,9 +176,9 @@ async def parse_document(
159
  images_zip=images_zip,
160
  image_count=image_count,
161
  pages_processed=pages_processed,
162
- device_used=DOCLING_DEVICE,
163
- vlm_model="Docling + EasyOCR",
164
- gemini_page_count=0,
165
  gemini_pages=gemini_pages,
166
  )
167
  except Exception as e:
@@ -177,7 +194,7 @@ async def parse_document_from_url(
177
  request: URLParseRequest,
178
  _token: str = Depends(verify_token),
179
  ) -> ParseResponse:
180
- """Parse a document from URL using Docling with EasyOCR."""
181
  request_id = str(uuid4())[:8]
182
  start_time = time.time()
183
 
@@ -245,14 +262,23 @@ async def parse_document_from_url(
245
  images_zip=images_zip,
246
  image_count=image_count,
247
  pages_processed=pages_processed,
248
- device_used=DOCLING_DEVICE,
249
- vlm_model="Docling + EasyOCR",
250
- gemini_page_count=0,
251
  gemini_pages=gemini_pages,
252
  )
 
 
 
253
  except Exception as e:
254
  total_duration = time.time() - start_time
255
  logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
256
  return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
257
  finally:
258
  shutil.rmtree(temp_dir, ignore_errors=True)
 
 
 
 
 
 
 
1
  """
2
+ Docling VLM Parser API v6.0.0
3
 
4
+ A FastAPI service using a Docling-first + Gemini hybrid architecture for
5
+ document parsing:
6
+ Pass 1: Docling on a PDF slice or full input (no OCR)
7
+ Pass 2 (API): Gemini on table pages and weak-text pages
8
+ Post: Cross-page artifact removal, table cleanup, deduplication
9
  """
10
 
11
  import asyncio
 
23
 
24
  from auth import _validate_url, verify_token
25
  from config import (
26
+ BITMAP_AREA_THRESHOLD,
27
  DOCLING_DEVICE,
28
  DOCLING_NUM_THREADS,
29
+ GEMINI_API_KEY,
30
+ GEMINI_CONCURRENCY,
31
+ GEMINI_MODEL,
32
+ IMAGE_DOMINANT_THRESHOLD,
33
  IMAGES_SCALE,
34
  MAX_FILE_SIZE_BYTES,
35
  MAX_FILE_SIZE_MB,
36
+ RENDER_DPI,
37
+ SPARSE_TEXT_THRESHOLD,
38
  logger,
39
  )
40
  from models import HealthResponse, ParseResponse, URLParseRequest
 
51
  async def lifespan(app: FastAPI):
52
  """Startup: initialize Docling converter."""
53
  logger.info("=" * 60)
54
+ logger.info("Starting Docling VLM Parser API v6.0.0...")
55
+ logger.info("Initializing Docling converter...")
56
  _get_converter()
57
+ logger.info("Docling converter ready")
58
 
59
+ logger.info(f"Render DPI: {RENDER_DPI}")
60
  logger.info(f"Images scale: {IMAGES_SCALE}")
61
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
62
  logger.info(f"Docling device: {DOCLING_DEVICE}")
63
  logger.info(f"Docling threads: {DOCLING_NUM_THREADS}")
64
+ logger.info(f"Bitmap area threshold: {BITMAP_AREA_THRESHOLD}")
65
+ logger.info(f"Sparse text threshold: {SPARSE_TEXT_THRESHOLD}")
66
+ logger.info(f"Image dominant threshold: {IMAGE_DOMINANT_THRESHOLD}")
67
+ logger.info(f"Gemini Model: {GEMINI_MODEL}")
68
+ logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
69
+ logger.info(f"Gemini Concurrency: {GEMINI_CONCURRENCY}")
70
 
71
  logger.info("=" * 60)
72
+ logger.info("Docling VLM Parser API ready (Docling-first + Gemini hybrid)")
73
  logger.info("=" * 60)
74
  yield
75
  logger.info("Shutting down Docling VLM Parser API...")
76
 
77
 
78
  app = FastAPI(
79
+ title="Docling VLM Parser API",
80
+ description="Docling-first + Gemini hybrid parser",
81
+ version="6.0.0",
82
  lifespan=lifespan,
83
  )
84
 
 
88
  """Health check endpoint."""
89
  return HealthResponse(
90
  status="healthy",
91
+ version="6.0.0",
92
+ model="Docling + Gemini",
93
+ gemini_status="configured" if GEMINI_API_KEY else "not set",
94
  images_scale=IMAGES_SCALE,
95
  )
96
 
 
105
  include_images: bool = Form(default=False, description="Include extracted images"),
106
  _token: str = Depends(verify_token),
107
  ) -> ParseResponse:
108
+ """Parse a document file using the hybrid parser."""
109
  request_id = str(uuid4())[:8]
110
  start_time = time.time()
111
 
 
176
  images_zip=images_zip,
177
  image_count=image_count,
178
  pages_processed=pages_processed,
179
+ device_used="cpu",
180
+ vlm_model="Docling + Gemini",
181
+ gemini_page_count=len(gemini_pages),
182
  gemini_pages=gemini_pages,
183
  )
184
  except Exception as e:
 
194
  request: URLParseRequest,
195
  _token: str = Depends(verify_token),
196
  ) -> ParseResponse:
197
+ """Parse a document from URL using the hybrid parser."""
198
  request_id = str(uuid4())[:8]
199
  start_time = time.time()
200
 
 
262
  images_zip=images_zip,
263
  image_count=image_count,
264
  pages_processed=pages_processed,
265
+ device_used="cpu",
266
+ vlm_model="Docling + Gemini",
267
+ gemini_page_count=len(gemini_pages),
268
  gemini_pages=gemini_pages,
269
  )
270
+ except httpx.HTTPError as e:
271
+ logger.error(f"[{request_id}] Download failed: {e}")
272
+ return ParseResponse(success=False, error=f"Failed to download file from URL (ref: {request_id})")
273
  except Exception as e:
274
  total_duration = time.time() - start_time
275
  logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
276
  return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
277
  finally:
278
  shutil.rmtree(temp_dir, ignore_errors=True)
279
+
280
+
281
+ if __name__ == "__main__":
282
+ import uvicorn
283
+
284
+ uvicorn.run(app, host="0.0.0.0", port=7860)
pipeline.py CHANGED
@@ -1,4 +1,4 @@
1
- """Docling EasyOCR pipeline and file helpers."""
2
 
3
  import base64
4
  import io
@@ -6,39 +6,64 @@ import re
6
  import shutil
7
  import time
8
  import zipfile
 
9
  from pathlib import Path
10
  from typing import BinaryIO, Optional
11
 
 
12
  from docling.datamodel.base_models import InputFormat
13
- from docling.datamodel.document import PictureItem
14
  from docling.datamodel.pipeline_options import (
15
  AcceleratorOptions,
16
- EasyOcrOptions,
17
  PdfPipelineOptions,
18
  TableFormerMode,
19
- TableStructureOptions,
20
  )
21
  from docling.document_converter import DocumentConverter, PdfFormatOption
22
  from pypdf import PdfReader, PdfWriter
23
 
24
- from config import DOCLING_DEVICE, DOCLING_NUM_THREADS, IMAGES_SCALE, logger
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  _converter = None
 
27
  _INLINE_DATA_IMAGE = re.compile(r"!\[[^\]]*\]\(data:image/[^)]+\)", re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
28
 
29
 
30
  def _get_converter():
31
- """Get or create the global Docling converter instance with EasyOCR."""
32
  global _converter
33
  if _converter is None:
34
  pipeline_options = PdfPipelineOptions()
35
- pipeline_options.do_ocr = True
36
- pipeline_options.ocr_options = EasyOcrOptions(lang=["en"])
37
  pipeline_options.do_table_structure = True
38
- pipeline_options.table_structure_options = TableStructureOptions(
39
- do_cell_matching=True,
40
- mode=TableFormerMode.ACCURATE,
41
- )
42
  pipeline_options.images_scale = IMAGES_SCALE
43
  pipeline_options.accelerator_options = AcceleratorOptions(
44
  device=DOCLING_DEVICE,
@@ -49,6 +74,7 @@ def _get_converter():
49
  format_options={
50
  InputFormat.PDF: PdfFormatOption(
51
  pipeline_options=pipeline_options,
 
52
  )
53
  }
54
  )
@@ -60,13 +86,11 @@ def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
60
  shutil.copyfileobj(file_obj, f)
61
 
62
 
63
-
64
  def _save_downloaded_content(input_path: Path, content: bytes) -> None:
65
  with open(input_path, "wb") as f:
66
  f.write(content)
67
 
68
 
69
-
70
  def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
71
  """Create a zip file from extracted images."""
72
  image_dir = output_dir / "images"
@@ -90,13 +114,62 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
90
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
91
 
92
 
93
-
94
  def _resolve_pdf_page_count(input_path: Path) -> int:
95
  from pdf2image.pdf2image import pdfinfo_from_path
96
 
97
  return int(pdfinfo_from_path(str(input_path))["Pages"])
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def _prepare_input_slice(
102
  input_path: Path,
@@ -136,6 +209,265 @@ def _prepare_input_slice(
136
  return slice_path, start_page, last_page, requested_pages
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
141
  """Save Docling picture images to output dir."""
@@ -160,14 +492,6 @@ def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
160
  return image_count
161
 
162
 
163
-
164
- def _clean_markdown(markdown: str) -> str:
165
- markdown = _INLINE_DATA_IMAGE.sub("", markdown)
166
- markdown = re.sub(r"\n{3,}", "\n\n", markdown)
167
- return markdown.strip()
168
-
169
-
170
-
171
  def _convert_document(
172
  input_path: Path,
173
  output_dir: Path,
@@ -176,34 +500,146 @@ def _convert_document(
176
  start_page: int = 0,
177
  end_page: Optional[int] = None,
178
  ) -> tuple:
179
- """Convert a PDF slice using Docling with EasyOCR."""
 
 
 
 
 
 
 
180
  overall_start = time.time()
181
  working_input, page_offset, resolved_end_page, requested_pages = _prepare_input_slice(
182
  input_path, output_dir, request_id, start_page, end_page
183
  )
184
 
185
  converter = _get_converter()
186
- convert_start = time.time()
187
  result = converter.convert(working_input)
188
  doc = result.document
189
  if doc is None:
190
  raise ValueError(
191
  f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
192
  )
193
- convert_time = time.time() - convert_start
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- markdown_content = _clean_markdown(doc.export_to_markdown())
 
 
196
 
197
  image_count = 0
198
  if include_images:
199
  image_count = _save_docling_images(doc, output_dir, request_id)
200
 
201
- total_duration = time.time() - overall_start
 
202
  logger.info(
203
- f"[{request_id}] Docling EasyOCR conversion complete: {requested_pages} pages; "
204
- f"Docling {convert_time:.2f}s total"
 
205
  )
206
- if total_duration > 0:
207
- logger.info(f"[{request_id}] Speed: {requested_pages / total_duration:.2f} pages/sec")
208
 
209
- return markdown_content, None, requested_pages, image_count, []
 
1
+ """Docling-first pipeline, Gemini routing, and file helpers."""
2
 
3
  import base64
4
  import io
 
6
  import shutil
7
  import time
8
  import zipfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
  from pathlib import Path
11
  from typing import BinaryIO, Optional
12
 
13
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
  from docling.datamodel.base_models import InputFormat
15
+ from docling.datamodel.document import PictureItem, TableItem
16
  from docling.datamodel.pipeline_options import (
17
  AcceleratorOptions,
 
18
  PdfPipelineOptions,
19
  TableFormerMode,
 
20
  )
21
  from docling.document_converter import DocumentConverter, PdfFormatOption
22
  from pypdf import PdfReader, PdfWriter
23
 
24
+ from config import (
25
+ BITMAP_AREA_THRESHOLD,
26
+ DOCLING_DEVICE,
27
+ DOCLING_NUM_THREADS,
28
+ GEMINI_API_KEY,
29
+ GEMINI_CONCURRENCY,
30
+ GEMINI_MODEL,
31
+ IMAGE_DOMINANT_THRESHOLD,
32
+ IMAGES_SCALE,
33
+ SPARSE_TEXT_THRESHOLD,
34
+ logger,
35
+ )
36
+ from gemini import _gemini_extract_page
37
+ from postprocess import _post_process_merged_markdown
38
+ from rendering import _pdf_to_page_images
39
 
40
  _converter = None
41
+ _PAGE_MARKER = re.compile(r"^\s*---\s*Page\s+\d+\s*---\s*$", re.MULTILINE)
42
  _INLINE_DATA_IMAGE = re.compile(r"!\[[^\]]*\]\(data:image/[^)]+\)", re.IGNORECASE)
43
+ _ABOUT_BLANK_LINE = re.compile(r"^\s*about:blank(?:\s+\d+/\d+)?\s*$", re.IGNORECASE)
44
+ _BROWSER_PRINT_LINE = re.compile(
45
+ r"^\s*\d{1,2}/\d{1,2}/\d{2,4},\s+\d{1,2}:\d{2}\s*(?:AM|PM)\b.*$",
46
+ re.IGNORECASE,
47
+ )
48
+ _PAGE_COUNTER_LINE = re.compile(r"^\s*\d+\s*/\s*\d+\s*$")
49
+ _URLISH_LINE = re.compile(r"^\s*(?:https?://|www\.)\S+\s*$", re.IGNORECASE)
50
+ _SHORT_ARTIFACT_LINE = re.compile(
51
+ r"^\s*(?:printed from|generated by|page \d+ of \d+|page \d+)\s*$",
52
+ re.IGNORECASE,
53
+ )
54
 
55
 
56
  def _get_converter():
57
+ """Get or create the global Docling converter instance."""
58
  global _converter
59
  if _converter is None:
60
  pipeline_options = PdfPipelineOptions()
61
+ pipeline_options.do_ocr = False
 
62
  pipeline_options.do_table_structure = True
63
+ pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
64
+ pipeline_options.table_structure_options.do_cell_matching = True
65
+ pipeline_options.generate_page_images = False
66
+ pipeline_options.generate_picture_images = True
67
  pipeline_options.images_scale = IMAGES_SCALE
68
  pipeline_options.accelerator_options = AcceleratorOptions(
69
  device=DOCLING_DEVICE,
 
74
  format_options={
75
  InputFormat.PDF: PdfFormatOption(
76
  pipeline_options=pipeline_options,
77
+ backend=DoclingParseDocumentBackend,
78
  )
79
  }
80
  )
 
86
  shutil.copyfileobj(file_obj, f)
87
 
88
 
 
89
  def _save_downloaded_content(input_path: Path, content: bytes) -> None:
90
  with open(input_path, "wb") as f:
91
  f.write(content)
92
 
93
 
 
94
  def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
95
  """Create a zip file from extracted images."""
96
  image_dir = output_dir / "images"
 
114
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
115
 
116
 
 
117
  def _resolve_pdf_page_count(input_path: Path) -> int:
118
  from pdf2image.pdf2image import pdfinfo_from_path
119
 
120
  return int(pdfinfo_from_path(str(input_path))["Pages"])
121
 
122
 
123
+ def _extract_pdf_text_by_page(input_path: Path) -> dict[int, str]:
124
+ """Extract native PDF text per page using pypdf as a routing fallback."""
125
+ if input_path.suffix.lower() != ".pdf":
126
+ return {}
127
+
128
+ text_by_page: dict[int, str] = {}
129
+ try:
130
+ reader = PdfReader(str(input_path))
131
+ for page_no, page in enumerate(reader.pages):
132
+ try:
133
+ text_by_page[page_no] = (page.extract_text() or "").strip()
134
+ except Exception:
135
+ text_by_page[page_no] = ""
136
+ except Exception:
137
+ return {}
138
+ return text_by_page
139
+
140
+
141
+ def _extract_pdf_page_signals(input_path: Path) -> dict[int, dict[str, bool]]:
142
+ """Inspect PDF structure per page for native text and image-backed pages."""
143
+ if input_path.suffix.lower() != ".pdf":
144
+ return {}
145
+
146
+ page_signals: dict[int, dict[str, bool]] = {}
147
+ try:
148
+ reader = PdfReader(str(input_path))
149
+ for page_no, page in enumerate(reader.pages):
150
+ has_fonts = False
151
+ image_count = 0
152
+ try:
153
+ resources = page.get("/Resources") or {}
154
+ font_resources = resources.get("/Font")
155
+ has_fonts = bool(font_resources)
156
+ xobjects = resources.get("/XObject") or {}
157
+ for xobj in xobjects.values():
158
+ subtype = xobj.get("/Subtype")
159
+ if subtype == "/Image":
160
+ image_count += 1
161
+ except Exception:
162
+ pass
163
+ page_signals[page_no] = {
164
+ "has_fonts": has_fonts,
165
+ "has_images": image_count > 0,
166
+ "image_count": image_count,
167
+ "image_only_pdf_page": image_count > 0 and not has_fonts,
168
+ }
169
+ except Exception:
170
+ return {}
171
+ return page_signals
172
+
173
 
174
  def _prepare_input_slice(
175
  input_path: Path,
 
209
  return slice_path, start_page, last_page, requested_pages
210
 
211
 
212
+ def _bitmap_coverage(page) -> float:
213
+ """Compute bitmap coverage ratio for a Docling page."""
214
+ try:
215
+ if page._backend is None or page.size is None:
216
+ return 0.0
217
+ bitmap_rects = page._backend.get_bitmap_rects()
218
+ if not bitmap_rects:
219
+ return 0.0
220
+ page_area = page.size.width * page.size.height
221
+ if page_area <= 0:
222
+ return 0.0
223
+ bitmap_area = sum(max(0.0, rect.area()) for rect in bitmap_rects)
224
+ return min(1.0, bitmap_area / page_area)
225
+ except Exception:
226
+ return 0.0
227
+
228
+
229
+ def _page_has_native_text(page) -> bool:
230
+ """Check whether Docling extracted meaningful native text on a page."""
231
+ try:
232
+ return any(
233
+ getattr(cell, "text", "").strip() and not getattr(cell, "from_ocr", False)
234
+ for cell in page.cells
235
+ )
236
+ except Exception:
237
+ return False
238
+
239
+
240
def _artifact_line_count(text: str) -> int:
    """Count non-blank lines that match any known print/browser artifact pattern."""
    patterns = (
        _ABOUT_BLANK_LINE,
        _BROWSER_PRINT_LINE,
        _PAGE_COUNTER_LINE,
        _URLISH_LINE,
        _SHORT_ARTIFACT_LINE,
    )
    total = 0
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if candidate and any(p.match(candidate) for p in patterns):
            total += 1
    return total
255
+
256
+
257
def _clean_extracted_text(text: str) -> str:
    """Strip each line, drop blanks and artifact lines, and rejoin the rest."""
    artifact_patterns = (
        _ABOUT_BLANK_LINE,
        _BROWSER_PRINT_LINE,
        _PAGE_COUNTER_LINE,
        _URLISH_LINE,
        _SHORT_ARTIFACT_LINE,
    )
    kept = [
        stripped
        for stripped in (line.strip() for line in text.splitlines())
        if stripped and not any(p.match(stripped) for p in artifact_patterns)
    ]
    return "\n".join(kept).strip()
273
+
274
+
275
+ def _repetition_ratio(text: str) -> float:
276
+ lines = [line.strip().lower() for line in text.splitlines() if line.strip()]
277
+ if not lines:
278
+ return 0.0
279
+ unique = len(set(lines))
280
+ return 1.0 - (unique / len(lines))
281
+
282
+
283
+ def _looks_meaningful_text(cleaned_text: str) -> bool:
284
+ compact = re.sub(r"\s+", "", cleaned_text)
285
+ words = re.findall(r"[A-Za-z][A-Za-z0-9'&/-]*", cleaned_text)
286
+ return len(compact) >= 120 or len(set(w.lower() for w in words)) >= 20
287
+
288
+
289
def _build_page_markdown(doc, page_no: int, elements_by_page: dict[int, list]) -> str:
    """Render one page's Docling elements (pictures excluded) as markdown."""
    fragments: list[str] = []
    for item in elements_by_page.get(page_no, []):
        if isinstance(item, PictureItem):
            # Pictures are exported separately; skip them here.
            continue
        try:
            fragments.append(item.export_to_markdown(doc=doc))
        except Exception:
            # Fall back to the element's raw text when markdown export fails.
            fallback = getattr(item, "text", "").strip()
            if fallback:
                fragments.append(fallback)
    merged = "\n\n".join(f for f in fragments if f and f.strip())
    merged = _INLINE_DATA_IMAGE.sub("", merged)
    return _PAGE_MARKER.sub("", merged).strip()
304
+
305
+
306
def _extract_page_markdowns(doc, page_count: int, elements_by_page: dict[int, list]) -> dict[int, str]:
    """Build per-page markdown strictly from Docling provenance."""
    markdowns: dict[int, str] = {}
    for page_no in range(page_count):
        markdowns[page_no] = _build_page_markdown(doc, page_no, elements_by_page)
    return markdowns
312
+
313
+
314
+ def _normalize_docling_page_no(page) -> int:
315
+ """Normalize Docling page numbering to a zero-based index."""
316
+ raw_page_no = int(page.page_no)
317
+ return raw_page_no - 1 if raw_page_no > 0 else raw_page_no
318
+
319
+
320
def _table_pages(doc) -> set[int]:
    """Zero-based page indices on which Docling detected a table."""
    found: set[int] = set()
    for item, _level in doc.iterate_items():
        if not isinstance(item, TableItem):
            continue
        if not item.prov:
            continue
        raw = int(item.prov[0].page_no)
        found.add(raw - 1 if raw > 0 else raw)
    return found
328
+
329
+
330
def _picture_pages(doc) -> set[int]:
    """Zero-based page indices that contain Docling picture items."""
    found: set[int] = set()
    for item, _level in doc.iterate_items():
        if not isinstance(item, PictureItem):
            continue
        if not item.prov:
            continue
        raw = int(item.prov[0].page_no)
        found.add(raw - 1 if raw > 0 else raw)
    return found
338
+
339
+
340
+ def _build_elements_by_page(doc) -> dict[int, list]:
341
+ """Index Docling elements by page number."""
342
+ elements_by_page: dict[int, list] = {}
343
+ for element, _ in doc.iterate_items():
344
+ if element.prov:
345
+ prov_page_no = int(element.prov[0].page_no)
346
+ page_no = prov_page_no - 1 if prov_page_no > 0 else prov_page_no
347
+ elements_by_page.setdefault(page_no, []).append(element)
348
+ return elements_by_page
349
+
350
+
351
def _routing_decision(
    page_no: int,
    page_markdown: str,
    pdf_text: str,
    page_pdf_signals: dict[str, bool],
    page,
    table_pages: set[int],
    picture_pages: set[int],
) -> tuple[str, list[str], dict]:
    """Decide whether a page should use Docling or Gemini.

    Args:
        page_no: Zero-based page index within the processed slice.
        page_markdown: Docling's per-page markdown (may be empty).
        pdf_text: Native text extracted by pypdf for the same page.
        page_pdf_signals: Structural signals from pypdf (fonts/images);
            note values include an int ``image_count``, not only bools.
        page: Docling page object (used for bitmap coverage and cells).
        table_pages: Pages where Docling emitted a TableItem.
        picture_pages: Pages containing Docling picture items.

    Returns:
        ``(route, reasons, metrics)`` where route is "gemini" or "docling",
        reasons lists every triggered heuristic, and metrics is a dict of
        the intermediate values for logging/debugging.
    """
    # --- Raw signals -----------------------------------------------------
    bitmap_coverage = _bitmap_coverage(page)
    image_only_pdf_page = page_pdf_signals.get("image_only_pdf_page", False)
    image_count = int(page_pdf_signals.get("image_count", 0) or 0)
    has_images = bool(page_pdf_signals.get("has_images", False))
    # An image-only PDF page is treated as fully covered even if Docling's
    # bitmap rectangles missed it.
    effective_bitmap_coverage = max(bitmap_coverage, 1.0 if image_only_pdf_page else 0.0)
    image_dominant = bitmap_coverage >= IMAGE_DOMINANT_THRESHOLD
    pdf_text_len = len(re.sub(r"\s+", "", pdf_text))
    docling_native_text = _page_has_native_text(page)
    cleaned_docling_text = _clean_extracted_text(page_markdown)
    cleaned_pdf_text = _clean_extracted_text(pdf_text)
    cleaned_docling_text_len = len(re.sub(r"\s+", "", cleaned_docling_text))
    cleaned_pdf_text_len = len(re.sub(r"\s+", "", cleaned_pdf_text))
    # Prefer Docling markdown for routing; fall back to pypdf text.
    raw_routing_text = page_markdown if page_markdown.strip() else pdf_text
    routing_text = raw_routing_text
    artifact_lines = _artifact_line_count(routing_text)
    # max(1, ...) guards the division below for fully blank pages.
    nonempty_lines = max(1, len([line for line in routing_text.splitlines() if line.strip()]))
    artifact_line_ratio = artifact_lines / nonempty_lines
    repetition_ratio = _repetition_ratio(routing_text)
    meaningful_text_present = (
        _looks_meaningful_text(cleaned_docling_text)
        or _looks_meaningful_text(cleaned_pdf_text)
    )
    native_text_present = docling_native_text or meaningful_text_present
    docling_text_len = len(re.sub(r"\s+", "", routing_text))
    page_empty = docling_text_len == 0
    table_like_page = _looks_table_like(routing_text)
    picture_page = page_no in picture_pages
    # --- Derived quality flags ------------------------------------------
    # Mostly artifacts or heavily repeated lines -> output looks like junk.
    junk_text_heavy = artifact_line_ratio >= 0.5 or repetition_ratio >= 0.45
    image_heavy_page = (
        image_only_pdf_page
        or image_dominant
        or effective_bitmap_coverage > BITMAP_AREA_THRESHOLD
        or image_count >= 3
        or (has_images and not meaningful_text_present)
    )
    low_quality_docling_output = (
        page_empty
        or junk_text_heavy
        or (docling_text_len < SPARSE_TEXT_THRESHOLD and has_images and not meaningful_text_present)
    )

    # --- Routing reasons: any hit sends the page to Gemini ---------------
    reasons: list[str] = []
    if page_no in table_pages:
        reasons.append("table_page")
    if table_like_page:
        reasons.append("table_like_page")
    if picture_page and not meaningful_text_present:
        reasons.append("picture_without_native_text")
    if page_empty:
        reasons.append("docling_empty")
    if image_heavy_page and not meaningful_text_present:
        reasons.append("image_heavy_weak_text")
    if junk_text_heavy and has_images:
        reasons.append("junk_text_with_images")
    if low_quality_docling_output and has_images:
        reasons.append("low_quality_docling_output")

    route = "gemini" if reasons else "docling"
    # Metrics are rounded for compact structured logging only.
    metrics = {
        "bitmap_coverage": round(bitmap_coverage, 4),
        "effective_bitmap_coverage": round(effective_bitmap_coverage, 4),
        "image_count": image_count,
        "docling_text_len": docling_text_len,
        "cleaned_docling_text_len": cleaned_docling_text_len,
        "pdf_text_len": pdf_text_len,
        "cleaned_pdf_text_len": cleaned_pdf_text_len,
        "native_text_present": native_text_present,
        "meaningful_text_present": meaningful_text_present,
        "artifact_line_ratio": round(artifact_line_ratio, 4),
        "repetition_ratio": round(repetition_ratio, 4),
        "junk_text_heavy": junk_text_heavy,
        "image_dominant": image_dominant,
        "image_heavy_page": image_heavy_page,
        "image_only_pdf_page": image_only_pdf_page,
        "picture_page": picture_page,
        "table_page": page_no in table_pages,
        "table_like_page": table_like_page,
        "page_empty": page_empty,
    }
    return route, reasons, metrics
441
+
442
+
443
+ def _looks_table_like(page_markdown: str) -> bool:
444
+ """Heuristic detector for tabular pages when Docling doesn't emit TableItem."""
445
+ lines = [line.strip() for line in page_markdown.splitlines() if line.strip()]
446
+ if len(lines) < 4:
447
+ return False
448
+
449
+ header_terms = {
450
+ "code", "name", "avg", "sq", "sq.", "sqft", "rent", "units", "occupied",
451
+ "vacant", "notice", "leased", "model", "admin", "trend", "availability",
452
+ "date", "rate", "%", "unit", "floor", "suite",
453
+ }
454
+ header_hits = 0
455
+ for line in lines[:6]:
456
+ tokens = re.findall(r"[A-Za-z%\.]+", line.lower())
457
+ if len(header_terms.intersection(tokens)) >= 4:
458
+ header_hits += 1
459
+
460
+ numeric_dense_lines = 0
461
+ for line in lines:
462
+ numbers = re.findall(r"\d+(?:[.,]\d+)?%?", line)
463
+ words = re.findall(r"[A-Za-z][A-Za-z\-\/&]*", line)
464
+ if len(numbers) >= 4 and len(words) >= 2:
465
+ numeric_dense_lines += 1
466
+
467
+ long_single_block = any(len(line) > 250 and len(re.findall(r"\d+(?:[.,]\d+)?", line)) >= 10 for line in lines)
468
+
469
+ return header_hits >= 1 and (numeric_dense_lines >= 2 or long_single_block)
470
+
471
 
472
  def _save_docling_images(doc, output_dir: Path, request_id: str) -> int:
473
  """Save Docling picture images to output dir."""
 
492
  return image_count
493
 
494
 
 
 
 
 
 
 
 
 
495
  def _convert_document(
496
  input_path: Path,
497
  output_dir: Path,
 
500
  start_page: int = 0,
501
  end_page: Optional[int] = None,
502
  ) -> tuple:
503
+ """
504
+ Docling-first + Gemini hybrid conversion.
505
+
506
+ Flow:
507
+ - Docling parses the requested PDF slice with OCR disabled.
508
+ - Pages route to Gemini when they are table pages or Docling output is too weak.
509
+ - One parser wins per page; post-processing happens only after merge.
510
+ """
511
  overall_start = time.time()
512
  working_input, page_offset, resolved_end_page, requested_pages = _prepare_input_slice(
513
  input_path, output_dir, request_id, start_page, end_page
514
  )
515
 
516
  converter = _get_converter()
517
+ docling_start = time.time()
518
  result = converter.convert(working_input)
519
  doc = result.document
520
  if doc is None:
521
  raise ValueError(
522
  f"Docling failed to parse document (status: {getattr(result, 'status', 'unknown')})"
523
  )
524
+ docling_time = time.time() - docling_start
525
+
526
+ elements_by_page = _build_elements_by_page(doc)
527
+ table_pages = _table_pages(doc)
528
+ picture_pages = _picture_pages(doc)
529
+
530
+ page_markdowns = _extract_page_markdowns(doc, len(result.pages), elements_by_page)
531
+ pdf_text_by_page = _extract_pdf_text_by_page(working_input)
532
+ pdf_page_signals = _extract_pdf_page_signals(working_input)
533
+ routes: dict[int, tuple[str, list[str], dict]] = {}
534
+ for page in result.pages:
535
+ page_no = _normalize_docling_page_no(page)
536
+ page_md = page_markdowns.get(page_no, "")
537
+ pdf_text = pdf_text_by_page.get(page_no, "")
538
+ page_pdf_signals = pdf_page_signals.get(page_no, {})
539
+ if not page_md and pdf_text:
540
+ page_markdowns[page_no] = pdf_text
541
+ page_md = pdf_text
542
+ routes[page_no] = _routing_decision(
543
+ page_no,
544
+ page_md,
545
+ pdf_text,
546
+ page_pdf_signals,
547
+ page,
548
+ table_pages,
549
+ picture_pages,
550
+ )
551
+
552
+ gemini_targets = [page_no for page_no, (route, _, _) in routes.items() if route == "gemini"]
553
+ gemini_target_pages = [page_offset + page_no + 1 for page_no in gemini_targets]
554
+
555
+ logger.info(
556
+ f"[{request_id}] Pass 1: Docling processed {len(page_markdowns)} pages in {docling_time:.2f}s; "
557
+ f"table pages: {len(table_pages)}; gemini targets: {len(gemini_targets)}; "
558
+ f"requested range: {page_offset}-{resolved_end_page if resolved_end_page is not None else 'end'}"
559
+ )
560
+
561
+ for page_no in sorted(routes):
562
+ route, reasons, metrics = routes[page_no]
563
+ logger.info(
564
+ f"[{request_id}] Route page {page_offset + page_no + 1}: {route}; "
565
+ f"reasons={reasons or ['docling_default']}; metrics={metrics}"
566
+ )
567
+
568
+ gemini_page_texts: dict[int, str] = {}
569
+ render_time = 0.0
570
+ gemini_time = 0.0
571
+
572
+ if gemini_targets and GEMINI_API_KEY:
573
+ render_start = time.time()
574
+ page_images = _pdf_to_page_images(working_input, request_id, 0, None)
575
+ render_time = time.time() - render_start
576
+ page_image_map = {pno: pbytes for pno, pbytes in page_images}
577
+
578
+ logger.info(
579
+ f"[{request_id}] Pass 2: Gemini {GEMINI_MODEL} on {len(gemini_targets)} routed pages "
580
+ f"({GEMINI_CONCURRENCY} concurrent)"
581
+ )
582
+
583
+ gemini_start = time.time()
584
+ with ThreadPoolExecutor(max_workers=GEMINI_CONCURRENCY) as executor:
585
+ futures = {
586
+ executor.submit(
587
+ _gemini_extract_page,
588
+ page_image_map[pno],
589
+ request_id,
590
+ page_offset + pno,
591
+ ): pno
592
+ for pno in gemini_targets
593
+ if pno in page_image_map
594
+ }
595
+ for future in as_completed(futures):
596
+ pno = futures[future]
597
+ try:
598
+ text = future.result()
599
+ if text:
600
+ gemini_page_texts[pno] = text.strip()
601
+ logger.info(
602
+ f"[{request_id}] Gemini processed page {page_offset + pno + 1} "
603
+ f"({len(text)} chars)"
604
+ )
605
+ except Exception as e:
606
+ logger.warning(
607
+ f"[{request_id}] Gemini failed page {page_offset + pno + 1}: {e}; "
608
+ "falling back to Docling page output"
609
+ )
610
+ gemini_time = time.time() - gemini_start
611
+ elif gemini_targets and not GEMINI_API_KEY:
612
+ logger.warning(
613
+ f"[{request_id}] {len(gemini_targets)} pages routed to Gemini but GEMINI_API_KEY is not set; "
614
+ "falling back to Docling output"
615
+ )
616
+
617
+ merged_pages: list[str] = []
618
+ for page_no in sorted(page_markdowns):
619
+ page_label = page_offset + page_no + 1
620
+ content = gemini_page_texts.get(page_no, page_markdowns[page_no]).strip()
621
+ merged_pages.append(f"--- Page {page_label} ---")
622
+ if content:
623
+ merged_pages.append(content)
624
+
625
+ markdown_content = "\n\n".join(merged_pages).strip()
626
 
627
+ post_start = time.time()
628
+ markdown_content = _post_process_merged_markdown(markdown_content)
629
+ post_time = time.time() - post_start
630
 
631
  image_count = 0
632
  if include_images:
633
  image_count = _save_docling_images(doc, output_dir, request_id)
634
 
635
+ pages_processed = len(page_markdowns) or requested_pages
636
+ total_time = time.time() - overall_start
637
  logger.info(
638
+ f"[{request_id}] Docling+Gemini conversion complete: {pages_processed} pages; "
639
+ f"Docling {docling_time:.1f}s + Render {render_time:.1f}s + Gemini {gemini_time:.1f}s "
640
+ f"+ Post {post_time:.1f}s = {total_time:.2f}s total"
641
  )
642
+ if pages_processed > 0:
643
+ logger.info(f"[{request_id}] Speed: {pages_processed / total_time:.2f} pages/sec")
644
 
645
+ return markdown_content, None, pages_processed, image_count, gemini_target_pages
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
- # Docling EasyOCR Parser API Dependencies
2
 
3
  docling>=2.15.0
4
- easyocr>=1.7.2
5
  fastapi>=0.115.0
6
  uvicorn[standard]>=0.32.0
7
  python-multipart>=0.0.9
@@ -11,3 +10,4 @@ opencv-python-headless>=4.10.0
11
  pdf2image>=1.17.0
12
  huggingface-hub>=0.25.0
13
  pypdf>=5.1.0
 
 
1
+ # Docling-first + Gemini Hybrid Parser API Dependencies
2
 
3
  docling>=2.15.0
 
4
  fastapi>=0.115.0
5
  uvicorn[standard]>=0.32.0
6
  python-multipart>=0.0.9
 
10
  pdf2image>=1.17.0
11
  huggingface-hub>=0.25.0
12
  pypdf>=5.1.0
13
+ onnxruntime>=1.19.0