sidoutcome committed on
Commit
031c76c
·
1 Parent(s): 53b94dc

feat: v3.2.0 - LaTeX→MD conversion, VLM output cleanup, improved prompt, disable thinking

Browse files
Files changed (1) hide show
  1. app.py +82 -14
app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Docling VLM Parser API v3.1.0
3
 
4
  A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
5
  Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
@@ -7,7 +7,7 @@ A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
7
  Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
8
  Merge: VLM text for all pages + TableFormer tables where detected
9
 
10
- v3.1.0 fixes over v3.0.0:
11
  - Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
12
  - Quality: VLM retry on timeout/failure (1 retry with longer timeout)
13
  - Quality: Table detection catches both markdown and LaTeX table patterns
@@ -244,7 +244,7 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
244
  """Enhance image quality for better OCR accuracy.
245
 
246
  Applies CLAHE contrast enhancement only (fast).
247
- Denoising was removed in v3.1.0 — it added ~10s/page with minimal
248
  benefit for VLM-based OCR which handles noise well.
249
  """
250
  img = cv2.imread(image_path)
@@ -270,6 +270,67 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
270
  # Strip Qwen3 <think>...</think> reasoning blocks
271
  _THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
275
  """Send a page image to Qwen3-VL via vLLM for text extraction.
@@ -292,12 +353,17 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
292
  {
293
  "type": "text",
294
  "text": (
295
- "OCR this document page to markdown. "
296
- "Extract ALL text exactly as written, preserving headings, lists, and paragraphs. "
297
- "For tables, output them as MARKDOWN tables using | delimiters and --- separator rows. "
298
- "NEVER use LaTeX tabular format. ALWAYS use markdown pipe tables. "
299
- "For handwritten text, transcribe as accurately as possible. "
300
- "Return ONLY the extracted content, no explanations or commentary."
 
 
 
 
 
301
  ),
302
  },
303
  ],
@@ -305,6 +371,8 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
305
  ],
306
  "max_tokens": 16384,
307
  "temperature": 0.1,
 
 
308
  }
309
 
310
  url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
@@ -333,8 +401,8 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
333
  if content is None:
334
  raise ValueError("vLLM response missing content")
335
 
336
- # Strip <think>...</think> reasoning blocks from Qwen3
337
- content = _THINK_PATTERN.sub("", content).strip()
338
 
339
  return content
340
 
@@ -907,7 +975,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
907
  async def lifespan(app: FastAPI):
908
  """Startup: initialize Docling converter and check vLLM."""
909
  logger.info("=" * 60)
910
- logger.info("Starting Docling VLM Parser API v3.1.0...")
911
 
912
  device = _get_device()
913
  logger.info(f"Device: {device}")
@@ -959,7 +1027,7 @@ async def lifespan(app: FastAPI):
959
  app = FastAPI(
960
  title="Docling VLM Parser API",
961
  description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
962
- version="3.1.0",
963
  lifespan=lifespan,
964
  )
965
 
@@ -984,7 +1052,7 @@ async def health_check() -> HealthResponse:
984
 
985
  return HealthResponse(
986
  status="healthy",
987
- version="3.1.0",
988
  device=device,
989
  gpu_name=None,
990
  vlm_model="active",
 
1
  """
2
+ Docling VLM Parser API v3.2.0
3
 
4
  A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
5
  Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
 
7
  Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
8
  Merge: VLM text for all pages + TableFormer tables where detected
9
 
10
+ v3.2.0 fixes over v3.0.0:
11
  - Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
12
  - Quality: VLM retry on timeout/failure (1 retry with longer timeout)
13
  - Quality: Table detection catches both markdown and LaTeX table patterns
 
244
  """Enhance image quality for better OCR accuracy.
245
 
246
  Applies CLAHE contrast enhancement only (fast).
247
+ Denoising was removed in v3.2.0 — it added ~10s/page with minimal
248
  benefit for VLM-based OCR which handles noise well.
249
  """
250
  img = cv2.imread(image_path)
 
270
  # Strip Qwen3 <think>...</think> reasoning blocks
271
  _THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
272
 
273
# Post-processing patterns for VLM output cleanup
_CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
_CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
_PAGE_N_PATTERN = re.compile(r"^\s*Page\s+\d+\s*$\n?", re.MULTILINE)


def _clean_vlm_output(content: str) -> str:
    """Strip known artifacts from raw VLM OCR output.

    Removes Qwen3 ``<think>`` reasoning blocks, code-fence wrappers,
    HTML comments (coordinate annotations the model sometimes adds),
    and bare ``Page N`` lines, then converts any remaining LaTeX
    tables to markdown pipe tables.

    Args:
        content: Raw text returned by the vision-language model.

    Returns:
        Cleaned markdown with surrounding whitespace trimmed.
    """
    # Reasoning blocks go first so the remaining passes only see the answer.
    cleaned = _THINK_PATTERN.sub("", content).strip()

    # Apply the artifact-removal patterns in declaration order:
    # fence openers, fence closers, HTML comments, "Page N" lines.
    for pattern in (
        _CODE_FENCE_PATTERN,
        _CODE_FENCE_END,
        _HTML_COMMENT_PATTERN,
        _PAGE_N_PATTERN,
    ):
        cleaned = pattern.sub("", cleaned)

    # Safety net: the prompt forbids LaTeX tables, but the model may
    # still emit them; convert whatever slipped through.
    return _convert_latex_tables_to_markdown(cleaned).strip()
303
+
304
+
305
+ def _convert_latex_tables_to_markdown(text: str) -> str:
306
+ """Convert LaTeX tabular environments to markdown pipe tables."""
307
+ latex_pattern = re.compile(
308
+ r"\\begin\{tabular\}\{[^}]*\}(.*?)\\end\{tabular\}", re.DOTALL
309
+ )
310
+
311
+ def _latex_to_md(match: re.Match) -> str:
312
+ body = match.group(1)
313
+ # Remove \hline
314
+ body = re.sub(r"\\hline\s*", "", body)
315
+ # Split on \\
316
+ rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
317
+ if not rows:
318
+ return match.group(0)
319
+
320
+ md_rows = []
321
+ for i, row in enumerate(rows):
322
+ cells = [c.strip() for c in row.split("&")]
323
+ md_row = "| " + " | ".join(cells) + " |"
324
+ md_rows.append(md_row)
325
+ if i == 0:
326
+ # Add separator after header
327
+ sep = "| " + " | ".join(["---"] * len(cells)) + " |"
328
+ md_rows.append(sep)
329
+
330
+ return "\n".join(md_rows)
331
+
332
+ return latex_pattern.sub(_latex_to_md, text)
333
+
334
 
335
  def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
336
  """Send a page image to Qwen3-VL via vLLM for text extraction.
 
353
  {
354
  "type": "text",
355
  "text": (
356
+ "Convert this document page to markdown format.\n\n"
357
+ "Rules:\n"
358
+ "- Extract ALL text content exactly as written\n"
359
+ "- Use ## headings for section titles\n"
360
+ "- Preserve lists, paragraphs, and document structure\n"
361
+ "- Format ALL tables as markdown tables with | delimiters and --- separator rows\n"
362
+ "- NEVER use LaTeX (no \\begin{tabular}, no \\hline, no &)\n"
363
+ "- NEVER wrap output in code fences (no ```)\n"
364
+ "- NEVER add HTML comments or coordinate annotations\n"
365
+ "- For handwritten text, transcribe as accurately as possible\n"
366
+ "- Output ONLY the extracted markdown content, nothing else"
367
  ),
368
  },
369
  ],
 
371
  ],
372
  "max_tokens": 16384,
373
  "temperature": 0.1,
374
+ # Disable Qwen3 thinking mode to avoid <think> tokens
375
+ "chat_template_kwargs": {"enable_thinking": False},
376
  }
377
 
378
  url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
 
401
  if content is None:
402
  raise ValueError("vLLM response missing content")
403
 
404
+ # Clean VLM output (strip think blocks, code fences, HTML comments, convert LaTeX tables)
405
+ content = _clean_vlm_output(content)
406
 
407
  return content
408
 
 
975
  async def lifespan(app: FastAPI):
976
  """Startup: initialize Docling converter and check vLLM."""
977
  logger.info("=" * 60)
978
+ logger.info("Starting Docling VLM Parser API v3.2.0...")
979
 
980
  device = _get_device()
981
  logger.info(f"Device: {device}")
 
1027
  app = FastAPI(
1028
  title="Docling VLM Parser API",
1029
  description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
1030
+ version="3.2.0",
1031
  lifespan=lifespan,
1032
  )
1033
 
 
1052
 
1053
  return HealthResponse(
1054
  status="healthy",
1055
+ version="3.2.0",
1056
  device=device,
1057
  gpu_name=None,
1058
  vlm_model="active",