Spaces:
Running on T4
Running on T4
Commit ·
031c76c
1
Parent(s): 53b94dc
feat: v3.2.0 - LaTeX→MD conversion, VLM output cleanup, improved prompt, disable thinking
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Docling VLM Parser API v3.
|
| 3 |
|
| 4 |
A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
| 5 |
Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
|
|
@@ -7,7 +7,7 @@ A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
|
| 7 |
Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
|
| 8 |
Merge: VLM text for all pages + TableFormer tables where detected
|
| 9 |
|
| 10 |
-
v3.
|
| 11 |
- Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
|
| 12 |
- Quality: VLM retry on timeout/failure (1 retry with longer timeout)
|
| 13 |
- Quality: Table detection catches both markdown and LaTeX table patterns
|
|
@@ -244,7 +244,7 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
|
|
| 244 |
"""Enhance image quality for better OCR accuracy.
|
| 245 |
|
| 246 |
Applies CLAHE contrast enhancement only (fast).
|
| 247 |
-
Denoising was removed in v3.
|
| 248 |
benefit for VLM-based OCR which handles noise well.
|
| 249 |
"""
|
| 250 |
img = cv2.imread(image_path)
|
|
@@ -270,6 +270,67 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
|
|
| 270 |
# Strip Qwen3 <think>...</think> reasoning blocks
|
| 271 |
_THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
|
| 275 |
"""Send a page image to Qwen3-VL via vLLM for text extraction.
|
|
@@ -292,12 +353,17 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
|
|
| 292 |
{
|
| 293 |
"type": "text",
|
| 294 |
"text": (
|
| 295 |
-
"
|
| 296 |
-
"
|
| 297 |
-
"
|
| 298 |
-
"
|
| 299 |
-
"
|
| 300 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
),
|
| 302 |
},
|
| 303 |
],
|
|
@@ -305,6 +371,8 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
|
|
| 305 |
],
|
| 306 |
"max_tokens": 16384,
|
| 307 |
"temperature": 0.1,
|
|
|
|
|
|
|
| 308 |
}
|
| 309 |
|
| 310 |
url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
|
|
@@ -333,8 +401,8 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
|
|
| 333 |
if content is None:
|
| 334 |
raise ValueError("vLLM response missing content")
|
| 335 |
|
| 336 |
-
#
|
| 337 |
-
content =
|
| 338 |
|
| 339 |
return content
|
| 340 |
|
|
@@ -907,7 +975,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
|
| 907 |
async def lifespan(app: FastAPI):
|
| 908 |
"""Startup: initialize Docling converter and check vLLM."""
|
| 909 |
logger.info("=" * 60)
|
| 910 |
-
logger.info("Starting Docling VLM Parser API v3.
|
| 911 |
|
| 912 |
device = _get_device()
|
| 913 |
logger.info(f"Device: {device}")
|
|
@@ -959,7 +1027,7 @@ async def lifespan(app: FastAPI):
|
|
| 959 |
app = FastAPI(
|
| 960 |
title="Docling VLM Parser API",
|
| 961 |
description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
|
| 962 |
-
version="3.
|
| 963 |
lifespan=lifespan,
|
| 964 |
)
|
| 965 |
|
|
@@ -984,7 +1052,7 @@ async def health_check() -> HealthResponse:
|
|
| 984 |
|
| 985 |
return HealthResponse(
|
| 986 |
status="healthy",
|
| 987 |
-
version="3.
|
| 988 |
device=device,
|
| 989 |
gpu_name=None,
|
| 990 |
vlm_model="active",
|
|
|
|
| 1 |
"""
|
| 2 |
+
Docling VLM Parser API v3.2.0
|
| 3 |
|
| 4 |
A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
| 5 |
Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
|
|
|
|
| 7 |
Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
|
| 8 |
Merge: VLM text for all pages + TableFormer tables where detected
|
| 9 |
|
| 10 |
+
v3.2.0 fixes over v3.0.0:
|
| 11 |
- Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
|
| 12 |
- Quality: VLM retry on timeout/failure (1 retry with longer timeout)
|
| 13 |
- Quality: Table detection catches both markdown and LaTeX table patterns
|
|
|
|
| 244 |
"""Enhance image quality for better OCR accuracy.
|
| 245 |
|
| 246 |
Applies CLAHE contrast enhancement only (fast).
|
| 247 |
+
Denoising was removed in v3.2.0 — it added ~10s/page with minimal
|
| 248 |
benefit for VLM-based OCR which handles noise well.
|
| 249 |
"""
|
| 250 |
img = cv2.imread(image_path)
|
|
|
|
| 270 |
# Strip Qwen3 <think>...</think> reasoning blocks
_THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)

# Post-processing patterns for VLM output cleanup
# Opening code fence, optionally tagged markdown/md/text (VLMs often wrap
# their whole answer in a fence despite being told not to).
_CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
# Matching closing fence at the end of a line.
_CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
# HTML comments — some VLMs emit coordinate/debug annotations this way.
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
# Standalone "Page N" lines (page header/footer artifacts), whole line removed.
_PAGE_N_PATTERN = re.compile(r"^\s*Page\s+\d+\s*$\n?", re.MULTILINE)
+
def _clean_vlm_output(content: str) -> str:
    """Normalize raw VLM output into clean markdown.

    Drops <think> reasoning blocks, code-fence wrappers, HTML comments,
    and standalone 'Page N' lines, then rewrites any leftover LaTeX
    tabular environments as markdown pipe tables.
    """
    # Reasoning blocks first, trimming surrounding whitespace right away.
    text = _THINK_PATTERN.sub("", content).strip()

    # Remaining regex-based artifact removals, applied in a fixed order:
    # opening fences, closing fences, HTML comments, "Page N" lines.
    for pattern in (
        _CODE_FENCE_PATTERN,
        _CODE_FENCE_END,
        _HTML_COMMENT_PATTERN,
        _PAGE_N_PATTERN,
    ):
        text = pattern.sub("", text)

    # Safety net: the prompt forbids LaTeX tables, but the model may still
    # emit them — convert any stragglers to markdown here.
    return _convert_latex_tables_to_markdown(text).strip()
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _convert_latex_tables_to_markdown(text: str) -> str:
|
| 306 |
+
"""Convert LaTeX tabular environments to markdown pipe tables."""
|
| 307 |
+
latex_pattern = re.compile(
|
| 308 |
+
r"\\begin\{tabular\}\{[^}]*\}(.*?)\\end\{tabular\}", re.DOTALL
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
def _latex_to_md(match: re.Match) -> str:
|
| 312 |
+
body = match.group(1)
|
| 313 |
+
# Remove \hline
|
| 314 |
+
body = re.sub(r"\\hline\s*", "", body)
|
| 315 |
+
# Split on \\
|
| 316 |
+
rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
|
| 317 |
+
if not rows:
|
| 318 |
+
return match.group(0)
|
| 319 |
+
|
| 320 |
+
md_rows = []
|
| 321 |
+
for i, row in enumerate(rows):
|
| 322 |
+
cells = [c.strip() for c in row.split("&")]
|
| 323 |
+
md_row = "| " + " | ".join(cells) + " |"
|
| 324 |
+
md_rows.append(md_row)
|
| 325 |
+
if i == 0:
|
| 326 |
+
# Add separator after header
|
| 327 |
+
sep = "| " + " | ".join(["---"] * len(cells)) + " |"
|
| 328 |
+
md_rows.append(sep)
|
| 329 |
+
|
| 330 |
+
return "\n".join(md_rows)
|
| 331 |
+
|
| 332 |
+
return latex_pattern.sub(_latex_to_md, text)
|
| 333 |
+
|
| 334 |
|
| 335 |
def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
|
| 336 |
"""Send a page image to Qwen3-VL via vLLM for text extraction.
|
|
|
|
| 353 |
{
|
| 354 |
"type": "text",
|
| 355 |
"text": (
|
| 356 |
+
"Convert this document page to markdown format.\n\n"
|
| 357 |
+
"Rules:\n"
|
| 358 |
+
"- Extract ALL text content exactly as written\n"
|
| 359 |
+
"- Use ## headings for section titles\n"
|
| 360 |
+
"- Preserve lists, paragraphs, and document structure\n"
|
| 361 |
+
"- Format ALL tables as markdown tables with | delimiters and --- separator rows\n"
|
| 362 |
+
"- NEVER use LaTeX (no \\begin{tabular}, no \\hline, no &)\n"
|
| 363 |
+
"- NEVER wrap output in code fences (no ```)\n"
|
| 364 |
+
"- NEVER add HTML comments or coordinate annotations\n"
|
| 365 |
+
"- For handwritten text, transcribe as accurately as possible\n"
|
| 366 |
+
"- Output ONLY the extracted markdown content, nothing else"
|
| 367 |
),
|
| 368 |
},
|
| 369 |
],
|
|
|
|
| 371 |
],
|
| 372 |
"max_tokens": 16384,
|
| 373 |
"temperature": 0.1,
|
| 374 |
+
# Disable Qwen3 thinking mode to avoid <think> tokens
|
| 375 |
+
"chat_template_kwargs": {"enable_thinking": False},
|
| 376 |
}
|
| 377 |
|
| 378 |
url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
|
|
|
|
| 401 |
if content is None:
|
| 402 |
raise ValueError("vLLM response missing content")
|
| 403 |
|
| 404 |
+
# Clean VLM output (strip think blocks, code fences, HTML comments, convert LaTeX tables)
|
| 405 |
+
content = _clean_vlm_output(content)
|
| 406 |
|
| 407 |
return content
|
| 408 |
|
|
|
|
| 975 |
async def lifespan(app: FastAPI):
|
| 976 |
"""Startup: initialize Docling converter and check vLLM."""
|
| 977 |
logger.info("=" * 60)
|
| 978 |
+
logger.info("Starting Docling VLM Parser API v3.2.0...")
|
| 979 |
|
| 980 |
device = _get_device()
|
| 981 |
logger.info(f"Device: {device}")
|
|
|
|
| 1027 |
app = FastAPI(
|
| 1028 |
title="Docling VLM Parser API",
|
| 1029 |
description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
|
| 1030 |
+
version="3.2.0",
|
| 1031 |
lifespan=lifespan,
|
| 1032 |
)
|
| 1033 |
|
|
|
|
| 1052 |
|
| 1053 |
return HealthResponse(
|
| 1054 |
status="healthy",
|
| 1055 |
+
version="3.2.0",
|
| 1056 |
device=device,
|
| 1057 |
gpu_name=None,
|
| 1058 |
vlm_model="active",
|