Spaces:
Running on T4
Running on T4
Commit ·
031c76c
1
Parent(s): 53b94dc
feat: v3.2.0 - LaTeX→MD conversion, VLM output cleanup, improved prompt, disable thinking
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Docling VLM Parser API v3.
|
| 3 |
|
| 4 |
A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
| 5 |
Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
|
|
@@ -7,7 +7,7 @@ A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
|
| 7 |
Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
|
| 8 |
Merge: VLM text for all pages + TableFormer tables where detected
|
| 9 |
|
| 10 |
-
v3.
|
| 11 |
- Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
|
| 12 |
- Quality: VLM retry on timeout/failure (1 retry with longer timeout)
|
| 13 |
- Quality: Table detection catches both markdown and LaTeX table patterns
|
|
@@ -244,7 +244,7 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
|
|
| 244 |
"""Enhance image quality for better OCR accuracy.
|
| 245 |
|
| 246 |
Applies CLAHE contrast enhancement only (fast).
|
| 247 |
-
Denoising was removed in v3.
|
| 248 |
benefit for VLM-based OCR which handles noise well.
|
| 249 |
"""
|
| 250 |
img = cv2.imread(image_path)
|
|
@@ -270,6 +270,67 @@ def _preprocess_image_for_ocr(image_path: str) -> str:
|
|
| 270 |
# Strip Qwen3 <think>...</think> reasoning blocks
|
| 271 |
_THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
|
| 275 |
"""Send a page image to Qwen3-VL via vLLM for text extraction.
|
|
@@ -292,12 +353,17 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
|
|
| 292 |
{
|
| 293 |
"type": "text",
|
| 294 |
"text": (
|
| 295 |
-
"
|
| 296 |
-
"
|
| 297 |
-
"
|
| 298 |
-
"
|
| 299 |
-
"
|
| 300 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
),
|
| 302 |
},
|
| 303 |
],
|
|
@@ -305,6 +371,8 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
|
|
| 305 |
],
|
| 306 |
"max_tokens": 16384,
|
| 307 |
"temperature": 0.1,
|
|
|
|
|
|
|
| 308 |
}
|
| 309 |
|
| 310 |
url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
|
|
@@ -333,8 +401,8 @@ def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int =
|
|
| 333 |
if content is None:
|
| 334 |
raise ValueError("vLLM response missing content")
|
| 335 |
|
| 336 |
-
#
|
| 337 |
-
content =
|
| 338 |
|
| 339 |
return content
|
| 340 |
|
|
@@ -907,7 +975,7 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
|
|
| 907 |
async def lifespan(app: FastAPI):
|
| 908 |
"""Startup: initialize Docling converter and check vLLM."""
|
| 909 |
logger.info("=" * 60)
|
| 910 |
-
logger.info("Starting Docling VLM Parser API v3.
|
| 911 |
|
| 912 |
device = _get_device()
|
| 913 |
logger.info(f"Device: {device}")
|
|
@@ -959,7 +1027,7 @@ async def lifespan(app: FastAPI):
|
|
| 959 |
app = FastAPI(
|
| 960 |
title="Docling VLM Parser API",
|
| 961 |
description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
|
| 962 |
-
version="3.
|
| 963 |
lifespan=lifespan,
|
| 964 |
)
|
| 965 |
|
|
@@ -984,7 +1052,7 @@ async def health_check() -> HealthResponse:
|
|
| 984 |
|
| 985 |
return HealthResponse(
|
| 986 |
status="healthy",
|
| 987 |
-
version="3.
|
| 988 |
device=device,
|
| 989 |
gpu_name=None,
|
| 990 |
vlm_model="active",
|
|
|
|
| 1 |
"""
|
| 2 |
+
Docling VLM Parser API v3.2.0
|
| 3 |
|
| 4 |
A FastAPI service using a VLM-FIRST hybrid architecture for document parsing:
|
| 5 |
Pass 1 (GPU): Qwen3-VL via vLLM — concurrent OCR on ALL pages (fast)
|
|
|
|
| 7 |
Pass 2 (CPU): Docling TableFormer ONLY on table pages (targeted, minimal)
|
| 8 |
Merge: VLM text for all pages + TableFormer tables where detected
|
| 9 |
|
| 10 |
+
v3.2.0 fixes over v3.0.0:
|
| 11 |
- Quality: VLM prompt enforces markdown tables (no LaTeX), strips <think> tokens
|
| 12 |
- Quality: VLM retry on timeout/failure (1 retry with longer timeout)
|
| 13 |
- Quality: Table detection catches both markdown and LaTeX table patterns
|
|
|
|
| 244 |
"""Enhance image quality for better OCR accuracy.
|
| 245 |
|
| 246 |
Applies CLAHE contrast enhancement only (fast).
|
| 247 |
+
Denoising was removed in v3.2.0 — it added ~10s/page with minimal
|
| 248 |
benefit for VLM-based OCR which handles noise well.
|
| 249 |
"""
|
| 250 |
img = cv2.imread(image_path)
|
|
|
|
| 270 |
# Strip Qwen3 <think>...</think> reasoning blocks
_THINK_PATTERN = re.compile(r"<think>.*?</think>\s*", re.DOTALL)

# Post-processing patterns for VLM output cleanup
# Opening code fence, optionally tagged markdown/md/text (VLMs often wrap
# their whole answer in a fence despite being told not to).
_CODE_FENCE_PATTERN = re.compile(r"^```(?:markdown|md|text)?\s*\n?", re.MULTILINE)
# Matching closing fence at the end of a line.
_CODE_FENCE_END = re.compile(r"\n?```\s*$", re.MULTILINE)
# HTML comments — some VLMs emit coordinate/debug annotations this way.
_HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
# Standalone "Page N" lines (page header/footer artifacts), whole line removed.
_PAGE_N_PATTERN = re.compile(r"^\s*Page\s+\d+\s*$\n?", re.MULTILINE)
+
def _clean_vlm_output(content: str) -> str:
    """Normalize raw VLM output into clean markdown.

    Drops <think> reasoning blocks, code-fence wrappers, HTML comments,
    and standalone 'Page N' lines, then rewrites any leftover LaTeX
    tabular environments as markdown pipe tables.
    """
    # Reasoning blocks first, trimming surrounding whitespace right away.
    text = _THINK_PATTERN.sub("", content).strip()

    # Remaining regex-based artifact removals, applied in a fixed order:
    # opening fences, closing fences, HTML comments, "Page N" lines.
    for pattern in (
        _CODE_FENCE_PATTERN,
        _CODE_FENCE_END,
        _HTML_COMMENT_PATTERN,
        _PAGE_N_PATTERN,
    ):
        text = pattern.sub("", text)

    # Safety net: the prompt forbids LaTeX tables, but the model may still
    # emit them — convert any stragglers to markdown here.
    return _convert_latex_tables_to_markdown(text).strip()
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _convert_latex_tables_to_markdown(text: str) -> str:
|
| 306 |
+
"""Convert LaTeX tabular environments to markdown pipe tables."""
|
| 307 |
+
latex_pattern = re.compile(
|
| 308 |
+
r"\\begin\{tabular\}\{[^}]*\}(.*?)\\end\{tabular\}", re.DOTALL
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
def _latex_to_md(match: re.Match) -> str:
|
| 312 |
+
body = match.group(1)
|
| 313 |
+
# Remove \hline
|
| 314 |
+
body = re.sub(r"\\hline\s*", "", body)
|
| 315 |
+
# Split on \\
|
| 316 |
+
rows = [r.strip() for r in re.split(r"\\\\", body) if r.strip()]
|
| 317 |
+
if not rows:
|
| 318 |
+
return match.group(0)
|
| 319 |
+
|
| 320 |
+
md_rows = []
|
| 321 |
+
for i, row in enumerate(rows):
|
| 322 |
+
cells = [c.strip() for c in row.split("&")]
|
| 323 |
+
md_row = "| " + " | ".join(cells) + " |"
|
| 324 |
+
md_rows.append(md_row)
|
| 325 |
+
if i == 0:
|
| 326 |
+
# Add separator after header
|
| 327 |
+
sep = "| " + " | ".join(["---"] * len(cells)) + " |"
|
| 328 |
+
md_rows.append(sep)
|
| 329 |
+
|
| 330 |
+
return "\n".join(md_rows)
|
| 331 |
+
|
| 332 |
+
return latex_pattern.sub(_latex_to_md, text)
|
| 333 |
+
|
| 334 |
|
| 335 |
def _vlm_ocr_page(page_image_bytes: bytes, request_id: str = "", page_no: int = 0) -> str:
|
| 336 |
"""Send a page image to Qwen3-VL via vLLM for text extraction.
|
|
|
|
| 353 |
{
|
| 354 |
"type": "text",
|
| 355 |
"text": (
|
| 356 |
+
"Convert this document page to markdown format.\n\n"
|
| 357 |
+
"Rules:\n"
|
| 358 |
+
"- Extract ALL text content exactly as written\n"
|
| 359 |
+
"- Use ## headings for section titles\n"
|
| 360 |
+
"- Preserve lists, paragraphs, and document structure\n"
|
| 361 |
+
"- Format ALL tables as markdown tables with | delimiters and --- separator rows\n"
|
| 362 |
+
"- NEVER use LaTeX (no \\begin{tabular}, no \\hline, no &)\n"
|
| 363 |
+
"- NEVER wrap output in code fences (no ```)\n"
|
| 364 |
+
"- NEVER add HTML comments or coordinate annotations\n"
|
| 365 |
+
"- For handwritten text, transcribe as accurately as possible\n"
|
| 366 |
+
"- Output ONLY the extracted markdown content, nothing else"
|
| 367 |
),
|
| 368 |
},
|
| 369 |
],
|
|
|
|
| 371 |
],
|
| 372 |
"max_tokens": 16384,
|
| 373 |
"temperature": 0.1,
|
| 374 |
+
# Disable Qwen3 thinking mode to avoid <think> tokens
|
| 375 |
+
"chat_template_kwargs": {"enable_thinking": False},
|
| 376 |
}
|
| 377 |
|
| 378 |
url = f"http://{VLM_HOST}:{VLM_PORT}/v1/chat/completions"
|
|
|
|
| 401 |
if content is None:
|
| 402 |
raise ValueError("vLLM response missing content")
|
| 403 |
|
| 404 |
+
# Clean VLM output (strip think blocks, code fences, HTML comments, convert LaTeX tables)
|
| 405 |
+
content = _clean_vlm_output(content)
|
| 406 |
|
| 407 |
return content
|
| 408 |
|
|
|
|
| 975 |
async def lifespan(app: FastAPI):
|
| 976 |
"""Startup: initialize Docling converter and check vLLM."""
|
| 977 |
logger.info("=" * 60)
|
| 978 |
+
logger.info("Starting Docling VLM Parser API v3.2.0...")
|
| 979 |
|
| 980 |
device = _get_device()
|
| 981 |
logger.info(f"Device: {device}")
|
|
|
|
| 1027 |
app = FastAPI(
|
| 1028 |
title="Docling VLM Parser API",
|
| 1029 |
description="VLM-first hybrid parser: Qwen3-VL OCR + targeted TableFormer tables",
|
| 1030 |
+
version="3.2.0",
|
| 1031 |
lifespan=lifespan,
|
| 1032 |
)
|
| 1033 |
|
|
|
|
| 1052 |
|
| 1053 |
return HealthResponse(
|
| 1054 |
status="healthy",
|
| 1055 |
+
version="3.2.0",
|
| 1056 |
device=device,
|
| 1057 |
gpu_name=None,
|
| 1058 |
vlm_model="active",
|