Hugging Face Space · Running on T4

Commit b586eeb · parent 51c66dc · Ibad ur Rehman committed

perf: optimize qwen inference path

Files changed:
- app.py (+8, -0)
- config.py (+5, -1)
- pipeline.py (+126, -42)
app.py CHANGED

@@ -18,8 +18,12 @@ from config import (
     IMAGES_SCALE,
     MAX_FILE_SIZE_BYTES,
     MAX_FILE_SIZE_MB,
+    QWEN_ATTN_IMPLEMENTATION,
+    QWEN_BATCH_SIZE,
+    QWEN_IMAGE_MAX_SIDE,
     QWEN_MAX_NEW_TOKENS,
     QWEN_MODEL,
+    QWEN_TORCH_DTYPE,
     RENDER_DPI,
     logger,
 )
@@ -52,6 +56,10 @@ async def lifespan(app: FastAPI):
     logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
     logger.info(f"Qwen Model: {QWEN_MODEL}")
     logger.info(f"Qwen Max New Tokens: {QWEN_MAX_NEW_TOKENS}")
+    logger.info(f"Qwen Batch Size: {QWEN_BATCH_SIZE}")
+    logger.info(f"Qwen Image Max Side: {QWEN_IMAGE_MAX_SIDE}")
+    logger.info(f"Qwen Attention: {QWEN_ATTN_IMPLEMENTATION}")
+    logger.info(f"Qwen Torch Dtype: {QWEN_TORCH_DTYPE}")

     logger.info("=" * 60)
     logger.info("Docling VLM Parser API ready (Qwen3-VL local parser)")
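With the defaults added in config.py below, the four new startup lines would read roughly:

    Qwen Batch Size: 2
    Qwen Image Max Side: 1536
    Qwen Attention: flash_attention_2
    Qwen Torch Dtype: bfloat16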
config.py CHANGED

@@ -21,7 +21,11 @@ MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
 RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))

 QWEN_MODEL = os.getenv("QWEN_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
-QWEN_MAX_NEW_TOKENS = int(os.getenv("QWEN_MAX_NEW_TOKENS", "…
+QWEN_MAX_NEW_TOKENS = int(os.getenv("QWEN_MAX_NEW_TOKENS", "1536"))
+QWEN_BATCH_SIZE = int(os.getenv("QWEN_BATCH_SIZE", "2"))
+QWEN_IMAGE_MAX_SIDE = int(os.getenv("QWEN_IMAGE_MAX_SIDE", "1536"))
+QWEN_ATTN_IMPLEMENTATION = os.getenv("QWEN_ATTN_IMPLEMENTATION", "flash_attention_2")
+QWEN_TORCH_DTYPE = os.getenv("QWEN_TORCH_DTYPE", "bfloat16")

 # Blocked hostnames for SSRF protection
 BLOCKED_HOSTNAMES = {
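All five knobs are read once, at import time, via os.getenv, so they can be tuned per deployment without touching code. A minimal sketch of overriding them from Python (the override values here are illustrative, not from this commit):

    import os

    # Hypothetical overrides; they must be set before config.py is imported,
    # because each value is captured by os.getenv at import time.
    os.environ["QWEN_BATCH_SIZE"] = "4"              # pages per generate() call
    os.environ["QWEN_IMAGE_MAX_SIDE"] = "1024"       # cap on the longer image side
    os.environ["QWEN_ATTN_IMPLEMENTATION"] = "sdpa"  # or "none" to skip the kwarg
    os.environ["QWEN_TORCH_DTYPE"] = "float16"

    import config

    print(config.QWEN_BATCH_SIZE, config.QWEN_TORCH_DTYPE)  # -> 4 float16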
pipeline.py CHANGED

@@ -11,7 +11,15 @@ import torch
 from PIL import Image
 from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

-from config import …
+from config import (
+    QWEN_ATTN_IMPLEMENTATION,
+    QWEN_BATCH_SIZE,
+    QWEN_IMAGE_MAX_SIDE,
+    QWEN_MAX_NEW_TOKENS,
+    QWEN_MODEL,
+    QWEN_TORCH_DTYPE,
+    logger,
+)
 from postprocess import _post_process_merged_markdown
 from rendering import _image_file_to_png_bytes, _pdf_to_page_images

@@ -31,18 +39,48 @@ _OCR_PROMPT = (
 )


+def _resolve_torch_dtype() -> torch.dtype | str:
+    """Resolve configured dtype to a torch dtype when possible."""
+    dtype_map = {
+        "auto": "auto",
+        "bfloat16": torch.bfloat16,
+        "float16": torch.float16,
+        "float32": torch.float32,
+    }
+    return dtype_map.get(QWEN_TORCH_DTYPE.lower(), "auto")
+
+
 def _get_pipeline() -> tuple[Qwen3VLForConditionalGeneration, AutoProcessor]:
     """Get or create the global Qwen3-VL pipeline."""
     global _model, _processor
     if _model is None or _processor is None:
         logger.info(f"Loading Qwen model: {QWEN_MODEL}")
         _processor = AutoProcessor.from_pretrained(QWEN_MODEL, trust_remote_code=True)
-        _model = Qwen3VLForConditionalGeneration.from_pretrained(
-            …
-        )
+        model_kwargs = {
+            "torch_dtype": _resolve_torch_dtype(),
+            "device_map": "auto",
+            "trust_remote_code": True,
+        }
+        if QWEN_ATTN_IMPLEMENTATION and QWEN_ATTN_IMPLEMENTATION.lower() != "none":
+            model_kwargs["attn_implementation"] = QWEN_ATTN_IMPLEMENTATION
+        try:
+            _model = Qwen3VLForConditionalGeneration.from_pretrained(
+                QWEN_MODEL,
+                **model_kwargs,
+            )
+        except Exception as e:
+            if "attn_implementation" in model_kwargs:
+                logger.warning(
+                    f"Failed to load Qwen with attn_implementation={QWEN_ATTN_IMPLEMENTATION}: {e}. "
+                    "Retrying without custom attention."
+                )
+                model_kwargs.pop("attn_implementation", None)
+                _model = Qwen3VLForConditionalGeneration.from_pretrained(
+                    QWEN_MODEL,
+                    **model_kwargs,
+                )
+            else:
+                raise
         _model.eval()
     return _model, _processor

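The resolver above is case-insensitive and fails safe: any value outside the four known names resolves to "auto" rather than raising. The try/except around from_pretrained fails safe the same way, retrying without attn_implementation when flash-attention kernels are unavailable. A standalone sketch of the dtype mapping (sample inputs are hypothetical):

    import torch

    # Same mapping as _resolve_torch_dtype(); unknown strings fall back to "auto".
    dtype_map = {
        "auto": "auto",
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
        "float32": torch.float32,
    }
    for configured in ("bfloat16", "FLOAT16", "fp16", ""):
        print(repr(configured), "->", dtype_map.get(configured.lower(), "auto"))
    # 'bfloat16' -> torch.bfloat16
    # 'FLOAT16' -> torch.float16
    # 'fp16' -> auto
    # '' -> auto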
@@ -79,51 +117,96 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
     return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count


-def _extract_markdown_from_image(…
-    …
+def _resize_image(image: Image.Image) -> Image.Image:
+    """Downscale images to reduce visual token count and generation latency."""
+    max_side = max(image.size)
+    if max_side <= QWEN_IMAGE_MAX_SIDE:
+        return image
+
+    scale = QWEN_IMAGE_MAX_SIDE / max_side
+    new_size = (
+        max(1, int(image.size[0] * scale)),
+        max(1, int(image.size[1] * scale)),
+    )
+    return image.resize(new_size, Image.Resampling.LANCZOS)
+
+
+def _extract_markdown_from_images(
+    page_images: list[tuple[int, bytes]],
+    request_id: str,
+) -> dict[int, str]:
+    """Run a batch of page images through Qwen3-VL."""
     model, processor = _get_pipeline()
-    …
+    prompt_texts: list[str] = []
+    images: list[Image.Image] = []
+    page_indices: list[int] = []
+
+    for page_idx, image_bytes in page_images:
+        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+        image = _resize_image(image)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": _OCR_PROMPT},
+                ],
+            }
+        ]
+        prompt_texts.append(
+            processor.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+            )
+        )
+        images.append(image)
+        page_indices.append(page_idx)

-    inputs = processor…
-        …
-        return_dict=True,
+    inputs = processor(
+        text=prompt_texts,
+        images=images,
+        padding=True,
         return_tensors="pt",
     )

     device = next(model.parameters()).device
-    …
+    model_inputs = {
+        key: value.to(device) if hasattr(value, "to") else value
+        for key, value in inputs.items()
+    }

     with torch.inference_mode():
         generated_ids = model.generate(
-            **…
+            **model_inputs,
             max_new_tokens=QWEN_MAX_NEW_TOKENS,
             do_sample=False,
         )

-    …
-        output_ids,
-        …
+    input_lengths = model_inputs["attention_mask"].sum(dim=1).tolist()
+    decoded_pages: dict[int, str] = {}
+    for row_idx, prompt_length in enumerate(input_lengths):
+        output_ids = generated_ids[row_idx : row_idx + 1, int(prompt_length) :]
+        text = processor.batch_decode(
+            output_ids,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )[0].strip()
+        page_idx = page_indices[row_idx]
+        decoded_pages[page_idx] = text
+        logger.info(f"[{request_id}:page:{page_idx + 1}] Qwen generated {len(text)} chars")

-    …
+    return decoded_pages
+
+def _extract_markdown_from_image(
+    image_bytes: bytes,
+    page_label: str,
+) -> str:
+    """Backwards-compatible single-image wrapper."""
+    page_idx = 0
+    page_map = _extract_markdown_from_images([(page_idx, image_bytes)], page_label)
+    return page_map[page_idx]


 def _collect_page_images(
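A usage sketch for the new batched entry point (file names and request id are hypothetical; in the app the PNG bytes come from _pdf_to_page_images):

    from pathlib import Path

    # Hypothetical inputs: (page index, PNG bytes) pairs.
    page_images = [
        (0, Path("page0.png").read_bytes()),
        (1, Path("page1.png").read_bytes()),
    ]

    pages = _extract_markdown_from_images(page_images, request_id="req-123")

    # The result maps page index -> markdown, so page order is restored explicitly.
    markdown = "\n\n".join(pages[idx] for idx, _ in page_images)

Because decoding slices each output row at its own prompt length (taken from the attention mask), padding introduced by batching never leaks into the returned markdown.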
@@ -159,10 +242,11 @@ def _convert_document(
         raise ValueError("No pages available to parse")

     markdown_pages: list[str] = []
-    for …
-        …
+    for batch_start in range(0, len(page_images), QWEN_BATCH_SIZE):
+        batch = page_images[batch_start : batch_start + QWEN_BATCH_SIZE]
+        batch_outputs = _extract_markdown_from_images(batch, request_id)
+        for page_idx, _ in batch:
+            markdown_pages.append(batch_outputs.get(page_idx, ""))

     markdown_content = "\n\n".join(p for p in markdown_pages if p).strip()
     markdown_content = _post_process_merged_markdown(markdown_content)