Ibad ur Rehman commited on
Commit
dd23733
·
1 Parent(s): b586eeb

feat: switch to unsloth gguf runtime

Browse files
Files changed (6) hide show
  1. Dockerfile +16 -11
  2. app.py +70 -124
  3. config.py +12 -11
  4. pipeline.py +45 -154
  5. requirements.txt +1 -6
  6. start.sh +12 -4
Dockerfile CHANGED
@@ -1,40 +1,45 @@
1
- # Hugging Face Spaces Dockerfile for Qwen3-VL parser API
2
- # v5.1.0 - Qwen3-VL-8B-Instruct local inference
3
 
4
- FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-runtime
5
 
6
  USER root
7
 
8
- # Install fonts and PDF utilities for document parsing
9
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
 
10
  fonts-noto-core fonts-noto-cjk fontconfig \
11
- libgl1 libglib2.0-0 poppler-utils curl git \
12
  && fc-cache -fv && rm -rf /var/lib/apt/lists/*
13
 
14
- # Create non-root user for HF Spaces
15
  RUN useradd -m -u 1000 user
16
 
17
  ENV PYTHONUNBUFFERED=1 \
18
  PYTHONDONTWRITEBYTECODE=1 \
19
  IMAGES_SCALE=2.0 \
20
  MAX_FILE_SIZE_MB=1024 \
 
21
  HF_HOME=/home/user/.cache/huggingface \
22
  XDG_CACHE_HOME=/home/user/.cache \
23
  HOME=/home/user \
24
  PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
25
 
26
- RUN mkdir -p /home/user/.cache/huggingface /home/user/.cache/paddleocr /home/user/app \
27
  && chown -R user:user /home/user
28
 
29
  USER user
30
- WORKDIR /home/user/app
31
 
32
- COPY --chown=user:user requirements.txt .
 
 
 
33
 
34
- RUN pip install --user --upgrade pip && pip install --user -r requirements.txt
 
 
35
 
36
  COPY --chown=user:user . .
37
-
38
  RUN chmod +x start.sh
39
 
40
  EXPOSE 7860
 
1
+ # Hugging Face Spaces Dockerfile for Unsloth GGUF Qwen3-VL parser API
2
+ # v5.2.0 - llama.cpp + Unsloth Qwen3-VL-8B-Instruct GGUF
3
 
4
+ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
5
 
6
  USER root
7
 
 
8
  RUN apt-get update && apt-get install -y --no-install-recommends \
9
+ python3 python3-pip python3-venv \
10
+ build-essential cmake git \
11
  fonts-noto-core fonts-noto-cjk fontconfig \
12
+ libgl1 libglib2.0-0 poppler-utils curl \
13
  && fc-cache -fv && rm -rf /var/lib/apt/lists/*
14
 
 
15
  RUN useradd -m -u 1000 user
16
 
17
  ENV PYTHONUNBUFFERED=1 \
18
  PYTHONDONTWRITEBYTECODE=1 \
19
  IMAGES_SCALE=2.0 \
20
  MAX_FILE_SIZE_MB=1024 \
21
+ LLAMA_SERVER_URL=http://127.0.0.1:8080 \
22
  HF_HOME=/home/user/.cache/huggingface \
23
  XDG_CACHE_HOME=/home/user/.cache \
24
  HOME=/home/user \
25
  PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:$PATH
26
 
27
+ RUN mkdir -p /home/user/.cache/huggingface /home/user/app \
28
  && chown -R user:user /home/user
29
 
30
  USER user
31
+ WORKDIR /home/user
32
 
33
+ RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp /home/user/llama.cpp
34
+ WORKDIR /home/user/llama.cpp
35
+ RUN cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON
36
+ RUN cmake --build build -j
37
 
38
+ WORKDIR /home/user/app
39
+ COPY --chown=user:user requirements.txt .
40
+ RUN python3 -m pip install --user --upgrade pip && python3 -m pip install --user -r requirements.txt
41
 
42
  COPY --chown=user:user . .
 
43
  RUN chmod +x start.sh
44
 
45
  EXPOSE 7860
app.py CHANGED
@@ -1,4 +1,4 @@
1
- """Qwen3-VL parser API."""
2
 
3
  import asyncio
4
  import re
@@ -16,14 +16,19 @@ from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile
16
  from auth import _validate_url, verify_token
17
  from config import (
18
  IMAGES_SCALE,
 
 
 
 
 
 
 
 
 
 
19
  MAX_FILE_SIZE_BYTES,
20
  MAX_FILE_SIZE_MB,
21
- QWEN_ATTN_IMPLEMENTATION,
22
- QWEN_BATCH_SIZE,
23
- QWEN_IMAGE_MAX_SIDE,
24
- QWEN_MAX_NEW_TOKENS,
25
  QWEN_MODEL,
26
- QWEN_TORCH_DTYPE,
27
  RENDER_DPI,
28
  logger,
29
  )
@@ -37,61 +42,52 @@ from pipeline import (
37
  )
38
 
39
 
40
- # ---------------------------------------------------------------------------
41
- # Application Lifespan
42
- # ---------------------------------------------------------------------------
43
-
44
-
45
  @asynccontextmanager
46
  async def lifespan(app: FastAPI):
47
- """Startup: initialize Qwen3-VL pipeline."""
48
  logger.info("=" * 60)
49
- logger.info("Starting Docling VLM Parser API v5.1.0...")
50
- logger.info("Initializing Qwen3-VL pipeline...")
51
  _get_pipeline()
52
- logger.info("Qwen3-VL ready")
53
 
54
  logger.info(f"Render DPI: {RENDER_DPI}")
55
  logger.info(f"Images scale: {IMAGES_SCALE}")
56
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
57
  logger.info(f"Qwen Model: {QWEN_MODEL}")
58
- logger.info(f"Qwen Max New Tokens: {QWEN_MAX_NEW_TOKENS}")
59
- logger.info(f"Qwen Batch Size: {QWEN_BATCH_SIZE}")
60
- logger.info(f"Qwen Image Max Side: {QWEN_IMAGE_MAX_SIDE}")
61
- logger.info(f"Qwen Attention: {QWEN_ATTN_IMPLEMENTATION}")
62
- logger.info(f"Qwen Torch Dtype: {QWEN_TORCH_DTYPE}")
 
 
 
 
 
63
 
64
  logger.info("=" * 60)
65
- logger.info("Docling VLM Parser API ready (Qwen3-VL local parser)")
66
  logger.info("=" * 60)
67
  yield
68
- logger.info("Shutting down Docling VLM Parser API...")
69
-
70
 
71
- # ---------------------------------------------------------------------------
72
- # FastAPI App
73
- # ---------------------------------------------------------------------------
74
 
75
  app = FastAPI(
76
- title="Docling VLM Parser API",
77
- description="Qwen3-VL local parser",
78
- version="5.1.0",
79
  lifespan=lifespan,
80
  )
81
 
82
 
83
- # ---------------------------------------------------------------------------
84
- # Endpoints
85
- # ---------------------------------------------------------------------------
86
-
87
-
88
  @app.get("/", response_model=HealthResponse)
89
  async def health_check() -> HealthResponse:
90
  """Health check endpoint."""
91
  return HealthResponse(
92
  status="healthy",
93
- version="5.1.0",
94
- model="Qwen3-VL-8B-Instruct",
95
  gemini_status="not used",
96
  images_scale=IMAGES_SCALE,
97
  )
@@ -100,34 +96,33 @@ async def health_check() -> HealthResponse:
100
  @app.post("/parse", response_model=ParseResponse)
101
  async def parse_document(
102
  file: UploadFile = File(..., description="PDF or image file to parse"),
103
- output_format: str = Form(default="markdown", description="Output format: markdown or json"),
104
- images_scale: Optional[float] = Form(default=None, description="Image resolution scale"),
105
  start_page: int = Form(default=0, description="Starting page (0-indexed)"),
106
  end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
107
  include_images: bool = Form(default=False, description="Include extracted images"),
108
  _token: str = Depends(verify_token),
109
  ) -> ParseResponse:
110
- """Parse a document file using Qwen3-VL."""
111
  request_id = str(uuid4())[:8]
112
  start_time = time.time()
113
 
114
- logger.info(f"[{request_id}] {'='*50}")
115
  logger.info(f"[{request_id}] New parse request received")
116
- safe_filename = re.sub(r'[\r\n\t\x00-\x1f\x7f]', '_', file.filename or "")[:255]
117
  logger.info(f"[{request_id}] Filename: {safe_filename}")
118
  logger.info(f"[{request_id}] Output format: {output_format}")
119
 
120
- if output_format not in ("markdown",):
121
- raise HTTPException(
122
- status_code=400,
123
- detail="Only 'markdown' output_format is supported",
124
- )
 
125
 
126
- # Validate file size
127
  file.file.seek(0, 2)
128
  file_size = file.file.tell()
129
  file.file.seek(0)
130
-
131
  file_size_mb = file_size / (1024 * 1024)
132
  logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
133
 
@@ -137,20 +132,18 @@ async def parse_document(
137
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
138
  )
139
 
140
- # Validate file type
141
  allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
142
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
143
  if file_ext not in allowed_extensions:
144
  raise HTTPException(
145
  status_code=400,
146
- detail=f"Unsupported file type. Allowed: {', '.join(allowed_extensions)}",
147
  )
148
 
149
  logger.info(f"[{request_id}] Model: {QWEN_MODEL}")
150
- logger.info(f"[{request_id}] Page range: {start_page} to {end_page or 'end'}")
151
 
152
  temp_dir = tempfile.mkdtemp()
153
-
154
  try:
155
  input_path = Path(temp_dir) / f"input{file_ext}"
156
  await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
@@ -173,35 +166,22 @@ async def parse_document(
173
  images_zip, image_count = _create_images_zip(output_dir)
174
 
175
  total_duration = time.time() - start_time
176
- logger.info(f"[{request_id}] {'='*50}")
177
- logger.info(f"[{request_id}] Request completed successfully")
178
- logger.info(f"[{request_id}] Pages processed: {pages_processed}")
179
- logger.info(f"[{request_id}] Total time: {total_duration:.2f}s")
180
- if pages_processed > 0:
181
- logger.info(f"[{request_id}] Speed: {pages_processed / total_duration:.2f} pages/sec")
182
- logger.info(f"[{request_id}] {'='*50}")
183
 
184
  return ParseResponse(
185
  success=True,
186
- markdown=markdown_content if output_format == "markdown" else None,
187
- json_content=json_content if output_format == "json" else None,
188
  images_zip=images_zip,
189
  image_count=image_count,
190
  pages_processed=pages_processed,
191
  device_used="gpu",
192
  vlm_model=QWEN_MODEL,
193
  )
194
-
195
  except Exception as e:
196
  total_duration = time.time() - start_time
197
- logger.error(f"[{request_id}] {'='*50}")
198
- logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s")
199
- logger.error(f"[{request_id}] Error: {type(e).__name__}: {str(e)}", exc_info=True)
200
- logger.error(f"[{request_id}] {'='*50}")
201
- return ParseResponse(
202
- success=False,
203
- error=f"Processing failed (ref: {request_id})",
204
- )
205
  finally:
206
  shutil.rmtree(temp_dir, ignore_errors=True)
207
 
@@ -211,43 +191,37 @@ async def parse_document_from_url(
211
  request: URLParseRequest,
212
  _token: str = Depends(verify_token),
213
  ) -> ParseResponse:
214
- """Parse a document from a URL using Qwen3-VL."""
215
  request_id = str(uuid4())[:8]
216
  start_time = time.time()
217
 
218
- logger.info(f"[{request_id}] {'='*50}")
219
  logger.info(f"[{request_id}] New URL parse request received")
220
  logger.info(f"[{request_id}] URL: {request.url}")
221
- logger.info(f"[{request_id}] Output format: {request.output_format}")
222
 
223
- if request.output_format not in ("markdown",):
224
- raise HTTPException(
225
- status_code=400,
226
- detail="Only 'markdown' output_format is supported",
227
- )
 
228
 
229
  _validate_url(request.url)
230
 
231
  temp_dir = tempfile.mkdtemp()
232
-
233
  try:
234
- # Download file
235
- logger.info(f"[{request_id}] Downloading file from URL...")
236
- download_start = time.time()
237
  async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
238
  response = await client.get(request.url)
239
  response.raise_for_status()
240
 
241
- file_size_mb = len(response.content) / (1024 * 1024)
242
- logger.info(
243
- f"[{request_id}] Download completed in {time.time() - download_start:.2f}s "
244
- f"({file_size_mb:.2f} MB)"
245
- )
246
 
247
- # Determine file extension (with Content-Type fallback)
248
  url_path = Path(request.url.split("?")[0])
249
  file_ext = url_path.suffix.lower()
250
-
251
  if not file_ext or file_ext not in {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
252
  content_type = response.headers.get("content-type", "").lower()
253
  ct_map = {
@@ -259,23 +233,12 @@ async def parse_document_from_url(
259
  }
260
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
261
 
262
- if len(response.content) > MAX_FILE_SIZE_BYTES:
263
- raise HTTPException(
264
- status_code=413,
265
- detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
266
- )
267
-
268
  input_path = Path(temp_dir) / f"input{file_ext}"
269
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
270
 
271
  output_dir = Path(temp_dir) / "output"
272
  output_dir.mkdir(exist_ok=True)
273
 
274
- logger.info(f"[{request_id}] Model: {QWEN_MODEL}")
275
- logger.info(
276
- f"[{request_id}] Page range: {request.start_page} to {request.end_page or 'end'}"
277
- )
278
-
279
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
280
  _convert_document,
281
  input_path,
@@ -291,42 +254,25 @@ async def parse_document_from_url(
291
  images_zip, image_count = _create_images_zip(output_dir)
292
 
293
  total_duration = time.time() - start_time
294
- logger.info(f"[{request_id}] {'='*50}")
295
- logger.info(f"[{request_id}] Request completed successfully")
296
- logger.info(f"[{request_id}] Pages processed: {pages_processed}")
297
- logger.info(f"[{request_id}] Total time: {total_duration:.2f}s")
298
- if pages_processed > 0:
299
- logger.info(f"[{request_id}] Speed: {pages_processed / total_duration:.2f} pages/sec")
300
- logger.info(f"[{request_id}] {'='*50}")
301
 
302
  return ParseResponse(
303
  success=True,
304
- markdown=markdown_content if request.output_format == "markdown" else None,
305
- json_content=json_content if request.output_format == "json" else None,
306
  images_zip=images_zip,
307
  image_count=image_count,
308
  pages_processed=pages_processed,
309
  device_used="gpu",
310
  vlm_model=QWEN_MODEL,
311
  )
312
-
313
  except httpx.HTTPError as e:
314
- total_duration = time.time() - start_time
315
- logger.error(f"[{request_id}] Download failed after {total_duration:.2f}s: {str(e)}")
316
- return ParseResponse(
317
- success=False,
318
- error=f"Failed to download file from URL (ref: {request_id})",
319
- )
320
  except Exception as e:
321
  total_duration = time.time() - start_time
322
- logger.error(f"[{request_id}] {'='*50}")
323
- logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s")
324
- logger.error(f"[{request_id}] Error: {type(e).__name__}: {str(e)}", exc_info=True)
325
- logger.error(f"[{request_id}] {'='*50}")
326
- return ParseResponse(
327
- success=False,
328
- error=f"Processing failed (ref: {request_id})",
329
- )
330
  finally:
331
  shutil.rmtree(temp_dir, ignore_errors=True)
332
 
 
1
+ """Unsloth Qwen3-VL GGUF parser API."""
2
 
3
  import asyncio
4
  import re
 
16
  from auth import _validate_url, verify_token
17
  from config import (
18
  IMAGES_SCALE,
19
+ LLAMA_CTX_SIZE,
20
+ LLAMA_FLASH_ATTN,
21
+ LLAMA_GPU_LAYERS,
22
+ LLAMA_HF_FILE,
23
+ LLAMA_HF_REPO,
24
+ LLAMA_MAX_TOKENS,
25
+ LLAMA_MMPROJ_FILE,
26
+ LLAMA_SERVER_TIMEOUT,
27
+ LLAMA_SERVER_URL,
28
+ LLAMA_THREADS,
29
  MAX_FILE_SIZE_BYTES,
30
  MAX_FILE_SIZE_MB,
 
 
 
 
31
  QWEN_MODEL,
 
32
  RENDER_DPI,
33
  logger,
34
  )
 
42
  )
43
 
44
 
 
 
 
 
 
45
  @asynccontextmanager
46
  async def lifespan(app: FastAPI):
47
+ """Startup: initialize local llama.cpp client."""
48
  logger.info("=" * 60)
49
+ logger.info("Starting Docling Parser API v5.2.0...")
50
+ logger.info("Initializing local llama.cpp client...")
51
  _get_pipeline()
52
+ logger.info("llama.cpp client ready")
53
 
54
  logger.info(f"Render DPI: {RENDER_DPI}")
55
  logger.info(f"Images scale: {IMAGES_SCALE}")
56
  logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
57
  logger.info(f"Qwen Model: {QWEN_MODEL}")
58
+ logger.info(f"llama-server URL: {LLAMA_SERVER_URL}")
59
+ logger.info(f"llama-server timeout: {LLAMA_SERVER_TIMEOUT}s")
60
+ logger.info(f"llama HF repo: {LLAMA_HF_REPO}")
61
+ logger.info(f"llama model file: {LLAMA_HF_FILE}")
62
+ logger.info(f"llama mmproj file: {LLAMA_MMPROJ_FILE}")
63
+ logger.info(f"llama max tokens: {LLAMA_MAX_TOKENS}")
64
+ logger.info(f"llama ctx size: {LLAMA_CTX_SIZE}")
65
+ logger.info(f"llama gpu layers: {LLAMA_GPU_LAYERS}")
66
+ logger.info(f"llama threads: {LLAMA_THREADS}")
67
+ logger.info(f"llama flash attention: {LLAMA_FLASH_ATTN}")
68
 
69
  logger.info("=" * 60)
70
+ logger.info("Docling Parser API ready (Unsloth GGUF via llama.cpp)")
71
  logger.info("=" * 60)
72
  yield
73
+ logger.info("Shutting down Docling Parser API...")
 
74
 
 
 
 
75
 
76
  app = FastAPI(
77
+ title="Docling Parser API",
78
+ description="Unsloth Qwen3-VL GGUF local parser",
79
+ version="5.2.0",
80
  lifespan=lifespan,
81
  )
82
 
83
 
 
 
 
 
 
84
  @app.get("/", response_model=HealthResponse)
85
  async def health_check() -> HealthResponse:
86
  """Health check endpoint."""
87
  return HealthResponse(
88
  status="healthy",
89
+ version="5.2.0",
90
+ model="Qwen3-VL-8B-Instruct GGUF",
91
  gemini_status="not used",
92
  images_scale=IMAGES_SCALE,
93
  )
 
96
  @app.post("/parse", response_model=ParseResponse)
97
  async def parse_document(
98
  file: UploadFile = File(..., description="PDF or image file to parse"),
99
+ output_format: str = Form(default="markdown", description="Output format: markdown only"),
100
+ images_scale: Optional[float] = Form(default=None, description="Reserved for compatibility"),
101
  start_page: int = Form(default=0, description="Starting page (0-indexed)"),
102
  end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
103
  include_images: bool = Form(default=False, description="Include extracted images"),
104
  _token: str = Depends(verify_token),
105
  ) -> ParseResponse:
106
+ """Parse a document file using local llama.cpp + Unsloth GGUF."""
107
  request_id = str(uuid4())[:8]
108
  start_time = time.time()
109
 
110
+ logger.info(f"[{request_id}] {'=' * 50}")
111
  logger.info(f"[{request_id}] New parse request received")
112
+ safe_filename = re.sub(r"[\r\n\t\x00-\x1f\x7f]", "_", file.filename or "")[:255]
113
  logger.info(f"[{request_id}] Filename: {safe_filename}")
114
  logger.info(f"[{request_id}] Output format: {output_format}")
115
 
116
+ if output_format != "markdown":
117
+ raise HTTPException(status_code=400, detail="Only 'markdown' output_format is supported")
118
+ if start_page < 0:
119
+ raise HTTPException(status_code=400, detail="start_page must be >= 0")
120
+ if end_page is not None and end_page < start_page:
121
+ raise HTTPException(status_code=400, detail="end_page must be >= start_page")
122
 
 
123
  file.file.seek(0, 2)
124
  file_size = file.file.tell()
125
  file.file.seek(0)
 
126
  file_size_mb = file_size / (1024 * 1024)
127
  logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
128
 
 
132
  detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
133
  )
134
 
 
135
  allowed_extensions = {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
136
  file_ext = Path(file.filename).suffix.lower() if file.filename else ""
137
  if file_ext not in allowed_extensions:
138
  raise HTTPException(
139
  status_code=400,
140
+ detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
141
  )
142
 
143
  logger.info(f"[{request_id}] Model: {QWEN_MODEL}")
144
+ logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")
145
 
146
  temp_dir = tempfile.mkdtemp()
 
147
  try:
148
  input_path = Path(temp_dir) / f"input{file_ext}"
149
  await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
 
166
  images_zip, image_count = _create_images_zip(output_dir)
167
 
168
  total_duration = time.time() - start_time
169
+ logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
 
 
 
 
 
 
170
 
171
  return ParseResponse(
172
  success=True,
173
+ markdown=markdown_content,
174
+ json_content=json_content,
175
  images_zip=images_zip,
176
  image_count=image_count,
177
  pages_processed=pages_processed,
178
  device_used="gpu",
179
  vlm_model=QWEN_MODEL,
180
  )
 
181
  except Exception as e:
182
  total_duration = time.time() - start_time
183
+ logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
184
+ return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
 
 
 
 
 
 
185
  finally:
186
  shutil.rmtree(temp_dir, ignore_errors=True)
187
 
 
191
  request: URLParseRequest,
192
  _token: str = Depends(verify_token),
193
  ) -> ParseResponse:
194
+ """Parse a document from URL using local llama.cpp + Unsloth GGUF."""
195
  request_id = str(uuid4())[:8]
196
  start_time = time.time()
197
 
198
+ logger.info(f"[{request_id}] {'=' * 50}")
199
  logger.info(f"[{request_id}] New URL parse request received")
200
  logger.info(f"[{request_id}] URL: {request.url}")
 
201
 
202
+ if request.output_format != "markdown":
203
+ raise HTTPException(status_code=400, detail="Only 'markdown' output_format is supported")
204
+ if request.start_page < 0:
205
+ raise HTTPException(status_code=400, detail="start_page must be >= 0")
206
+ if request.end_page is not None and request.end_page < request.start_page:
207
+ raise HTTPException(status_code=400, detail="end_page must be >= start_page")
208
 
209
  _validate_url(request.url)
210
 
211
  temp_dir = tempfile.mkdtemp()
 
212
  try:
 
 
 
213
  async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
214
  response = await client.get(request.url)
215
  response.raise_for_status()
216
 
217
+ if len(response.content) > MAX_FILE_SIZE_BYTES:
218
+ raise HTTPException(
219
+ status_code=413,
220
+ detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
221
+ )
222
 
 
223
  url_path = Path(request.url.split("?")[0])
224
  file_ext = url_path.suffix.lower()
 
225
  if not file_ext or file_ext not in {".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"}:
226
  content_type = response.headers.get("content-type", "").lower()
227
  ct_map = {
 
233
  }
234
  file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
235
 
 
 
 
 
 
 
236
  input_path = Path(temp_dir) / f"input{file_ext}"
237
  await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
238
 
239
  output_dir = Path(temp_dir) / "output"
240
  output_dir.mkdir(exist_ok=True)
241
 
 
 
 
 
 
242
  markdown_content, json_content, pages_processed, image_count = await asyncio.to_thread(
243
  _convert_document,
244
  input_path,
 
254
  images_zip, image_count = _create_images_zip(output_dir)
255
 
256
  total_duration = time.time() - start_time
257
+ logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
 
 
 
 
 
 
258
 
259
  return ParseResponse(
260
  success=True,
261
+ markdown=markdown_content,
262
+ json_content=json_content,
263
  images_zip=images_zip,
264
  image_count=image_count,
265
  pages_processed=pages_processed,
266
  device_used="gpu",
267
  vlm_model=QWEN_MODEL,
268
  )
 
269
  except httpx.HTTPError as e:
270
+ logger.error(f"[{request_id}] Download failed: {e}")
271
+ return ParseResponse(success=False, error=f"Failed to download file from URL (ref: {request_id})")
 
 
 
 
272
  except Exception as e:
273
  total_duration = time.time() - start_time
274
+ logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
275
+ return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
 
 
 
 
 
 
276
  finally:
277
  shutil.rmtree(temp_dir, ignore_errors=True)
278
 
config.py CHANGED
@@ -1,9 +1,8 @@
1
- """Configuration, environment variables, and logging setup for the Qwen parser."""
2
 
3
  import logging
4
  import os
5
 
6
- # Configure logging
7
  logging.basicConfig(
8
  level=logging.INFO,
9
  format="%(asctime)s | %(levelname)-8s | %(message)s",
@@ -11,23 +10,25 @@ logging.basicConfig(
11
  )
12
  logger = logging.getLogger("docling-parser")
13
 
14
- # Security
15
  API_TOKEN = os.getenv("API_TOKEN")
16
 
17
- # Configuration
18
  IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
19
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
20
  MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
21
  RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
22
 
23
- QWEN_MODEL = os.getenv("QWEN_MODEL", "Qwen/Qwen3-VL-8B-Instruct")
24
- QWEN_MAX_NEW_TOKENS = int(os.getenv("QWEN_MAX_NEW_TOKENS", "1536"))
25
- QWEN_BATCH_SIZE = int(os.getenv("QWEN_BATCH_SIZE", "2"))
26
- QWEN_IMAGE_MAX_SIDE = int(os.getenv("QWEN_IMAGE_MAX_SIDE", "1536"))
27
- QWEN_ATTN_IMPLEMENTATION = os.getenv("QWEN_ATTN_IMPLEMENTATION", "flash_attention_2")
28
- QWEN_TORCH_DTYPE = os.getenv("QWEN_TORCH_DTYPE", "bfloat16")
 
 
 
 
 
29
 
30
- # Blocked hostnames for SSRF protection
31
  BLOCKED_HOSTNAMES = {
32
  "localhost",
33
  "metadata",
 
1
+ """Configuration, environment variables, and logging setup for the Unsloth Qwen parser."""
2
 
3
  import logging
4
  import os
5
 
 
6
  logging.basicConfig(
7
  level=logging.INFO,
8
  format="%(asctime)s | %(levelname)-8s | %(message)s",
 
10
  )
11
  logger = logging.getLogger("docling-parser")
12
 
 
13
  API_TOKEN = os.getenv("API_TOKEN")
14
 
 
15
  IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0"))
16
  MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024"))
17
  MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
18
  RENDER_DPI = int(os.getenv("RENDER_DPI", "200"))
19
 
20
+ QWEN_MODEL = os.getenv("QWEN_MODEL", "unsloth/Qwen3-VL-8B-Instruct-GGUF")
21
+ LLAMA_SERVER_URL = os.getenv("LLAMA_SERVER_URL", "http://127.0.0.1:8080")
22
+ LLAMA_SERVER_TIMEOUT = float(os.getenv("LLAMA_SERVER_TIMEOUT", "300"))
23
+ LLAMA_MAX_TOKENS = int(os.getenv("LLAMA_MAX_TOKENS", "1536"))
24
+ LLAMA_HF_REPO = os.getenv("LLAMA_HF_REPO", "unsloth/Qwen3-VL-8B-Instruct-GGUF")
25
+ LLAMA_HF_FILE = os.getenv("LLAMA_HF_FILE", "Qwen3-VL-8B-Instruct-UD-Q4_K_XL.gguf")
26
+ LLAMA_MMPROJ_FILE = os.getenv("LLAMA_MMPROJ_FILE", "mmproj-F16.gguf")
27
+ LLAMA_CTX_SIZE = int(os.getenv("LLAMA_CTX_SIZE", "8192"))
28
+ LLAMA_GPU_LAYERS = int(os.getenv("LLAMA_GPU_LAYERS", "99"))
29
+ LLAMA_THREADS = int(os.getenv("LLAMA_THREADS", "8"))
30
+ LLAMA_FLASH_ATTN = os.getenv("LLAMA_FLASH_ATTN", "on")
31
 
 
32
  BLOCKED_HOSTNAMES = {
33
  "localhost",
34
  "metadata",
pipeline.py CHANGED
@@ -1,4 +1,4 @@
1
- """Qwen3-VL pipeline, page rendering, and file helpers."""
2
 
3
  import base64
4
  import io
@@ -7,25 +7,18 @@ import zipfile
7
  from pathlib import Path
8
  from typing import BinaryIO, Optional
9
 
10
- import torch
11
- from PIL import Image
12
- from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
13
 
14
  from config import (
15
- QWEN_ATTN_IMPLEMENTATION,
16
- QWEN_BATCH_SIZE,
17
- QWEN_IMAGE_MAX_SIDE,
18
- QWEN_MAX_NEW_TOKENS,
19
  QWEN_MODEL,
20
- QWEN_TORCH_DTYPE,
21
  logger,
22
  )
23
  from postprocess import _post_process_merged_markdown
24
  from rendering import _image_file_to_png_bytes, _pdf_to_page_images
25
 
26
- _model = None
27
- _processor = None
28
-
29
  _OCR_PROMPT = (
30
  "Convert this document page to clean markdown.\n\n"
31
  "Rules:\n"
@@ -39,50 +32,9 @@ _OCR_PROMPT = (
39
  )
40
 
41
 
42
- def _resolve_torch_dtype() -> torch.dtype | str:
43
- """Resolve configured dtype to a torch dtype when possible."""
44
- dtype_map = {
45
- "auto": "auto",
46
- "bfloat16": torch.bfloat16,
47
- "float16": torch.float16,
48
- "float32": torch.float32,
49
- }
50
- return dtype_map.get(QWEN_TORCH_DTYPE.lower(), "auto")
51
-
52
-
53
- def _get_pipeline() -> tuple[Qwen3VLForConditionalGeneration, AutoProcessor]:
54
- """Get or create the global Qwen3-VL pipeline."""
55
- global _model, _processor
56
- if _model is None or _processor is None:
57
- logger.info(f"Loading Qwen model: {QWEN_MODEL}")
58
- _processor = AutoProcessor.from_pretrained(QWEN_MODEL, trust_remote_code=True)
59
- model_kwargs = {
60
- "torch_dtype": _resolve_torch_dtype(),
61
- "device_map": "auto",
62
- "trust_remote_code": True,
63
- }
64
- if QWEN_ATTN_IMPLEMENTATION and QWEN_ATTN_IMPLEMENTATION.lower() != "none":
65
- model_kwargs["attn_implementation"] = QWEN_ATTN_IMPLEMENTATION
66
- try:
67
- _model = Qwen3VLForConditionalGeneration.from_pretrained(
68
- QWEN_MODEL,
69
- **model_kwargs,
70
- )
71
- except Exception as e:
72
- if "attn_implementation" in model_kwargs:
73
- logger.warning(
74
- f"Failed to load Qwen with attn_implementation={QWEN_ATTN_IMPLEMENTATION}: {e}. "
75
- "Retrying without custom attention."
76
- )
77
- model_kwargs.pop("attn_implementation", None)
78
- _model = Qwen3VLForConditionalGeneration.from_pretrained(
79
- QWEN_MODEL,
80
- **model_kwargs,
81
- )
82
- else:
83
- raise
84
- _model.eval()
85
- return _model, _processor
86
 
87
 
88
  def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
@@ -117,98 +69,6 @@ def _create_images_zip(output_dir: Path) -> tuple[Optional[str], int]:
117
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
118
 
119
 
120
- def _resize_image(image: Image.Image) -> Image.Image:
121
- """Downscale images to reduce visual token count and generation latency."""
122
- max_side = max(image.size)
123
- if max_side <= QWEN_IMAGE_MAX_SIDE:
124
- return image
125
-
126
- scale = QWEN_IMAGE_MAX_SIDE / max_side
127
- new_size = (
128
- max(1, int(image.size[0] * scale)),
129
- max(1, int(image.size[1] * scale)),
130
- )
131
- return image.resize(new_size, Image.Resampling.LANCZOS)
132
-
133
-
134
- def _extract_markdown_from_images(
135
- page_images: list[tuple[int, bytes]],
136
- request_id: str,
137
- ) -> dict[int, str]:
138
- """Run a batch of page images through Qwen3-VL."""
139
- model, processor = _get_pipeline()
140
- prompt_texts: list[str] = []
141
- images: list[Image.Image] = []
142
- page_indices: list[int] = []
143
-
144
- for page_idx, image_bytes in page_images:
145
- image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
146
- image = _resize_image(image)
147
- messages = [
148
- {
149
- "role": "user",
150
- "content": [
151
- {"type": "image", "image": image},
152
- {"type": "text", "text": _OCR_PROMPT},
153
- ],
154
- }
155
- ]
156
- prompt_texts.append(
157
- processor.apply_chat_template(
158
- messages,
159
- tokenize=False,
160
- add_generation_prompt=True,
161
- )
162
- )
163
- images.append(image)
164
- page_indices.append(page_idx)
165
-
166
- inputs = processor(
167
- text=prompt_texts,
168
- images=images,
169
- padding=True,
170
- return_tensors="pt",
171
- )
172
-
173
- device = next(model.parameters()).device
174
- model_inputs = {
175
- key: value.to(device) if hasattr(value, "to") else value
176
- for key, value in inputs.items()
177
- }
178
-
179
- with torch.inference_mode():
180
- generated_ids = model.generate(
181
- **model_inputs,
182
- max_new_tokens=QWEN_MAX_NEW_TOKENS,
183
- do_sample=False,
184
- )
185
-
186
- input_lengths = model_inputs["attention_mask"].sum(dim=1).tolist()
187
- decoded_pages: dict[int, str] = {}
188
- for row_idx, prompt_length in enumerate(input_lengths):
189
- output_ids = generated_ids[row_idx : row_idx + 1, int(prompt_length) :]
190
- text = processor.batch_decode(
191
- output_ids,
192
- skip_special_tokens=True,
193
- clean_up_tokenization_spaces=False,
194
- )[0].strip()
195
- page_idx = page_indices[row_idx]
196
- decoded_pages[page_idx] = text
197
- logger.info(f"[{request_id}:page:{page_idx + 1}] Qwen generated {len(text)} chars")
198
-
199
- return decoded_pages
200
-
201
-
202
- def _extract_markdown_from_image(
203
- image_bytes: bytes,
204
- page_label: str,
205
- ) -> str:
206
- """Backwards-compatible single-image wrapper."""
207
- page_idx = 0
208
- page_map = _extract_markdown_from_images([(page_idx, image_bytes)], page_label)
209
- return page_map[page_idx]
210
-
211
-
212
  def _collect_page_images(
213
  input_path: Path,
214
  request_id: str,
@@ -228,6 +88,39 @@ def _collect_page_images(
228
  return [(0, _image_file_to_png_bytes(input_path))]
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def _convert_document(
232
  input_path: Path,
233
  output_dir: Path,
@@ -236,17 +129,15 @@ def _convert_document(
236
  start_page: int = 0,
237
  end_page: Optional[int] = None,
238
  ) -> tuple:
239
- """Render pages and parse them with Qwen3-VL."""
240
  page_images = _collect_page_images(input_path, request_id, start_page, end_page)
241
  if not page_images:
242
  raise ValueError("No pages available to parse")
243
 
244
  markdown_pages: list[str] = []
245
- for batch_start in range(0, len(page_images), QWEN_BATCH_SIZE):
246
- batch = page_images[batch_start : batch_start + QWEN_BATCH_SIZE]
247
- batch_outputs = _extract_markdown_from_images(batch, request_id)
248
- for page_idx, _ in batch:
249
- markdown_pages.append(batch_outputs.get(page_idx, ""))
250
 
251
  markdown_content = "\n\n".join(p for p in markdown_pages if p).strip()
252
  markdown_content = _post_process_merged_markdown(markdown_content)
 
1
+ """Unsloth GGUF Qwen3-VL pipeline and file helpers."""
2
 
3
  import base64
4
  import io
 
7
  from pathlib import Path
8
  from typing import BinaryIO, Optional
9
 
10
+ import httpx
 
 
11
 
12
  from config import (
13
+ LLAMA_MAX_TOKENS,
14
+ LLAMA_SERVER_TIMEOUT,
15
+ LLAMA_SERVER_URL,
 
16
  QWEN_MODEL,
 
17
  logger,
18
  )
19
  from postprocess import _post_process_merged_markdown
20
  from rendering import _image_file_to_png_bytes, _pdf_to_page_images
21
 
 
 
 
22
  _OCR_PROMPT = (
23
  "Convert this document page to clean markdown.\n\n"
24
  "Rules:\n"
 
32
  )
33
 
34
 
35
def _get_pipeline() -> str:
    """Compatibility shim kept for app startup.

    Returns:
        The base URL of the local llama.cpp server that replaces the old
        in-process model pipeline.
    """
    endpoint = LLAMA_SERVER_URL
    return endpoint
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
  def _save_uploaded_file(input_path: Path, file_obj: BinaryIO) -> None:
 
69
  return base64.b64encode(zip_buffer.getvalue()).decode("utf-8"), image_count
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def _collect_page_images(
73
  input_path: Path,
74
  request_id: str,
 
88
  return [(0, _image_file_to_png_bytes(input_path))]
89
 
90
 
91
def _call_llama_server(image_bytes: bytes, page_label: str) -> str:
    """Send a page image to the local llama.cpp OpenAI-compatible server.

    Args:
        image_bytes: PNG-encoded page image.
        page_label: Identifier used only for logging (e.g. "req:page:3").

    Returns:
        The model's markdown transcription of the page; empty string when
        the server returns no usable completion.

    Raises:
        httpx.HTTPStatusError: If the server responds with a non-2xx status.
        httpx.TransportError: On connection or timeout failures.
    """
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "model": QWEN_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _OCR_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                    },
                ],
            }
        ],
        # Deterministic decoding: OCR output should not vary between runs.
        "temperature": 0.0,
        "max_tokens": LLAMA_MAX_TOKENS,
    }

    response = httpx.post(
        f"{LLAMA_SERVER_URL}/v1/chat/completions",
        json=payload,
        timeout=LLAMA_SERVER_TIMEOUT,
    )
    response.raise_for_status()
    data = response.json()
    # Defensive extraction: `data.get("choices", [{}])[0]` would raise
    # IndexError when the server returns `"choices": []` (the .get default
    # is only used when the key is absent), and a present-but-null
    # "content" would make `.strip()` raise AttributeError.
    choices = data.get("choices") or [{}]
    content = choices[0].get("message", {}).get("content") or ""
    text = content.strip()
    logger.info(f"[{page_label}] llama-server generated {len(text)} chars")
    return text
122
+
123
+
124
  def _convert_document(
125
  input_path: Path,
126
  output_dir: Path,
 
129
  start_page: int = 0,
130
  end_page: Optional[int] = None,
131
  ) -> tuple:
132
+ """Render pages and parse them with the local Unsloth GGUF server."""
133
  page_images = _collect_page_images(input_path, request_id, start_page, end_page)
134
  if not page_images:
135
  raise ValueError("No pages available to parse")
136
 
137
  markdown_pages: list[str] = []
138
+ for page_idx, image_bytes in page_images:
139
+ page_label = f"{request_id}:page:{page_idx + 1}"
140
+ markdown_pages.append(_call_llama_server(image_bytes, page_label))
 
 
141
 
142
  markdown_content = "\n\n".join(p for p in markdown_pages if p).strip()
143
  markdown_content = _post_process_merged_markdown(markdown_content)
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- # Qwen3-VL parser API dependencies
2
  fastapi>=0.115.0
3
  uvicorn[standard]>=0.32.0
4
  python-multipart>=0.0.9
@@ -7,8 +7,3 @@ pydantic>=2.0.0
7
  opencv-python-headless>=4.10.0
8
  pdf2image>=1.17.0
9
  huggingface-hub>=0.25.0
10
- Pillow>=10.0.0
11
- accelerate>=0.34.0
12
- torch>=2.4.0
13
- torchvision>=0.19.0
14
- transformers @ git+https://github.com/huggingface/transformers.git
 
1
+ # Unsloth GGUF Qwen3-VL parser API dependencies
2
  fastapi>=0.115.0
3
  uvicorn[standard]>=0.32.0
4
  python-multipart>=0.0.9
 
7
  opencv-python-headless>=4.10.0
8
  pdf2image>=1.17.0
9
  huggingface-hub>=0.25.0
 
 
 
 
 
start.sh CHANGED
@@ -1,7 +1,15 @@
1
  #!/bin/bash
2
- # Start the PaddleOCR-VL + Gemini hybrid parser API
3
- # Single process: FastAPI with PaddleOCR-VL-1.5 loaded in-process
4
- # Note: Dockerfile should ensure this script is executable (chmod +x)
 
 
 
 
 
 
 
 
 
5
 
6
- # Start FastAPI
7
  exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1
 
1
#!/bin/bash
# Launch the llama.cpp GGUF server in the background, wait until it is
# actually listening, then run the FastAPI app in the foreground on 7860.
set -euo pipefail

/home/user/llama.cpp/build/bin/llama-server \
  --host 0.0.0.0 \
  --port 8080 \
  --hf-repo "${LLAMA_HF_REPO}" \
  --hf-file "${LLAMA_HF_FILE}" \
  --mmproj "${LLAMA_MMPROJ_FILE}" \
  --ctx-size "${LLAMA_CTX_SIZE}" \
  --n-gpu-layers "${LLAMA_GPU_LAYERS}" \
  --threads "${LLAMA_THREADS}" \
  --flash-attn "${LLAMA_FLASH_ATTN}" &
LLAMA_PID=$!

# Don't let the API accept traffic before the backend is listening, and
# fail the container fast if the model server dies during startup (e.g.
# model download or GPU init failure). Uses the bash /dev/tcp builtin so
# no extra tooling (curl/nc) is required in the image.
echo "Waiting for llama-server on port 8080..."
until (exec 3<>/dev/tcp/127.0.0.1/8080) 2>/dev/null; do
    if ! kill -0 "${LLAMA_PID}" 2>/dev/null; then
        echo "llama-server exited during startup" >&2
        exit 1
    fi
    sleep 1
done
echo "llama-server is up."

exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 1