Spaces:

outcomelabs
/

docling-parser

Running on T4

ibadrehman-outcome committed 6 days ago

Commit

cf7950b

1 Parent(s): c28aa68

feat: add Excel (.xlsx/.xlsm) parsing support via Docling

Adds excel_pipeline.py with a dedicated Docling DocumentConverter for
InputFormat.XLSX. Both /parse and /parse/url endpoints now accept .xlsx
and .xlsm files. The PDF parsing routine is completely unchanged.

Files changed (2) hide show

app.py +64 -29
excel_pipeline.py +69 -0

app.py CHANGED Viewed

@@ -38,6 +38,7 @@ from config import (
     logger,
 )
 from models import HealthResponse, ParseResponse, URLParseRequest
 from pipeline import (
     _convert_document,
     _create_images_zip,
@@ -134,7 +135,7 @@ async def parse_document(
             detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
         )
-    allowed_extensions = {".pdf"}
     file_ext = Path(file.filename).suffix.lower() if file.filename else ""
     if file_ext not in allowed_extensions:
         raise HTTPException(
@@ -142,7 +143,12 @@ async def parse_document(
             detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
         )
-    logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")
     temp_dir = tempfile.mkdtemp()
     try:
@@ -152,19 +158,28 @@ async def parse_document(
         output_dir = Path(temp_dir) / "output"
         output_dir.mkdir(exist_ok=True)
-        markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
-            _convert_document,
-            input_path,
-            output_dir,
-            include_images,
-            request_id,
-            start_page,
-            end_page,
-        )
         images_zip = None
-        if include_images and image_count > 0:
-            images_zip, image_count = _create_images_zip(output_dir)
         total_duration = time.time() - start_time
         logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
@@ -177,7 +192,7 @@ async def parse_document(
             image_count=image_count,
             pages_processed=pages_processed,
             device_used="cpu",
-            vlm_model="Docling + Gemini",
             gemini_page_count=len(gemini_pages),
             gemini_pages=gemini_pages,
         )
@@ -225,32 +240,52 @@ async def parse_document_from_url(
         url_path = Path(request.url.split("?")[0])
         file_ext = url_path.suffix.lower()
-        if not file_ext or file_ext not in {".pdf"}:
             content_type = response.headers.get("content-type", "").lower()
             ct_map = {
                 "application/pdf": ".pdf",
             }
             file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
         input_path = Path(temp_dir) / f"input{file_ext}"
         await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
         output_dir = Path(temp_dir) / "output"
         output_dir.mkdir(exist_ok=True)
-        markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
-            _convert_document,
-            input_path,
-            output_dir,
-            request.include_images,
-            request_id,
-            request.start_page,
-            request.end_page,
-        )
         images_zip = None
-        if request.include_images and image_count > 0:
-            images_zip, image_count = _create_images_zip(output_dir)
         total_duration = time.time() - start_time
         logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
@@ -263,7 +298,7 @@ async def parse_document_from_url(
             image_count=image_count,
             pages_processed=pages_processed,
             device_used="cpu",
-            vlm_model="Docling + Gemini",
             gemini_page_count=len(gemini_pages),
             gemini_pages=gemini_pages,
         )

     logger,
 )
 from models import HealthResponse, ParseResponse, URLParseRequest
+from excel_pipeline import _convert_excel
 from pipeline import (
     _convert_document,
     _create_images_zip,
             detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
         )
+    allowed_extensions = {".pdf", ".xlsx", ".xlsm"}
     file_ext = Path(file.filename).suffix.lower() if file.filename else ""
     if file_ext not in allowed_extensions:
         raise HTTPException(
             detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
         )
+    is_excel = file_ext in {".xlsx", ".xlsm"}
+    if is_excel:
+        logger.info(f"[{request_id}] File type: Excel ({file_ext})")
+    else:
+        logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")
     temp_dir = tempfile.mkdtemp()
     try:
         output_dir = Path(temp_dir) / "output"
         output_dir.mkdir(exist_ok=True)
         images_zip = None
+        image_count = 0
+        gemini_pages: list[int] = []
+        if is_excel:
+            markdown_content, json_content, pages_processed = await asyncio.to_thread(
+                _convert_excel,
+                input_path,
+                request_id,
+            )
+        else:
+            markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
+                _convert_document,
+                input_path,
+                output_dir,
+                include_images,
+                request_id,
+                start_page,
+                end_page,
+            )
+            if include_images and image_count > 0:
+                images_zip, image_count = _create_images_zip(output_dir)
         total_duration = time.time() - start_time
         logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
             image_count=image_count,
             pages_processed=pages_processed,
             device_used="cpu",
+            vlm_model="Docling + Gemini" if not is_excel else "Docling",
             gemini_page_count=len(gemini_pages),
             gemini_pages=gemini_pages,
         )
         url_path = Path(request.url.split("?")[0])
         file_ext = url_path.suffix.lower()
+        allowed_extensions = {".pdf", ".xlsx", ".xlsm"}
+        if not file_ext or file_ext not in allowed_extensions:
             content_type = response.headers.get("content-type", "").lower()
             ct_map = {
                 "application/pdf": ".pdf",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
+                "application/vnd.ms-excel.sheet.macroenabled.12": ".xlsm",
             }
             file_ext = next((v for k, v in ct_map.items() if k in content_type), ".pdf")
+        if file_ext not in allowed_extensions:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
+            )
+        is_excel = file_ext in {".xlsx", ".xlsm"}
         input_path = Path(temp_dir) / f"input{file_ext}"
         await asyncio.to_thread(_save_downloaded_content, input_path, response.content)
         output_dir = Path(temp_dir) / "output"
         output_dir.mkdir(exist_ok=True)
         images_zip = None
+        image_count = 0
+        gemini_pages: list[int] = []
+        if is_excel:
+            markdown_content, json_content, pages_processed = await asyncio.to_thread(
+                _convert_excel,
+                input_path,
+                request_id,
+            )
+        else:
+            markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
+                _convert_document,
+                input_path,
+                output_dir,
+                request.include_images,
+                request_id,
+                request.start_page,
+                request.end_page,
+            )
+            if request.include_images and image_count > 0:
+                images_zip, image_count = _create_images_zip(output_dir)
         total_duration = time.time() - start_time
         logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
             image_count=image_count,
             pages_processed=pages_processed,
             device_used="cpu",
+            vlm_model="Docling + Gemini" if not is_excel else "Docling",
             gemini_page_count=len(gemini_pages),
             gemini_pages=gemini_pages,
         )

excel_pipeline.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""Excel document parsing pipeline using Docling.
+Uses Docling's native InputFormat.XLSX support to convert Excel workbooks
+to markdown. Each worksheet is treated as a page in the DoclingDocument.
+This is intentionally separate from the PDF pipeline (_get_converter /
+_convert_document in pipeline.py) and does not share state with it.
+"""
+import time
+from pathlib import Path
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+from config import logger
+_excel_converter: DocumentConverter | None = None
+def _get_excel_converter() -> DocumentConverter:
+    """Get or create the global Docling converter for Excel files.
+    A separate singleton from the PDF converter so Excel and PDF
+    processing never share pipeline state.
+    """
+    global _excel_converter
+    if _excel_converter is None:
+        _excel_converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
+        logger.info("Docling Excel converter initialised")
+    return _excel_converter
+def _convert_excel(
+    input_path: Path,
+    request_id: str,
+) -> tuple[str, None, int]:
+    """Convert an Excel workbook (.xlsx / .xlsm) to markdown via Docling.
+    Args:
+        input_path: Path to the workbook file.
+        request_id: Short request ID used for log correlation.
+    Returns:
+        A 3-tuple of:
+          - markdown_content: Docling-generated markdown for all sheets.
+          - json_content:     None (reserved, consistent with PDF pipeline).
+          - sheets_processed: Number of pages (worksheets) Docling processed.
+    Raises:
+        Exception: Re-raises any Docling conversion error for the caller
+                   to handle and surface as a 500 response.
+    """
+    t_start = time.time()
+    logger.info(f"[{request_id}] Starting Excel conversion: {input_path.name}")
+    converter = _get_excel_converter()
+    result = converter.convert(str(input_path))
+    markdown = result.document.export_to_markdown()
+    sheets_processed = len(result.document.pages)
+    elapsed = time.time() - t_start
+    logger.info(
+        f"[{request_id}] Excel conversion complete: "
+        f"{sheets_processed} sheet(s) in {elapsed:.2f}s"
+    )
+    return markdown, None, sheets_processed