Spaces:
Running on T4
Running on T4
| """ | |
| Docling VLM Parser API v6.0.0 | |
| A FastAPI service using a Docling-first + Gemini hybrid architecture for | |
| document parsing: | |
| Pass 1: Docling on a PDF slice or full input (no OCR) | |
| Pass 2 (API): Gemini on table pages and weak-text pages | |
| Post: Cross-page artifact removal, table cleanup, deduplication | |
| """ | |
| import asyncio | |
| import concurrent.futures | |
| import re | |
| import shutil | |
| import tempfile | |
| import time | |
| from contextlib import asynccontextmanager | |
| from pathlib import Path | |
| from typing import Optional | |
| from uuid import uuid4 | |
| import httpx | |
| from fastapi import Depends, FastAPI, File, Form, HTTPException, UploadFile | |
| from auth import _validate_url, verify_token | |
| from config import ( | |
| BITMAP_AREA_THRESHOLD, | |
| DOCLING_DEVICE, | |
| DOCLING_NUM_THREADS, | |
| EXCEL_CONCURRENCY, | |
| GEMINI_API_KEY, | |
| GEMINI_CONCURRENCY, | |
| GEMINI_MODEL, | |
| IMAGE_DOMINANT_THRESHOLD, | |
| IMAGES_SCALE, | |
| MAX_FILE_SIZE_BYTES, | |
| MAX_FILE_SIZE_MB, | |
| RENDER_DPI, | |
| SPARSE_TEXT_THRESHOLD, | |
| THREAD_POOL_SIZE, | |
| logger, | |
| ) | |
# Process-wide gate on concurrent Excel conversions. openpyxl loads an entire
# workbook into RAM, so letting many large workbooks convert at once can
# exhaust memory; at most EXCEL_CONCURRENCY conversions run at a time in this
# worker, regardless of how many requests are in flight.
# NOTE(review): on Python < 3.10 asyncio primitives bind to the event loop that
# is current at construction time — confirm the deployment runtime is 3.10+.
_excel_semaphore = asyncio.Semaphore(value=EXCEL_CONCURRENCY)
| from models import HealthResponse, ParseResponse, URLParseRequest | |
| from excel_pipeline import _convert_excel | |
| from pipeline import ( | |
| _convert_document, | |
| _create_images_zip, | |
| _get_converter, | |
| _save_downloaded_content, | |
| _save_uploaded_file, | |
| ) | |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application startup and shutdown.

    Startup installs a dedicated thread pool as the running loop's default
    executor, eagerly builds the Docling converter via ``_get_converter()``,
    and logs the effective configuration. Shutdown releases the thread pool.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. Control returns to the server while the application runs.
    """
    # Replace the default asyncio executor (min(32, cpu+4) ≈ 8 on T4) with a
    # larger pool so burst Excel/PDF requests drain the queue faster instead of
    # stacking up waiting for a free thread.
    executor = concurrent.futures.ThreadPoolExecutor(
        max_workers=THREAD_POOL_SIZE,
        thread_name_prefix="parser",
    )
    asyncio.get_running_loop().set_default_executor(executor)
    try:
        logger.info("=" * 60)
        logger.info("Starting Docling VLM Parser API v6.0.0...")
        logger.info(f"Thread pool size: {THREAD_POOL_SIZE}")
        logger.info(f"Excel concurrency cap: {EXCEL_CONCURRENCY}")
        logger.info("Initializing Docling converter...")
        # Build the converter once at startup so the first request does not
        # pay the initialization cost.
        _get_converter()
        logger.info("Docling converter ready")
        logger.info(f"Render DPI: {RENDER_DPI}")
        logger.info(f"Images scale: {IMAGES_SCALE}")
        logger.info(f"Max file size: {MAX_FILE_SIZE_MB}MB")
        logger.info(f"Docling device: {DOCLING_DEVICE}")
        logger.info(f"Docling threads: {DOCLING_NUM_THREADS}")
        logger.info(f"Bitmap area threshold: {BITMAP_AREA_THRESHOLD}")
        logger.info(f"Sparse text threshold: {SPARSE_TEXT_THRESHOLD}")
        logger.info(f"Image dominant threshold: {IMAGE_DOMINANT_THRESHOLD}")
        logger.info(f"Gemini Model: {GEMINI_MODEL}")
        logger.info(f"Gemini API Key: {'configured' if GEMINI_API_KEY else 'NOT SET'}")
        logger.info(f"Gemini Concurrency: {GEMINI_CONCURRENCY}")
        logger.info("=" * 60)
        logger.info("Docling VLM Parser API ready (Docling-first + Gemini hybrid)")
        logger.info("=" * 60)
        yield
    finally:
        # Runs on normal shutdown, on a startup failure, and when an exception
        # is thrown into the generator at the yield, so the pool never leaks.
        logger.info("Shutting down Docling VLM Parser API...")
        executor.shutdown(wait=False)
# ASGI application object. Startup and shutdown work is delegated to
# `lifespan`; the version string matches the one reported by the health check.
app = FastAPI(
    lifespan=lifespan,
    version="6.0.0",
    title="Docling VLM Parser API",
    description="Docling-first + Gemini hybrid parser",
)
# NOTE(review): no route decorator is visible on this handler in the reviewed
# source — confirm it is registered on `app`.
async def health_check() -> HealthResponse:
    """Report service liveness and basic configuration.

    Returns:
        A HealthResponse carrying a fixed "healthy" status, the API version,
        the parser label, whether a Gemini API key is configured, and the
        configured image scale.
    """
    # Only the presence of the key is reported, never its value.
    gemini_state = "configured" if GEMINI_API_KEY else "not set"
    payload = HealthResponse(
        status="healthy",
        version="6.0.0",
        model="Docling + Gemini",
        gemini_status=gemini_state,
        images_scale=IMAGES_SCALE,
    )
    return payload
# NOTE(review): no route decorator is visible on this handler in the reviewed
# source — confirm it is registered on `app`.
async def parse_document(
    file: UploadFile = File(..., description="PDF file to parse"),
    output_format: str = Form(default="markdown", description="Output format: markdown only"),
    images_scale: Optional[float] = Form(default=None, description="Reserved for compatibility"),
    start_page: int = Form(default=0, description="Starting page (0-indexed)"),
    end_page: Optional[int] = Form(default=None, description="Ending page (None = all pages)"),
    include_images: bool = Form(default=False, description="Include extracted images"),
    _token: str = Depends(verify_token),
) -> ParseResponse:
    """Parse an uploaded PDF or Excel file using the hybrid parser.

    Args:
        file: The uploaded document (.pdf, .xlsx or .xlsm).
        output_format: Must be "markdown".
        images_scale: Accepted for compatibility; not used by this handler.
        start_page: First page to process (0-indexed); applies to PDFs.
        end_page: Last page to process, or None for all pages; applies to PDFs.
        include_images: When true, extracted PDF images are zipped into the
            response.
        _token: Result of the ``verify_token`` dependency (unused here).

    Returns:
        A ParseResponse. Failures during conversion are reported as
        ``success=False`` with a reference id rather than an HTTP error.

    Raises:
        HTTPException: 400 for an unsupported output format, an invalid page
            range, or an unsupported file extension; 413 when the upload
            exceeds the configured size limit.
    """
    request_id = str(uuid4())[:8]
    start_time = time.time()
    logger.info(f"[{request_id}] {'=' * 50}")
    logger.info(f"[{request_id}] New parse request received")
    # Strip control characters so a crafted filename cannot forge log lines.
    safe_filename = re.sub(r"[\r\n\t\x00-\x1f\x7f]", "_", file.filename or "")[:255]
    logger.info(f"[{request_id}] Filename: {safe_filename}")
    logger.info(f"[{request_id}] Output format: {output_format}")

    if output_format != "markdown":
        raise HTTPException(status_code=400, detail="Only 'markdown' output_format is supported")
    if start_page < 0:
        raise HTTPException(status_code=400, detail="start_page must be >= 0")
    if end_page is not None and end_page < start_page:
        raise HTTPException(status_code=400, detail="end_page must be >= start_page")

    # Measure the upload by seeking to its end, then rewind for the later copy.
    file.file.seek(0, 2)
    file_size = file.file.tell()
    file.file.seek(0)
    file_size_mb = file_size / (1024 * 1024)
    logger.info(f"[{request_id}] File size: {file_size_mb:.2f} MB")
    if file_size > MAX_FILE_SIZE_BYTES:
        raise HTTPException(
            status_code=413,
            detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
        )

    allowed_extensions = {".pdf", ".xlsx", ".xlsm"}
    file_ext = Path(file.filename).suffix.lower() if file.filename else ""
    if file_ext not in allowed_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file type. Allowed: {', '.join(sorted(allowed_extensions))}",
        )

    is_excel = file_ext in {".xlsx", ".xlsm"}
    if is_excel:
        logger.info(f"[{request_id}] File type: Excel ({file_ext})")
    else:
        logger.info(f"[{request_id}] Page range: {start_page} to {end_page if end_page is not None else 'end'}")

    temp_dir = tempfile.mkdtemp()
    try:
        # The on-disk name is fixed; only the validated extension comes from
        # the client, so the user-supplied filename never reaches the path.
        input_path = Path(temp_dir) / f"input{file_ext}"
        await asyncio.to_thread(_save_uploaded_file, input_path, file.file)
        output_dir = Path(temp_dir) / "output"
        output_dir.mkdir(exist_ok=True)

        images_zip = None
        image_count = 0
        gemini_pages: list[int] = []

        if is_excel:
            # Cap concurrent workbook conversions to bound memory use.
            async with _excel_semaphore:
                markdown_content, json_content, pages_processed = await asyncio.to_thread(
                    _convert_excel,
                    input_path,
                    request_id,
                )
        else:
            markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
                _convert_document,
                input_path,
                output_dir,
                include_images,
                request_id,
                start_page,
                end_page,
            )
            if include_images and image_count > 0:
                # Zipping is blocking work; run it off the event loop like the
                # conversion itself so other requests are not stalled.
                images_zip, image_count = await asyncio.to_thread(_create_images_zip, output_dir)

        total_duration = time.time() - start_time
        logger.info(f"[{request_id}] Request completed successfully in {total_duration:.2f}s")
        return ParseResponse(
            success=True,
            markdown=markdown_content,
            json_content=json_content,
            images_zip=images_zip,
            image_count=image_count,
            pages_processed=pages_processed,
            device_used="cpu",
            vlm_model="Docling + Gemini" if not is_excel else "openpyxl",
            gemini_page_count=len(gemini_pages),
            gemini_pages=gemini_pages,
        )
    except Exception as e:
        # Top-level boundary: log full details server-side and return only an
        # opaque reference id to the client.
        total_duration = time.time() - start_time
        logger.error(f"[{request_id}] Request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
        return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
async def _download_with_limit(url: str, max_bytes: int, max_redirects: int = 20) -> tuple[bytes, str]:
    """Download *url* with a hard size cap, re-validating every redirect hop.

    Redirects are followed manually so each target is passed through
    ``_validate_url`` before it is requested; automatic redirect following
    would let an allowed URL bounce to a target that was never validated.
    The body is streamed and the transfer is aborted as soon as it exceeds
    *max_bytes*, so an oversized response is never fully buffered.

    Args:
        url: The already-validated URL to fetch.
        max_bytes: Maximum number of body bytes to accept.
        max_redirects: Maximum number of redirect hops to follow.

    Returns:
        A ``(content, content_type)`` tuple; ``content_type`` is the
        lower-cased Content-Type header, or "" when absent.

    Raises:
        HTTPException: 413 when the declared or actual body size exceeds
            *max_bytes*.
        httpx.HTTPError: On transport errors, non-success statuses, or too
            many redirects.
    """
    too_large = HTTPException(
        status_code=413,
        detail=f"File size exceeds maximum allowed size of {MAX_FILE_SIZE_MB}MB",
    )
    async with httpx.AsyncClient(timeout=60.0, follow_redirects=False) as client:
        for _ in range(max_redirects + 1):
            async with client.stream("GET", url) as response:
                next_request = response.next_request
                if next_request is not None:
                    # Redirect: validate the new target before requesting it.
                    # `_validate_url` presumably raises on disallowed targets.
                    url = str(next_request.url)
                    _validate_url(url)
                    continue
                response.raise_for_status()

                # Reject early when the server declares an oversized body.
                declared = response.headers.get("content-length", "")
                if declared.isdigit() and int(declared) > max_bytes:
                    raise too_large

                chunks: list[bytes] = []
                received = 0
                async for chunk in response.aiter_bytes():
                    received += len(chunk)
                    if received > max_bytes:
                        raise too_large
                    chunks.append(chunk)
                content_type = response.headers.get("content-type", "").lower()
                return b"".join(chunks), content_type
    raise httpx.TooManyRedirects("Exceeded maximum allowed redirects.")


# NOTE(review): no route decorator is visible on this handler in the reviewed
# source — confirm it is registered on `app`.
async def parse_document_from_url(
    request: URLParseRequest,
    _token: str = Depends(verify_token),
) -> ParseResponse:
    """Download a PDF or Excel file from a URL and parse it.

    Args:
        request: Request body; this handler reads ``url``, ``output_format``,
            ``start_page``, ``end_page`` and ``include_images``.
        _token: Result of the ``verify_token`` dependency (unused here).

    Returns:
        A ParseResponse. Download and conversion failures are reported as
        ``success=False`` with a reference id rather than an HTTP error.

    Raises:
        HTTPException: 400 for an unsupported output format or an invalid page
            range; 413 when the download exceeds the configured size limit;
            plus whatever ``_validate_url`` raises for a rejected URL.
    """
    request_id = str(uuid4())[:8]
    start_time = time.time()
    logger.info(f"[{request_id}] {'=' * 50}")
    logger.info(f"[{request_id}] New URL parse request received")
    # Strip control characters so a crafted URL cannot forge log lines.
    safe_url = re.sub(r"[\r\n\t\x00-\x1f\x7f]", "_", request.url)[:2048]
    logger.info(f"[{request_id}] URL: {safe_url}")

    if request.output_format != "markdown":
        raise HTTPException(status_code=400, detail="Only 'markdown' output_format is supported")
    if request.start_page < 0:
        raise HTTPException(status_code=400, detail="start_page must be >= 0")
    if request.end_page is not None and request.end_page < request.start_page:
        raise HTTPException(status_code=400, detail="end_page must be >= start_page")
    _validate_url(request.url)

    temp_dir = tempfile.mkdtemp()
    try:
        content, content_type = await _download_with_limit(request.url, MAX_FILE_SIZE_BYTES)

        # Prefer the extension in the URL path; otherwise fall back to the
        # Content-Type header, and finally to PDF.
        allowed_extensions = {".pdf", ".xlsx", ".xlsm"}
        file_ext = Path(request.url.split("?")[0]).suffix.lower()
        if file_ext not in allowed_extensions:
            ct_map = {
                "application/pdf": ".pdf",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
                "application/vnd.ms-excel.sheet.macroenabled.12": ".xlsm",
            }
            # Every fallback value is an allowed extension, so no further
            # "unsupported type" check is needed after this point.
            file_ext = next((ext for mime, ext in ct_map.items() if mime in content_type), ".pdf")

        is_excel = file_ext in {".xlsx", ".xlsm"}
        input_path = Path(temp_dir) / f"input{file_ext}"
        await asyncio.to_thread(_save_downloaded_content, input_path, content)
        output_dir = Path(temp_dir) / "output"
        output_dir.mkdir(exist_ok=True)

        images_zip = None
        image_count = 0
        gemini_pages: list[int] = []

        if is_excel:
            # Cap concurrent workbook conversions to bound memory use.
            async with _excel_semaphore:
                markdown_content, json_content, pages_processed = await asyncio.to_thread(
                    _convert_excel,
                    input_path,
                    request_id,
                )
        else:
            markdown_content, json_content, pages_processed, image_count, gemini_pages = await asyncio.to_thread(
                _convert_document,
                input_path,
                output_dir,
                request.include_images,
                request_id,
                request.start_page,
                request.end_page,
            )
            if request.include_images and image_count > 0:
                # Zipping is blocking work; keep it off the event loop.
                images_zip, image_count = await asyncio.to_thread(_create_images_zip, output_dir)

        total_duration = time.time() - start_time
        logger.info(f"[{request_id}] URL request completed successfully in {total_duration:.2f}s")
        return ParseResponse(
            success=True,
            markdown=markdown_content,
            json_content=json_content,
            images_zip=images_zip,
            image_count=image_count,
            pages_processed=pages_processed,
            device_used="cpu",
            vlm_model="Docling + Gemini" if not is_excel else "openpyxl",
            gemini_page_count=len(gemini_pages),
            gemini_pages=gemini_pages,
        )
    except HTTPException:
        # Deliberate HTTP errors (e.g. 413, rejected redirect target) must
        # reach the client with their status code instead of being folded
        # into a generic failure body by the handler below.
        raise
    except httpx.HTTPError as e:
        logger.error(f"[{request_id}] Download failed: {e}")
        return ParseResponse(success=False, error=f"Failed to download file from URL (ref: {request_id})")
    except Exception as e:
        # Top-level boundary: log full details server-side and return only an
        # opaque reference id to the client.
        total_duration = time.time() - start_time
        logger.error(f"[{request_id}] URL request failed after {total_duration:.2f}s: {type(e).__name__}: {e}", exc_info=True)
        return ParseResponse(success=False, error=f"Processing failed (ref: {request_id})")
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
if __name__ == "__main__":
    # Direct-execution entry point: serve the app on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)