# docling-parser / config.py
# ibadrehman-outcome's picture
# perf: concurrency improvements for high-volume Excel processing
# 33af535
"""Configuration, environment variables, and logging setup for the parser."""
import logging
import os
# Root-logger settings applied once, when this module is first imported.
# logging.basicConfig() does nothing if the root logger already has handlers.
_LOG_SETTINGS = {
    "level": logging.INFO,
    "format": "%(asctime)s | %(levelname)-8s | %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**_LOG_SETTINGS)

# Named logger exported by this module.
logger = logging.getLogger("docling-parser")
def _env_number(name, default, cast):
    """Read environment variable *name* and convert it with *cast*.

    Args:
        name: Name of the environment variable.
        default: Value returned as-is when the variable is unset or contains
            only whitespace.
        cast: Callable (``int`` or ``float``) applied to the stripped value.

    Returns:
        The converted value, or *default*.

    Raises:
        ValueError: If the variable is set to text that *cast* rejects. The
            message names the variable so a bad deployment setting can be
            located without reading a traceback into this module.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        # Unset and blank (e.g. ``RENDER_DPI=``) both mean "use the default".
        return default
    try:
        return cast(raw.strip())
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name}={raw!r} is not a valid "
            f"{cast.__name__}"
        ) from exc


def _env_int(name, default):
    """Return env var *name* as an ``int`` (see ``_env_number``)."""
    return _env_number(name, default, int)


def _env_float(name, default):
    """Return env var *name* as a ``float`` (see ``_env_number``)."""
    return _env_number(name, default, float)


# API_TOKEN is None when the variable is not set.
API_TOKEN = os.getenv("API_TOKEN")

IMAGES_SCALE = _env_float("IMAGES_SCALE", 2.0)

# Size limit is configured in MiB; the byte value is derived from it.
MAX_FILE_SIZE_MB = _env_int("MAX_FILE_SIZE_MB", 1024)
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024

RENDER_DPI = _env_int("RENDER_DPI", 200)
DOCLING_DEVICE = os.getenv("DOCLING_DEVICE", "auto")
DOCLING_NUM_THREADS = _env_int("DOCLING_NUM_THREADS", 4)

BITMAP_AREA_THRESHOLD = _env_float("BITMAP_AREA_THRESHOLD", 0.05)
SPARSE_TEXT_THRESHOLD = _env_int("SPARSE_TEXT_THRESHOLD", 80)
IMAGE_DOMINANT_THRESHOLD = _env_float("IMAGE_DOMINANT_THRESHOLD", 0.75)

# GEMINI_API_KEY defaults to the empty string (not None) when unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
GEMINI_TIMEOUT = _env_float("GEMINI_TIMEOUT", 120.0)
GEMINI_CONCURRENCY = _env_int("GEMINI_CONCURRENCY", 8)

# Concurrency tuning
# THREAD_POOL_SIZE: replaces the default asyncio executor, whose size is
#   min(32, cpu + 4), i.e. about 8 on a 4-vCPU T4. 32 lets the queue drain
#   much faster under burst load.
# EXCEL_CONCURRENCY: semaphore cap on simultaneous Excel jobs. Prevents OOM
#   when many large workbooks arrive at once (openpyxl loads the full file
#   into RAM).
THREAD_POOL_SIZE = _env_int("THREAD_POOL_SIZE", 32)
EXCEL_CONCURRENCY = _env_int("EXCEL_CONCURRENCY", 20)
# Host names and literal addresses gathered into BLOCKED_HOSTNAMES.
# NOTE(review): presumably consulted by URL/host validation elsewhere in the
# project; the consumer is not visible in this module.
_LOCAL_NAMES = ("localhost",)
_METADATA_NAMES = (
    "metadata",
    "metadata.google.internal",
    "metadata.google",
)
_LITERAL_ADDRESSES = (
    "169.254.169.254",  # IPv4 link-local address (169.254.0.0/16)
    "fd00:ec2::254",  # IPv6 literal; NOTE(review): looks like the EC2 metadata endpoint, confirm
)

# Kept as a plain mutable set, exactly as before.
BLOCKED_HOSTNAMES = {*_LOCAL_NAMES, *_METADATA_NAMES, *_LITERAL_ADDRESSES}