"""Configuration, environment variables, and logging setup for the parser.""" import logging import os logging.basicConfig( level=logging.INFO, format="%(asctime)s | %(levelname)-8s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) logger = logging.getLogger("docling-parser") API_TOKEN = os.getenv("API_TOKEN") IMAGES_SCALE = float(os.getenv("IMAGES_SCALE", "2.0")) MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "1024")) MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 RENDER_DPI = int(os.getenv("RENDER_DPI", "200")) DOCLING_DEVICE = os.getenv("DOCLING_DEVICE", "auto") DOCLING_NUM_THREADS = int(os.getenv("DOCLING_NUM_THREADS", "4")) BITMAP_AREA_THRESHOLD = float(os.getenv("BITMAP_AREA_THRESHOLD", "0.05")) SPARSE_TEXT_THRESHOLD = int(os.getenv("SPARSE_TEXT_THRESHOLD", "80")) IMAGE_DOMINANT_THRESHOLD = float(os.getenv("IMAGE_DOMINANT_THRESHOLD", "0.75")) GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "") GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview") GEMINI_TIMEOUT = float(os.getenv("GEMINI_TIMEOUT", "120")) GEMINI_CONCURRENCY = int(os.getenv("GEMINI_CONCURRENCY", "8")) # Concurrency tuning # THREAD_POOL_SIZE: replaces the default asyncio executor (min(32, cpu+4) ≈ 8 # on a 4-vCPU T4). 32 lets the queue drain much faster under burst load. # EXCEL_CONCURRENCY: semaphore cap on simultaneous Excel jobs. Prevents OOM # when many large workbooks arrive at once (openpyxl loads full file into RAM). THREAD_POOL_SIZE = int(os.getenv("THREAD_POOL_SIZE", "32")) EXCEL_CONCURRENCY = int(os.getenv("EXCEL_CONCURRENCY", "20")) BLOCKED_HOSTNAMES = { "localhost", "metadata", "metadata.google.internal", "metadata.google", "169.254.169.254", "fd00:ec2::254", }