# Spaces: Running on T4
"""Configuration, environment variables, and logging setup for the parser.

Every setting is read from the process environment once, at import time.
String settings use ``os.getenv`` directly. Numeric settings go through
``_env_int`` / ``_env_float``: an unset or blank variable falls back to the
default, and a malformed value raises ``ValueError`` naming the variable.
"""

import logging
import os

# Root logging is configured as an import-time side effect of this module.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)-8s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("docling-parser")


def _env_number(name, default, cast):
    """Read environment variable *name* and convert it with *cast*.

    Args:
        name: Environment variable to look up.
        default: Value returned as-is when the variable is unset or contains
            only whitespace (e.g. ``FOO=`` in an env file).
        cast: ``int`` or ``float``.

    Returns:
        The converted value, or *default*.

    Raises:
        ValueError: The variable is set to something *cast* cannot parse.
            The message names the variable so a bad deployment setting is
            easy to locate.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return cast(raw)
    except ValueError as err:
        raise ValueError(
            f"Environment variable {name}={raw!r} is not a valid {cast.__name__}"
        ) from err


def _env_int(name: str, default: int) -> int:
    """Return integer environment variable *name*, or *default* if unset/blank."""
    return _env_number(name, default, int)


def _env_float(name: str, default: float) -> float:
    """Return float environment variable *name*, or *default* if unset/blank."""
    return _env_number(name, default, float)


# None when API_TOKEN is not set in the environment.
API_TOKEN = os.getenv("API_TOKEN")

IMAGES_SCALE = _env_float("IMAGES_SCALE", 2.0)

# The size limit is configured in megabytes; the byte value is derived from it.
MAX_FILE_SIZE_MB = _env_int("MAX_FILE_SIZE_MB", 1024)
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024

RENDER_DPI = _env_int("RENDER_DPI", 200)

DOCLING_DEVICE = os.getenv("DOCLING_DEVICE", "auto")
DOCLING_NUM_THREADS = _env_int("DOCLING_NUM_THREADS", 4)

BITMAP_AREA_THRESHOLD = _env_float("BITMAP_AREA_THRESHOLD", 0.05)
SPARSE_TEXT_THRESHOLD = _env_int("SPARSE_TEXT_THRESHOLD", 80)
IMAGE_DOMINANT_THRESHOLD = _env_float("IMAGE_DOMINANT_THRESHOLD", 0.75)

# Unlike API_TOKEN, the Gemini key defaults to an empty string, not None.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-3-flash-preview")
GEMINI_TIMEOUT = _env_float("GEMINI_TIMEOUT", 120.0)
GEMINI_CONCURRENCY = _env_int("GEMINI_CONCURRENCY", 8)

# Concurrency tuning
# THREAD_POOL_SIZE: replaces the default asyncio executor (min(32, cpu+4) ≈ 8
# on a 4-vCPU T4). 32 lets the queue drain much faster under burst load.
# EXCEL_CONCURRENCY: semaphore cap on simultaneous Excel jobs. Prevents OOM
# when many large workbooks arrive at once (openpyxl loads full file into RAM).
THREAD_POOL_SIZE = _env_int("THREAD_POOL_SIZE", 32)
EXCEL_CONCURRENCY = _env_int("EXCEL_CONCURRENCY", 20)

# Loopback and cloud metadata-service names/addresses.
# NOTE(review): presumably checked against user-supplied URL hosts before
# fetching -- confirm against the caller.
BLOCKED_HOSTNAMES = {
    "localhost",
    "metadata",
    "metadata.google.internal",
    "metadata.google",
    "169.254.169.254",
    "fd00:ec2::254",
}