# ============================================================
# Qwen3-14B – OpenAI-compatible API – CPU-only Docker image
# ============================================================
FROM python:3.11-slim

# OS packages:
#   - build-essential / cmake / git: toolchain for llama-cpp-python in case
#     pip has to compile it from source (needs a C++ compiler).
#   - wget / ca-certificates: HTTPS-capable fetch tool; wget is also what the
#     HEALTHCHECK probe runs inside the container.
# update + install + list cleanup stay in one layer so the apt cache never
# lands in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy only the manifest first so the dependency layer below is reused from
# cache until requirements.txt itself changes.
COPY requirements.txt .

# ── Python deps (single source of truth: requirements.txt) ───
# Install llama-cpp-python from abetlen's prebuilt CPU wheel index
# instead of compiling it from source: building llama.cpp's C++ tree
# from scratch spawns several parallel compiler processes and was
# OOMing the build (exit 137) on the platform's build container.
#
# --prefer-binary: with --extra-index-url pip considers PyPI as well and
# normally selects the newest version overall, so a newer sdist on PyPI
# would win over an older prebuilt wheel and trigger exactly the source
# build this step is trying to avoid. --prefer-binary makes pip pick a
# wheel whenever one satisfies the requirement.
#
# CMAKE_ARGS / CMAKE_BUILD_PARALLEL_LEVEL only matter if pip ever has
# to fall back to a source build (e.g. no matching wheel for this
# platform yet) -- they keep that fallback CPU-only and memory-bounded.
RUN CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_OPENCL=OFF" \
    CMAKE_BUILD_PARALLEL_LEVEL=1 \
    pip install --no-cache-dir --prefer-binary -r requirements.txt \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# ── App code ─────────────────────────────────────────────────
# Copied after the dependency install so code edits do not invalidate the
# (slow) pip layer.
COPY app.py .

# ── Storage ──────────────────────────────────────────────────
# /data is the HF Spaces persistent storage bucket.
# Model is downloaded here on first boot and reused on restarts.
# Make sure the model directory exists even when nothing is mounted at /data.
RUN mkdir -p /data

# ── Runtime env defaults (override with -e or docker-compose) ─
# Presumably consumed by app.py (not visible from this file — confirm there):
# where the GGUF file lives, where to fetch it from, the model id, and the
# context / thread / batch / verbosity settings.
ENV MODEL_PATH=/data/qwen3-14b-q4_k_m.gguf \
    MODEL_URL=https://huggingface.co/bartowski/Qwen_Qwen3-14B-GGUF/resolve/main/Qwen_Qwen3-14B-Q4_K_M.gguf \
    MODEL_ID=qwen3-14b \
    N_CTX=4096 \
    N_THREADS=8 \
    N_BATCH=512 \
    VERBOSE=false

# Documentation only: the port uvicorn listens on (see CMD below).
EXPOSE 7860

# Health check — /health returns {"ready": true} once the model is loaded.
# The pattern tolerates any amount of whitespace after the colon: compact
# JSON encoders emit {"ready":true} with no space, which a literal
# '"ready": true' match would never accept, leaving the container
# permanently unhealthy.
# --start-period=600s gives the first-boot model download time to finish
# before failed probes start counting toward --retries.
# Shell form is intentional here: the probe needs a pipe.
HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=20 \
    CMD wget -qO- http://localhost:7860/health | grep -Eq '"ready":[[:space:]]*true' || exit 1

# NOTE(review): no USER is set, so the server runs as root. Left unchanged
# because write access to the mounted /data depends on the deployment —
# confirm ownership there before switching to a non-root user.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]