Spaces:
Sleeping
Sleeping
| # ============================================================ | |
| # Qwen3-14B — OpenAI-compatible API — CPU-only Docker image | |
| # ============================================================ | |
| # Base image: slim Python 3.11. | |
| FROM python:3.11-slim | |
| # OS packages, installed in one layer: | |
| #   - build-essential, cmake, git: toolchain for llama-cpp-python in case pip | |
| #     has to compile it from source (needs a C++ compiler) | |
| #   - wget: used by the HEALTHCHECK probe in this file | |
| #   - ca-certificates: TLS trust store for HTTPS downloads | |
| # The apt package lists are deleted in the same layer so they never reach the image. | |
| RUN set -e; \ | |
| apt-get update; \ | |
| apt-get install -y --no-install-recommends \ | |
| build-essential \ | |
| ca-certificates \ | |
| cmake \ | |
| git \ | |
| wget; \ | |
| rm -rf /var/lib/apt/lists/* | |
| WORKDIR /app | |
| # Copy only the manifest first so the dependency layer stays cached until | |
| # requirements.txt itself changes. | |
| COPY requirements.txt . | |
| # --- Python deps (single source of truth: requirements.txt) --- | |
| # Install llama-cpp-python from abetlen's prebuilt CPU wheel index | |
| # instead of compiling it from source: building llama.cpp's C++ tree | |
| # from scratch spawns several parallel compiler processes and was | |
| # OOMing the build (exit 137) on the platform's build container. | |
| # --prefer-binary makes pip pick a prebuilt wheel even when PyPI offers a | |
| # newer source-only release; without it, --extra-index-url alone lets pip | |
| # choose the newer sdist and trigger exactly the source build described above. | |
| # CMAKE_ARGS / CMAKE_BUILD_PARALLEL_LEVEL only matter if pip ever has | |
| # to fall back to a source build (e.g. no matching wheel for this | |
| # platform yet) -- they keep that fallback CPU-only and memory-bounded. | |
| RUN CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_OPENCL=OFF" \ | |
| CMAKE_BUILD_PARALLEL_LEVEL=1 \ | |
| pip install --no-cache-dir --prefer-binary -r requirements.txt \ | |
| --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu | |
| # --- App code --- | |
| # Copied after the dependency install so editing app.py does not invalidate | |
| # the (slow) pip layer. | |
| COPY app.py ./ | |
| # --- Storage --- | |
| # /data is the HF Spaces persistent storage bucket. | |
| # Model is downloaded here on first boot and reused on restarts. | |
| # Hugging Face Spaces runs Docker containers as UID 1000, so the directory is | |
| # handed to that UID: when no persistent bucket is mounted over /data, a | |
| # root-owned directory would make the first-boot download fail with a | |
| # permission error. Running as root (plain `docker run`) is unaffected. | |
| RUN mkdir -p /data && chown 1000:1000 /data | |
| # --- Runtime env defaults (override with -e or docker-compose) --- | |
| # Model identity: where the GGUF file lives, where it is fetched from, and the | |
| # model id (presumably consumed by app.py -- confirm there). | |
| ENV MODEL_PATH=/data/qwen3-14b-q4_k_m.gguf \ | |
| MODEL_URL=https://huggingface.co/bartowski/Qwen_Qwen3-14B-GGUF/resolve/main/Qwen_Qwen3-14B-Q4_K_M.gguf \ | |
| MODEL_ID=qwen3-14b | |
| # Inference tuning knobs and logging switch (presumably passed through to | |
| # llama-cpp-python by app.py -- confirm there). | |
| ENV N_CTX=4096 \ | |
| N_THREADS=8 \ | |
| N_BATCH=512 \ | |
| VERBOSE=false | |
| # Port the API listens on (documentation only; matches --port in CMD below). | |
| EXPOSE 7860 | |
| # Health check -- /health returns {"ready": true} once the model is loaded. | |
| # The match tolerates any whitespace after the colon: JSON serializers differ | |
| # (Starlette/FastAPI's default response emits compact {"ready":true}), and a | |
| # literal match on '"ready": true' would then never succeed. | |
| # The long start period and retry budget leave room for the first-boot model | |
| # download before the container is marked unhealthy. | |
| HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=20 \ | |
| CMD wget -qO- http://localhost:7860/health | grep -Eq '"ready":[[:space:]]*true' || exit 1 | |
| # Exec form so uvicorn runs as PID 1 and receives stop signals directly. | |
| CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"] | |