fallback_module_trial / Dockerfile
fomext's picture
Upload 2 files
5f2a604 verified
Raw
History Blame Contribute Delete
2.43 kB
# ============================================================
# Qwen3-14B – OpenAI-compatible API – CPU-only Docker image
# ============================================================
FROM python:3.11-slim

# OS-level packages, installed in a single layer:
#   * build-essential + cmake: C/C++ toolchain llama-cpp-python needs
#     whenever pip has to compile it from source.
#   * git, ca-certificates: fetching sources over HTTPS.
#   * wget: used by the HEALTHCHECK probe further down.
# The apt package lists are removed in the same RUN so they never end up
# in the image layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        wget \
 && rm -rf /var/lib/apt/lists/*
# All following relative paths resolve against /app (created if missing).
WORKDIR /app

# Copy only the dependency manifest first, so the dependency-install layer
# stays cached until requirements.txt itself changes.
COPY requirements.txt ./
# ── Python deps (single source of truth: requirements.txt) ───
# Install llama-cpp-python from abetlen's prebuilt CPU wheel index
# instead of compiling it from source: building llama.cpp's C++ tree
# from scratch spawns several parallel compiler processes and was
# OOMing the build (exit 137) on the platform's build container.
#
# --prefer-binary makes pip pick an (older) prebuilt wheel over a newer
# source distribution. Without it, a newer sdist on PyPI beats the wheels
# on the extra index as soon as the version spec allows it, which silently
# brings back the source build this step is trying to avoid.
#
# CMAKE_ARGS / CMAKE_BUILD_PARALLEL_LEVEL only matter if pip still has
# to fall back to a source build (e.g. no matching wheel for this
# platform yet) -- they keep that fallback CPU-only and memory-bounded.
RUN CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_OPENCL=OFF" \
    CMAKE_BUILD_PARALLEL_LEVEL=1 \
    pip install --no-cache-dir --prefer-binary -r requirements.txt \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
# ── App code ─────────────────────────────────────────────────
# Copied after the dependency install so that editing app.py does not
# invalidate the (slow) pip layer above.
COPY app.py ./

# ── Storage ──────────────────────────────────────────────────
# /data is the HF Spaces persistent storage bucket; the model file is
# downloaded there on first boot and reused on later restarts.
# Creating the directory here ensures the path exists in the image.
RUN mkdir -p /data
# ── Runtime env defaults (override with -e or docker-compose) ─
# MODEL_PATH / MODEL_URL : where the GGUF file lives on disk and where it
#                          is fetched from when missing.
# MODEL_ID               : model identifier (presumably the name reported
#                          by the API -- confirm against app.py).
# N_CTX / N_THREADS / N_BATCH / VERBOSE : inference tuning knobs,
#                          presumably forwarded to llama-cpp-python by
#                          app.py -- confirm there.
ENV MODEL_PATH=/data/qwen3-14b-q4_k_m.gguf \
    MODEL_URL=https://huggingface.co/bartowski/Qwen_Qwen3-14B-GGUF/resolve/main/Qwen_Qwen3-14B-Q4_K_M.gguf \
    MODEL_ID=qwen3-14b \
    N_CTX=4096 \
    N_THREADS=8 \
    N_BATCH=512 \
    VERBOSE=false
# Port the uvicorn server below listens on (documentation only; it does
# not publish the port by itself).
EXPOSE 7860

# Health check — /health reports "ready": true once the model is loaded.
# The match tolerates optional whitespace after the colon: compact JSON
# serialisers (e.g. Starlette's default JSONResponse) emit `"ready":true`
# with no space, which a literal `"ready": true` pattern would never match,
# leaving the container permanently unhealthy.
# If wget fails, grep sees empty input and the probe fails; `|| exit 1`
# normalises every failure to exit status 1.
# The long start period leaves time for the first-boot model download.
HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=20 \
    CMD wget -qO- http://localhost:7860/health | grep -Eq '"ready":[[:space:]]*true' || exit 1

# Exec form: uvicorn runs as PID 1 and receives stop signals directly.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]