Spaces:

fomext
/

fallback_module_trial

Sleeping

App Files Files Community

fallback_module_trial / Dockerfile

fomext

Upload 2 files

5f2a604 verified 17 days ago

Raw

History Blame Contribute Delete

2.43 kB

	# ============================================================
	# Qwen3-14B – OpenAI-compatible API – CPU-only Docker image
	# ============================================================
	FROM python:3.11-slim

	# OS packages (alphabetised so additions diff cleanly).
	# build-essential + cmake give pip a C/C++ toolchain should llama-cpp-python
	# ever need to be compiled; wget is also what the HEALTHCHECK probe invokes.
	# The apt package lists are deleted in this same layer so they never
	# end up in the image.
	RUN apt-get update \
	    && apt-get install -y --no-install-recommends \
	        build-essential \
	        ca-certificates \
	        cmake \
	        git \
	        wget \
	    && rm -rf /var/lib/apt/lists/*

	WORKDIR /app

	# Copy only the dependency manifest first so the (slow) install layer below
	# stays cached until requirements.txt itself changes.
	COPY requirements.txt .

	# ── Python deps (single source of truth: requirements.txt) ───
	# Install llama-cpp-python from abetlen's prebuilt CPU wheel index
	# instead of compiling it from source: building llama.cpp's C++ tree
	# from scratch spawns several parallel compiler processes and was
	# OOMing the build (exit 137) on the platform's build container.
	#
	# --prefer-binary makes pip pick an available wheel even when PyPI
	# offers a newer source-only release; without it pip selects the newest
	# version overall and can silently fall back to the source build this
	# step is trying to avoid.
	#
	# CMAKE_ARGS / CMAKE_BUILD_PARALLEL_LEVEL only matter if pip still has
	# to build from source (e.g. no matching wheel for this platform yet)
	# -- they keep that fallback CPU-only and memory-bounded.
	RUN CMAKE_ARGS="-DGGML_CUDA=OFF -DGGML_METAL=OFF -DGGML_OPENCL=OFF" \
	    CMAKE_BUILD_PARALLEL_LEVEL=1 \
	    pip install --no-cache-dir --prefer-binary -r requirements.txt \
	        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

	# ── App code ─────────────────────────────────────────────────
	# Copied after the dependency install so editing app.py does not
	# invalidate the pip layer.
	COPY app.py .

	# ── Storage ──────────────────────────────────────────────────
	# /data is the HF Spaces persistent storage bucket.
	# Model is downloaded here on first boot and reused on restarts.
	#
	# HF Spaces runs Docker containers as UID 1000, not root. When no
	# persistent volume is mounted over /data, the directory baked into the
	# image is what the app sees, so it must be writable by that UID or the
	# first-boot download fails with a permission error. Ownership is set in
	# the same layer that creates the directory; running as root elsewhere is
	# unaffected.
	RUN mkdir -p /data \
	    && chown 1000:1000 /data

	# ── Runtime env defaults (override with -e or docker-compose) ─
	# Model artefact: where the GGUF file lives locally, where it is fetched
	# from, and the identifier for the model.
	# NOTE(review): presumably all of these are read by app.py — confirm there.
	ENV MODEL_PATH=/data/qwen3-14b-q4_k_m.gguf \
	    MODEL_URL=https://huggingface.co/bartowski/Qwen_Qwen3-14B-GGUF/resolve/main/Qwen_Qwen3-14B-Q4_K_M.gguf \
	    MODEL_ID=qwen3-14b

	# Inference tuning and logging defaults.
	ENV N_CTX=4096 \
	    N_THREADS=8 \
	    N_BATCH=512 \
	    VERBOSE=false

	# Port the server listens on (documentation only; must match --port below).
	EXPOSE 7860

	# Health check — /health returns {"ready": true} once the model is loaded.
	# The probe is a real shell pipeline: wget prints the response body and
	# grep decides the result. The pattern tolerates any whitespace around the
	# colon so both pretty-printed and compact JSON ({"ready":true}) match.
	# If wget fails, grep sees empty input and the probe reports unhealthy;
	# `|| exit 1` normalises every failure to the exit code Docker expects.
	# The long start period leaves room for the first-boot model download.
	HEALTHCHECK --interval=30s --timeout=10s --start-period=600s --retries=20 \
	    CMD wget -qO- http://localhost:7860/health \
	        | grep -Eq '"ready"[[:space:]]*:[[:space:]]*true' || exit 1

	# Exec form: uvicorn is started directly rather than through /bin/sh -c,
	# so it receives stop signals itself.
	CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]