# syntax=docker/dockerfile:1
# CPU-only FastAPI + llama-cpp-python image with the GGUF model baked in
# (Hugging Face Spaces style: no volume, model fetched at build time).
FROM python:3.11-slim

ENV PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860

# Minimal runtime libs (no compilers): OpenMP + OpenBLAS shared objects
# needed by the compiled llama.cpp extension at run time.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
        libopenblas0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# ---- Python deps (non-llama first for cache) ----
COPY requirements.txt .
RUN python -m pip install --upgrade pip setuptools wheel \
    && pip install --no-cache-dir -r requirements.txt

# ---- llama-cpp-python: compile from source, then purge the toolchain in the
# SAME layer so build-essential/cmake never persist in the final image. ----
# NOTE: llama.cpp renamed its CMake options (LLAMA_* -> GGML_*); the llama.cpp
# vendored in 0.3.0 uses the GGML_ prefix, so the old LLAMA_* names alone are
# silently ignored. Pass both spellings to keep BLAS/CUDA off either way.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential cmake \
    && CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_CUDA=OFF -DLLAMA_BLAS=OFF -DLLAMA_CUBLAS=OFF" \
       pip install --no-cache-dir "llama-cpp-python==0.3.0" \
    && apt-get purge -y --auto-remove build-essential cmake \
    && rm -rf /var/lib/apt/lists/*

# ---- Model path is configurable via env ----
ENV MODEL_PATH=/app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf

# ---- Pre-download & copy model to MODEL_PATH ----
# Done BEFORE `COPY . .` so editing application source does not invalidate
# this layer and re-download a multi-GB file on every rebuild.
# (`local_dir_use_symlinks` is deprecated/no-op with local_dir=None — dropped.)
RUN python - <<'PY'
from huggingface_hub import hf_hub_download
import os, shutil
dest = os.environ.get("MODEL_PATH", "/app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf")
os.makedirs(os.path.dirname(dest), exist_ok=True)
p = hf_hub_download(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
)
shutil.copy2(p, dest)
print("Model copied to:", dest)
PY

# ---- App code (most frequently changing: keep last for cache reuse) ----
COPY . .

# ---- Drop root: 7860 is unprivileged, so non-root bind works everywhere ----
RUN groupadd --system app \
    && useradd --system --gid app --home /app app \
    && chown -R app:app /app
USER app

EXPOSE 7860
# `exec` makes uvicorn PID 1 so it receives SIGTERM from `docker stop`;
# the shell wrapper exists only to expand ${PORT} at run time.
CMD ["bash", "-c", "exec uvicorn app:app --host 0.0.0.0 --port ${PORT}"]