# ============================================================================
# Stage 1: Build llama-cpp-python with CPU optimizations
# ============================================================================
FROM python:3.11-slim AS builder

# Toolchain and headers needed to compile llama-cpp-python from source.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        git \
        libopenblas-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Build llama-cpp-python with OpenBLAS and an explicit AVX2/FMA/F16C baseline.
# GGML_NATIVE=OFF keeps the build from targeting whatever CPU the *build host*
# happens to have (-march=native); the image may run on a different machine,
# and a host-tuned binary can die there with "Illegal instruction".
# The resulting binary requires an AVX2-capable x86-64 CPU at runtime.
ENV CMAKE_ARGS="-DGGML_NATIVE=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DGGML_AVX2=ON -DGGML_FMA=ON -DGGML_F16C=ON"
# Force a source build so CMAKE_ARGS is actually applied.
ENV FORCE_CMAKE=1

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir llama-cpp-python==0.3.4

# ============================================================================
# Stage 2: Runtime image (lean)
# ============================================================================
FROM python:3.11-slim AS runtime

# Shared libraries the compiled extension links against at runtime:
#   libopenblas0 - BLAS backend selected in CMAKE_ARGS
#   libgomp1     - GNU OpenMP runtime; the builder's GCC toolchain provides
#                  OpenMP, so the ggml libraries are expected to need
#                  libgomp.so.1, which this stage does not otherwise install
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
        libopenblas0 \
    && rm -rf /var/lib/apt/lists/*

# Copy the compiled llama-cpp-python (and any console scripts) from the builder.
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Install remaining Python deps (no llama-cpp here - already copied).
# The extras spec is quoted so the shell never treats [standard] as a glob.
RUN pip install --no-cache-dir \
        fastapi==0.115.0 \
        "uvicorn[standard]==0.30.0" \
        huggingface_hub==0.24.0 \
        pydantic==2.8.0

# Create non-root user (HF Spaces requirement) and pre-create /app owned by
# that user, so the working directory is writable no matter how the builder
# assigns ownership to directories created implicitly by WORKDIR.
RUN useradd -m -u 1000 user && \
    install -d -o user -g user /app
USER user
WORKDIR /app

# Copy app code
COPY --chown=user:user app.py .

# HF Spaces persistent storage mounts at /data.
# Create a fallback model directory under the user's home for when it is not
# mounted (runs as `user`, so the directory is owned by `user`).
RUN mkdir -p /home/user/data/models

EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--log-level", "info"]