# Multi-stage build: ship the lean `llama-server` binary inside the tool-rich
# "full" llama.cpp image, plus Python/huggingface_hub for model downloads.
# Listens on 7860 (Hugging Face Spaces convention); started via /app/start.sh.

# Stage 1: Grab the server binary from the official server image
FROM ghcr.io/ggml-org/llama.cpp:server AS source

# Stage 2: Setup your environment using the full image
FROM ghcr.io/ggml-org/llama.cpp:full

# 1. Copy llama-server binary from stage 1
COPY --from=source /app/llama-server /usr/local/bin/llama-server

# 2. Setup environment — route all caches under /app so a non-root
#    runtime user (common on HF Spaces) can write to them.
WORKDIR /app
ENV HF_HOME=/app/.cache
ENV NUMBA_CACHE_DIR=/app/.cache
ENV LLAMA_CACHE=/app/.cache/llama.cpp

# 3. Install Python & tools.
#    NOTE: `apt-get clean` must run BEFORE removing the lists dir;
#    running it after (as before) was a no-op.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
        dos2unix \
        curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# 4. Install Python libraries
#    (--break-system-packages: Debian PEP 668 guard; acceptable in a container)
RUN pip3 install --no-cache-dir --break-system-packages huggingface_hub

# 5. Copy the start script; dos2unix guards against CRLF line endings
#    breaking the shebang when the script is authored on Windows.
COPY start.sh /app/start.sh
RUN dos2unix /app/start.sh && chmod +x /app/start.sh

# 6. Create cache/model directories and open permissions.
#    777 is deliberate: the platform may run the container as an
#    arbitrary non-root UID that must write caches and models here.
RUN mkdir -p /app/.cache/llama.cpp /app/models \
    && chmod -R 777 /app

# 7. Expose port (7860 = Hugging Face Spaces default)
EXPOSE 7860

# 8. Health check — -f fails on HTTP errors, -s keeps logs quiet;
#    60s start period allows for initial model download/load.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -fs http://localhost:7860/health || exit 1

# 9. Start
ENTRYPOINT ["/app/start.sh"]