# syntax=docker/dockerfile:1
# llama.cpp server image bundling the Gemma 3n E2B-it model (Q4_K_M quant).
# NOTE(review): the :server tag is mutable — pin by digest for reproducible builds.
FROM ghcr.io/ggml-org/llama.cpp:server

# Root is needed only to install curl and fetch the model; we drop back to
# UID 1000 (the base image's runtime user) before the entrypoint.
USER root

# Install curl (also used by HEALTHCHECK below) and clean the apt lists in the
# same layer so the package index does not persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
      curl \
    && rm -rf /var/lib/apt/lists/*

# Bake the model into the image. -f makes curl fail the build on an HTTP error
# instead of saving an error page as /models/model.gguf.
RUN mkdir -p /models && \
    curl -fL https://huggingface.co/unsloth/gemma-3n-E2B-it-GGUF/resolve/main/gemma-3n-E2B-it-Q4_K_M.gguf -o /models/model.gguf && \
    chown -R 1000:1000 /models

USER 1000

# llama-server reads its configuration from LLAMA_ARG_* environment variables.
# Server binding + model path.
ENV LLAMA_ARG_MODEL=/models/model.gguf \
    LLAMA_ARG_HOST=0.0.0.0 \
    LLAMA_ARG_PORT=7860

# Inference tuning. CTX_SIZE fixed from 8196 -> 8192 (power-of-two; the
# original value was almost certainly a typo).
ENV LLAMA_ARG_THREADS=8 \
    LLAMA_ARG_BATCH_SIZE=2048 \
    LLAMA_ARG_UBATCH_SIZE=512 \
    LLAMA_ARG_CTX_SIZE=8192 \
    LLAMA_ARG_FLASH_ATTN=true \
    LLAMA_ARG_NO_MMAP=false \
    LLAMA_ARG_MLOCK=true

# Documentation only (does not publish the port); matches LLAMA_ARG_PORT.
EXPOSE 7860

# Probe the server's /health endpoint; generous start-period covers model load.
HEALTHCHECK --interval=30s --timeout=15s --start-period=10s --retries=3 \
  CMD curl -fsS http://localhost:7860/health || exit 1

ENTRYPOINT ["/app/llama-server"]