# syntax=docker/dockerfile:1
# Builds llama.cpp's llama-server from source on Ubuntu 22.04, bakes a GGUF
# model into the image, and serves it on port 7860 with 4 parallel slots.
FROM ubuntu:22.04

# Build/runtime deps for llama.cpp (no ninja — the default Makefile generator
# is used below). --no-install-recommends keeps the layer small; the apt list
# cache is removed in the same layer so it never persists in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        libcurl4-openssl-dev \
        libssl-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Shallow clone is enough for a one-off build.
# NOTE(review): unpinned HEAD — consider `--branch <tag>` or checking out a
# commit SHA for reproducible builds.
RUN git clone --depth 1 https://github.com/ggml-org/llama.cpp /opt/llama.cpp
WORKDIR /opt/llama.cpp

# Configure and build out-of-tree without `cd` (DL3003): -B creates the build
# dir, `cmake --build` drives the generator. LLAMA_CURL=ON enables the server's
# HTTP(S) model-download support via libcurl.
# NOTE(review): -DLLAMA_SERVER=ON may be a stale option name in current
# llama.cpp (server builds by default; older trees used LLAMA_BUILD_SERVER) —
# harmless if unknown, but verify against the checked-out revision.
RUN cmake -B build -DLLAMA_SERVER=ON -DLLAMA_CURL=ON && \
    cmake --build build -j"$(nproc)"

# Model baked into the image. Override at build time with
# `--build-arg MODEL_URL=...`; default is Gemma 3n E2B-it Q8_0 (GGUF).
ARG MODEL_URL=https://huggingface.co/ggml-org/gemma-3n-E2B-it-GGUF/resolve/main/gemma-3n-E2B-it-Q8_0.gguf

# Download the model. wget exits non-zero on HTTP errors, so a bad URL fails
# the build here instead of producing an image with an empty model file.
RUN mkdir -p /models && \
    wget -qO /models/model.gguf "${MODEL_URL}"

# Documentation only — publish with `docker run -p 7860:7860`.
EXPOSE 7860

# Exec-form ENTRYPOINT: llama-server is PID 1 and receives SIGTERM directly.
# -np 4 + --cont-batching: 4 parallel slots with continuous batching.
# NOTE(review): --mlock generally needs CAP_IPC_LOCK (or a raised memlock
# rlimit); this image runs as root, which is why it works — adding a USER
# would require granting that capability at `docker run`.
ENTRYPOINT ["/opt/llama.cpp/build/bin/llama-server", \
            "-m", "/models/model.gguf", \
            "--threads", "4", "--threads-batch", "4", \
            "--host", "0.0.0.0", "--port", "7860", \
            "-np", "4", "--cont-batching", \
            "--no-mmap", "--mlock", \
            "--ctx-size", "2048"]