FROM ubuntu:22.04

# Install required packages (no ninja)
RUN apt-get update && apt-get install -y \
    git cmake build-essential curl wget \
    libcurl4-openssl-dev libssl-dev && \
    rm -rf /var/lib/apt/lists/*

# Clone llama.cpp
RUN git clone https://github.com/ggml-org/llama.cpp /opt/llama.cpp
WORKDIR /opt/llama.cpp

# Build llama-server with CURL and without ninja
RUN mkdir build && cd build && \
    cmake .. -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_SERVER=ON -DLLAMA_CURL=ON && \
    make -j$(nproc)
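
# Model fetched at build time; the URL can be overridden with a different
# .gguf file if desired, e.g.
#   docker build --build-arg MODEL_URL=<url-to-another-gguf> .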
ARG MODEL_URL=https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf

# Download the Llama 3.2 1B Instruct model (Q8_0 GGUF)
RUN mkdir /models && \
    wget -qO /models/model.gguf ${MODEL_URL}

# Expose server port
EXPOSE 7860
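# (7860 is also the default application port for Hugging Face Docker Spaces,
# so the image can be deployed there with the default port setting.)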

# Run llama-server with parallel slots and continuous batching
ENTRYPOINT ["/opt/llama.cpp/build/bin/llama-server", \
    "-m", "/models/model.gguf", \
    "--threads", "4", "--threads-batch", "4", \
    "--host", "0.0.0.0", "--port", "7860", \
    "-np", "4", "--cont-batching"]