FROM ubuntu:22.04
# Install required packages (no ninja)
RUN apt-get update && apt-get install -y \
git cmake build-essential curl wget \
libcurl4-openssl-dev libssl-dev && \
rm -rf /var/lib/apt/lists/*
# Clone llama.cpp
RUN git clone https://github.com/ggml-org/llama.cpp /opt/llama.cpp
WORKDIR /opt/llama.cpp
# Build llama-server with CURL and without ninja
RUN mkdir build && cd build && \
    cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CURL=ON && \
make -j$(nproc)
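
# Sanity check (optional): fail the image build early if the server binary
# was not produced. The path assumes the default CMake output layout of the
# build step above.
RUN test -x /opt/llama.cpp/build/bin/llama-server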
# Download the Qwen2 0.5B Instruct model
RUN mkdir /models && \
wget -qO /models/qwen2-0_5b-instruct-q8_0.gguf \
https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF/resolve/main/qwen2-0_5b-instruct-q8_0.gguf
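
# Sanity check (optional): fail the build if the download produced an empty
# file (wget already exits non-zero on an outright failure).
RUN test -s /models/qwen2-0_5b-instruct-q8_0.gguf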
# Expose server port
EXPOSE 7860
# Run llama-server with parallel slots and continuous batching
ENTRYPOINT ["/opt/llama.cpp/build/bin/llama-server", \
"-m", "/models/qwen2-0_5b-instruct-q8_0.gguf", \
"--threads", "4", "--threads-batch", "4", \
"--host", "0.0.0.0", "--port", "7860", \
"-np", "4", "--cont-batching"]