FROM ubuntu:22.04
# Install required packages (no ninja)
RUN apt-get update && apt-get install -y \
git cmake build-essential curl wget \
libcurl4-openssl-dev libssl-dev && \
rm -rf /var/lib/apt/lists/*
# Clone llama.cpp
RUN git clone https://github.com/ggml-org/llama.cpp /opt/llama.cpp
WORKDIR /opt/llama.cpp
# Build llama-server with CURL and without ninja
RUN mkdir build && cd build && \
    cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CURL=ON && \
make -j$(nproc)
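
# Sanity check (optional): fail the image build early if the server binary
# was not produced. The path assumes the default CMake output layout of the
# build step above.
RUN test -x /opt/llama.cpp/build/bin/llama-server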
# Download the Qwen2 0.5B Instruct model
RUN mkdir /models && \
wget -qO /models/qwen2-0_5b-instruct-q8_0.gguf \
https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF/resolve/main/qwen2-0_5b-instruct-q8_0.gguf
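
# Sanity check (optional): fail the build if the download produced an empty
# file (wget already exits non-zero on an outright failure).
RUN test -s /models/qwen2-0_5b-instruct-q8_0.gguf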
# Expose server port
EXPOSE 7860
# Run llama-server with parallel slots and continuous batching
ENTRYPOINT ["/opt/llama.cpp/build/bin/llama-server", \
"-m", "/models/qwen2-0_5b-instruct-q8_0.gguf", \
"--threads", "4", "--threads-batch", "4", \
"--host", "0.0.0.0", "--port", "7860", \
"-np", "4", "--cont-batching"]