# Two-stage build of the BitNet `llama-server` binary.
#   1. builder: installs the toolchain (clang-18 from apt.llvm.org), clones
#      microsoft/BitNet, runs the TL2 code generator and compiles llama-server.
#   2. runtime: ships only the server binary and the two shared libraries
#      copied out of the builder, and runs them as an unprivileged user.

# ---------------------------------------------------------------------------
# Stage 1: builder
# ---------------------------------------------------------------------------
FROM ubuntu:22.04 AS builder

# Build-time only. ARG (unlike ENV) is not persisted into the image's
# environment, but is still exported to every RUN in this stage.
ARG DEBIAN_FRONTEND=noninteractive

# Toolchain and helpers. ca-certificates is listed explicitly because the
# HTTPS downloads below (wget, git clone) depend on it and recommended
# packages are disabled.
# The LLVM key is written straight to its destination with `wget -O` rather
# than piped through `tee`: /bin/sh has no pipefail, so a failed download in a
# pipeline would go unnoticed and the key would be echoed into the build log.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        gnupg \
        libomp-dev \
        python3 \
        python3-pip \
        software-properties-common \
        wget \
    && wget -qO /etc/apt/trusted.gpg.d/llvm.asc https://apt.llvm.org/llvm-snapshot.gpg.key \
    && add-apt-repository -y "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" \
    && apt-get update && apt-get install -y --no-install-recommends clang-18 \
    && ln -sf /usr/bin/clang-18 /usr/bin/clang \
    && ln -sf /usr/bin/clang++-18 /usr/bin/clang++ \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# NOTE(review): this clones whatever the default branch currently points at,
# so rebuilds are not reproducible. Consider pinning a commit or tag.
RUN git clone --recursive https://github.com/microsoft/BitNet.git .

# Install the gguf Python package from the llama.cpp submodule checkout.
RUN pip3 install --no-cache-dir 3rdparty/llama.cpp/gguf-py

# Source patch: add a `const` qualifier to the `y_col` pointer declaration in
# src/ggml-bitnet-mad.cpp.
# NOTE(review): presumably needed for the file to compile with clang-18;
# confirm and drop once fixed upstream.
RUN sed -i 's/int8_t \* y_col = y + col \* by;/const int8_t * y_col = y + col * by;/' src/ggml-bitnet-mad.cpp

# Run the TL2 code generator for the bitnet_b1_58-3B model with the given
# tiling parameters.
RUN python3 utils/codegen_tl2.py \
        --model bitnet_b1_58-3B \
        --BM 160,320,320 \
        --BK 96,96,96 \
        --bm 32,32,32

# Configure with clang (TL2 kernels disabled) and build only the
# llama-server target, using two parallel jobs.
RUN cmake -B build \
        -DBITNET_X86_TL2=OFF \
        -DCMAKE_C_COMPILER=clang \
        -DCMAKE_CXX_COMPILER=clang++ \
        -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -j2 --target llama-server

# ---------------------------------------------------------------------------
# Stage 2: runtime
# ---------------------------------------------------------------------------
FROM ubuntu:22.04

# Build-time only; deliberately not baked into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime library plus a dedicated unprivileged account with a stable numeric
# UID/GID, so orchestrators can verify the container does not run as root.
# NOTE(review): the builder compiles with clang and libomp-dev; confirm that
# libgomp1 is the OpenMP runtime the binary actually links against.
RUN apt-get update && apt-get install -y --no-install-recommends \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd --gid 10001 app \
    && useradd --uid 10001 --gid app --no-create-home --home-dir /app \
        --shell /usr/sbin/nologin app

WORKDIR /app

# Only the server binary and its two shared libraries leave the builder.
COPY --from=builder /build/build/bin/llama-server ./build/bin/llama-server
COPY --from=builder /build/build/3rdparty/llama.cpp/src/libllama.so ./build/lib/
COPY --from=builder /build/build/3rdparty/llama.cpp/ggml/src/libggml.so ./build/lib/

# Let the dynamic loader find the copied shared libraries.
ENV LD_LIBRARY_PATH=/app/build/lib

# Documents the port passed to the server in ENTRYPOINT.
EXPOSE 8080

# Drop root for the running server. Files mounted into the container (e.g.
# model files) must be readable by UID 10001.
USER app

# ENTRYPOINT fixes the binary and listen address; the path is absolute so it
# does not depend on the working directory at `docker run` time.
# CMD holds the default tuning flags and can be replaced at `docker run`.
ENTRYPOINT ["/app/build/bin/llama-server", "--host", "0.0.0.0", "--port", "8080"]
CMD ["-t", "2", "-tb", "2", "--threads-http", "2", "--mlock", "--parallel", "2"]