Spaces:
Sleeping
Sleeping
| # ---- Stage 1 (builder): build prerequisites plus clang-18 from apt.llvm.org ---- | |
| FROM ubuntu:22.04 AS builder | |
| ENV DEBIAN_FRONTEND=noninteractive | |
| # One layer: install the base tools, register the LLVM apt repository and its | |
| # signing key, install clang-18, expose it as clang/clang++, then drop the apt lists. | |
| # ca-certificates is listed explicitly because the HTTPS fetches (wget here, git | |
| # clone in later steps) need a CA bundle and recommended packages are not installed. | |
| # The key is written straight to its destination with `wget -O` instead of being | |
| # piped through tee: in the default /bin/sh -c shell a pipeline reports only the | |
| # last command's status, so a failed download would otherwise go unnoticed here. | |
| RUN apt-get update && apt-get install -y --no-install-recommends \ | |
| ca-certificates \ | |
| cmake \ | |
| build-essential \ | |
| python3 \ | |
| python3-pip \ | |
| git \ | |
| wget \ | |
| software-properties-common \ | |
| gnupg \ | |
| libomp-dev \ | |
| && wget -q -O /etc/apt/trusted.gpg.d/llvm.asc https://apt.llvm.org/llvm-snapshot.gpg.key \ | |
| && add-apt-repository -y "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" \ | |
| && apt-get update && apt-get install -y --no-install-recommends clang-18 \ | |
| && ln -s /usr/bin/clang-18 /usr/bin/clang \ | |
| && ln -s /usr/bin/clang++-18 /usr/bin/clang++ \ | |
| && rm -rf /var/lib/apt/lists/* | |
| WORKDIR /build | |
| # Check out BitNet, including its submodules, directly into /build. | |
| RUN git clone --recurse-submodules https://github.com/microsoft/BitNet.git . | |
| # Install the gguf Python package from the vendored llama.cpp tree. | |
| RUN python3 -m pip install --no-cache-dir 3rdparty/llama.cpp/gguf-py | |
| # Source patch: declare y_col as a pointer to const int8_t. | |
| # NOTE(review): presumably required for the file to compile with clang-18 - confirm. | |
| RUN sed -i 's/int8_t \* y_col = y + col \* by;/const int8_t * y_col = y + col * by;/' src/ggml-bitnet-mad.cpp | |
| # Run the TL2 code generator with the bitnet_b1_58-3B parameters. | |
| # NOTE(review): the configure step below passes BITNET_X86_TL2=OFF - confirm this | |
| # generator step is still needed with that setting. | |
| RUN python3 utils/codegen_tl2.py --model bitnet_b1_58-3B --BM 160,320,320 --BK 96,96,96 --bm 32,32,32 | |
| # Configure a Release build that uses the clang/clang++ symlinks. | |
| RUN cmake -S . -B build \ | |
| -DCMAKE_BUILD_TYPE=Release \ | |
| -DCMAKE_C_COMPILER=clang \ | |
| -DCMAKE_CXX_COMPILER=clang++ \ | |
| -DBITNET_X86_TL2=OFF | |
| # Build only the llama-server target, with two parallel jobs. | |
| RUN cmake --build build --config Release --parallel 2 --target llama-server | |
| # ---- Stage 2 (runtime): server binary and its shared libraries only ---- | |
| FROM ubuntu:22.04 | |
| ENV DEBIAN_FRONTEND=noninteractive | |
| # libgomp1 is the only extra package installed in the runtime image. | |
| RUN apt-get update \ | |
| && apt-get install -y --no-install-recommends libgomp1 \ | |
| && rm -rf /var/lib/apt/lists/* | |
| WORKDIR /app | |
| # Server executable produced by the builder stage. | |
| COPY --from=builder /build/build/bin/llama-server ./build/bin/llama-server | |
| # Shared libraries from the builder stage; found at run time via LD_LIBRARY_PATH. | |
| COPY --from=builder \ | |
| /build/build/3rdparty/llama.cpp/src/libllama.so \ | |
| /build/build/3rdparty/llama.cpp/ggml/src/libggml.so \ | |
| ./build/lib/ | |
| ENV LD_LIBRARY_PATH=/app/build/lib | |
| EXPOSE 8080 | |
| # ENTRYPOINT fixes the executable and listen address; CMD supplies default | |
| # arguments that are appended to it and can be replaced at `docker run`. | |
| # NOTE(review): no model path is passed and no model file is copied in this | |
| # stage - confirm how the model is supplied at run time. | |
| ENTRYPOINT ["./build/bin/llama-server", "--host", "0.0.0.0", "--port", "8080"] | |
| CMD ["-t", "2", "-tb", "2", "--threads-http", "2", "--mlock", "--parallel", "2"] | |