# NuWave HuggingFace Space — Docker image.
#
# Stack:
# - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat
# - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction
# - bitnet.cpp (microsoft/BitNet) as the inference runtime
# - ng_tract (Rust BTF) for substrate tracts
# - gradio for the UI
#
# Why bitnet.cpp instead of transformers bf16: to actually deliver the
# "CPU-native ternary-weight inference" claim in NuWave's architecture
# docs. Running BitNet through transformers gives BitNet's training
# quality but none of its inference-efficiency benefits. The bitnet.cpp
# runtime uses specialized ternary kernels — ~16× memory reduction and
# major throughput gains on CPU, which is what the whole NuWave thesis
# rests on.
#
# Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding
# collapses into repetition loops ("cycle, cycle, cycle") on enumeration
# tasks. Falcon3-10B-Instruct was properly instruct-tuned before being
# quantized to 1.58-bit, so it inherits Falcon's enumeration capability
# and delivers it through bitnet.cpp's fast kernels.
FROM python:3.12-slim
USER root
# System packages: the compiler toolchain needed to build bitnet.cpp plus
# the shared libraries required at runtime. Listed alphabetically so that
# additions and removals produce clean diffs.
#   build-essential        — make, etc.
#   ca-certificates, curl  — HTTPS downloads
#   clang, g++             — compiling bitnet.cpp's C++ kernels
#   cmake                  — bitnet.cpp build system
#   git                    — cloning the bitnet.cpp repo + huggingface_hub's git backend
#   libgomp1               — onnxruntime
#   libcurl4-openssl-dev, libssl-dev, pkg-config
#                          — NOTE(review): purpose not documented in this file;
#                            presumably required by the native build. Confirm.
# The apt package index is removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        clang \
        cmake \
        curl \
        g++ \
        git \
        libcurl4-openssl-dev \
        libgomp1 \
        libssl-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*
# Hugging Face Spaces convention: run as a non-root account with UID 1000.
RUN useradd --create-home --uid 1000 user
USER user
# The pip installs in this file use --user, so the per-user bin directory
# is put on PATH ahead of the system entries.
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:${PATH}
# Skip .pyc generation and keep Python's stdout/stderr unbuffered so logs
# show up immediately.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
# ── bitnet.cpp clone + build ────────────────────────────────────────
# Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork).
#
# BITNET_REF optionally pins the checkout to a commit, tag or branch for
# reproducible builds, e.g.
#   docker build --build-arg BITNET_REF=<commit-sha> .
# When it is left empty (the default) the build tracks whatever upstream's
# default branch points at, which is what an unpinned `git clone` does.
# After switching refs the submodules are re-synced so they match the
# revision recorded by that ref.
ARG BITNET_REF=""
WORKDIR /home/user/bitnet
RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet \
    && if [ -n "${BITNET_REF}" ]; then \
        git -C /home/user/bitnet checkout "${BITNET_REF}" \
        && git -C /home/user/bitnet submodule update --init --recursive; \
    fi
# Python packages listed in BitNet's own requirements.txt (per the original
# note: gguf, huggingface_hub, numpy, torch-cpu, etc.). Upstream's
# conversion and download scripts rely on them; the conversion path is
# bypassed in this image, but the packages are kept for the ggml tooling.
# Installed per-user because the build runs as the non-root `user` account.
RUN pip install \
        --user \
        --no-cache-dir \
        --requirement /home/user/bitnet/requirements.txt
# Work around an upstream const-correctness bug in ggml-bitnet-mad.cpp.
# Recent compilers (gcc-14 / clang-19) reject
#     int8_t * y_col = y + col * by;
# because `y` is declared `const int8_t *` a few lines earlier. The
# vec-dot functions only READ through y, so const-qualifying y_col is the
# smallest safe fix. Upstream presumably builds with an older compiler
# that only warned here. Drop this step once microsoft/BitNet fixes the
# source.
#
# The grep prints how many patched lines exist, as a visible sanity check
# in the build log. The trailing `|| true` applies to the whole
# `sed && grep` chain, so this step never fails the build on its own; a
# missed patch surfaces later as a compile error.
# NOTE(review): the sed pattern is unanchored, so it would also match a
# line that upstream had already const-qualified.
RUN sed --in-place \
        --expression='s|int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \
        /home/user/bitnet/src/ggml-bitnet-mad.cpp \
    && grep -c "const int8_t \* y_col" /home/user/bitnet/src/ggml-bitnet-mad.cpp \
    || true
# Build llama-cli via setup_env.py — REQUIRED.
#
# setup_env.py does three things:
#   1. codegens bitnet-lut-kernels.h from the quant flag (-q i2_s / tl1 /
#      tl2 selects which ternary lookup table is generated); cmake cannot
#      build without this header,
#   2. configures and builds llama-cli via cmake,
#   3. downloads safetensors, converts them to GGUF and quantizes.
# Only steps 1-2 matter here. Step 3 is currently broken upstream
# (convert-hf-to-gguf-bitnet.py doesn't recognize the BitNetForCausalLM
# architecture) and is not needed, because the pre-converted GGUF is
# published in a separate HF repo.
#
# CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so the LLVM-heavy
# kernel compilation doesn't OOM the HF build runner. setup_env.py
# forwards this env var through to cmake.
ENV CMAKE_BUILD_PARALLEL_LEVEL=2
# Run from the checkout root via WORKDIR rather than `cd` inside RUN.
WORKDIR /home/user/bitnet
# setup_env.py is allowed to fail (`|| true`) because of the broken convert
# step. The real success criterion is that the llama-cli binary exists.
# If it does not, every *.log under the checkout is dumped (this covers
# CMakeError.log, CMakeOutput.log and compile.log) so the build output
# carries the actual error, and the step then fails hard. `IFS= read -r`
# keeps log paths containing backslashes or leading blanks intact.
RUN python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true; \
    if [ ! -f /home/user/bitnet/build/bin/llama-cli ]; then \
        echo "==================== BINARY NOT BUILT ===================="; \
        find /home/user/bitnet -name "*.log" 2>/dev/null | \
            while IFS= read -r logfile; do \
                echo "=== $logfile ==="; \
                cat "$logfile" 2>/dev/null || true; \
            done; \
        echo "==================== END LOGS ===================="; \
        exit 1; \
    fi; \
    echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)"
# Fetch both pre-built GGUF model files straight from their HF repos,
# sidestepping the broken local-conversion path that setup_env.py attempts.
# Only *.gguf files are downloaded from each repo.
# The Python snippet must start immediately after the opening quote: the
# continuation lines are joined into a single line, and leading whitespace
# before the first statement would be an indentation error.
RUN python -c "import huggingface_hub as hub; \
    hub.snapshot_download( \
        repo_id='microsoft/bitnet-b1.58-2B-4T-gguf', \
        local_dir='/home/user/models/bitnet-2b-gguf', \
        allow_patterns=['*.gguf']); \
    hub.snapshot_download( \
        repo_id='tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF', \
        local_dir='/home/user/models/falcon3-10b-gguf', \
        allow_patterns=['*.gguf'])"
# Locations handed to app.py through the environment:
#   BITNET_CPP_BINARY          — the llama-cli executable built from bitnet.cpp
#   BITNET_CHAT_GGUF_DIR       — directory holding the BitNet 2B chat GGUF
#   FALCON_EXTRACTOR_GGUF_DIR  — directory holding the Falcon3-10B extractor GGUF
# The GGUF directories are populated by snapshot_download during the build.
# Per the original note, BitnetCppClient.resolve_gguf locates the .gguf
# file inside each directory with a recursive glob at runtime.
ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \
    BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \
    FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf
# ── Python app deps + repo ──────────────────────────────────────────
WORKDIR /app
# Dependency manifest goes in first, on its own layer, so the pip install
# below stays cached until requirements.txt itself changes.
COPY --chown=user:user requirements.txt ./requirements.txt
RUN pip install \
        --user \
        --no-cache-dir \
        --requirement /app/requirements.txt
# Application source (everything in the build context).
COPY --chown=user:user . ./
# Vendored ng_tract wheel shipped inside the repo. --force-reinstall
# replaces any copy already installed, and --no-deps leaves the
# dependency set installed above untouched.
RUN pip install \
        --user \
        --no-cache-dir \
        --force-reinstall \
        --no-deps \
        /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl
# Gradio listens on all interfaces on port 7860. EXPOSE documents that
# port; it does not publish it by itself.
ENV GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860
EXPOSE 7860
# Exec-form CMD: app.py runs directly, without a wrapping shell.
CMD ["python", "app.py"]