| # NuWave HuggingFace Space — Docker image. | |
| # | |
| # Stack: | |
| # - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat | |
| # - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction | |
| # - bitnet.cpp (microsoft/BitNet) as the inference runtime | |
| # - ng_tract (Rust BTF) for substrate tracts | |
| # - gradio for the UI | |
| # | |
| # Why bitnet.cpp instead of transformers bf16: to actually deliver the | |
| # "CPU-native ternary-weight inference" claim in NuWave's architecture | |
| # docs. Running BitNet through transformers gives BitNet's training | |
| # quality but none of its inference-efficiency benefits. The bitnet.cpp | |
| # runtime uses specialized ternary kernels — ~16× memory reduction and | |
| # major throughput gains on CPU, which is what the whole NuWave thesis | |
| # rests on. | |
| # | |
| # Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding | |
| # collapses into repetition loops ("cycle, cycle, cycle") on enumeration | |
| # tasks. Falcon3-10B-Instruct was properly instruct-tuned before being | |
| # quantized to 1.58-bit — so it inherits Falcon's enumeration capability | |
| # and still delivers it through bitnet.cpp's fast kernels. | |
| FROM python:3.12-slim | |
| # Root is required for the apt-get step below. | |
| USER root | |
| # System packages — compiler toolchain for bitnet.cpp plus runtime libraries. | |
| # Listed alphabetically so additions diff cleanly. | |
| # build-essential — make, etc. | |
| # ca-certificates, curl — HTTPS downloads | |
| # clang, g++ — compiling bitnet.cpp's C++ kernels | |
| # cmake — bitnet.cpp build system | |
| # git — cloning the bitnet.cpp repo + huggingface_hub's git backend | |
| # libgomp1 — onnxruntime | |
| # libcurl4-openssl-dev, libssl-dev, pkg-config — | |
| # NOTE(review): purpose not recorded in this file; presumably | |
| # headers/tooling for the native build — confirm before removing. | |
| RUN apt-get update \ | |
|     && apt-get install -y --no-install-recommends \ | |
|         build-essential \ | |
|         ca-certificates \ | |
|         clang \ | |
|         cmake \ | |
|         curl \ | |
|         g++ \ | |
|         git \ | |
|         libcurl4-openssl-dev \ | |
|         libgomp1 \ | |
|         libssl-dev \ | |
|         pkg-config \ | |
|     && rm -rf /var/lib/apt/lists/* | |
| # Hugging Face Spaces convention: run as a non-root user with UID 1000. | |
| RUN useradd --create-home --uid 1000 user | |
| USER user | |
| # HOME points at the new user's home directory, and ~/.local/bin is put | |
| # first on PATH so console scripts from `pip install --user` are found. | |
| # The two PYTHON* switches turn off .pyc writing and output buffering. | |
| ENV PYTHONDONTWRITEBYTECODE=1 \ | |
|     PYTHONUNBUFFERED=1 \ | |
|     HOME=/home/user \ | |
|     PATH=/home/user/.local/bin:$PATH | |
| # ── bitnet.cpp clone + build ──────────────────────────────────────── | |
| # Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork). | |
| # | |
| # By default this tracks the tip of the upstream default branch, so | |
| # nothing is actually pinned. For a reproducible build, set BITNET_REF to | |
| # a commit SHA or tag (edit the default below, or pass | |
| # `--build-arg BITNET_REF=<sha>`); the checkout and submodules are then | |
| # moved to that revision. Leaving it empty keeps the plain-clone behaviour. | |
| ARG BITNET_REF="" | |
| WORKDIR /home/user/bitnet | |
| RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet \ | |
|     && if [ -n "$BITNET_REF" ]; then \ | |
|         git -C /home/user/bitnet checkout "$BITNET_REF" \ | |
|         && git -C /home/user/bitnet submodule update --init --recursive; \ | |
|     fi | |
| # Python build/utility dependencies shipped with BitNet (gguf, | |
| # huggingface_hub, numpy, torch-cpu, etc.). Their conversion + download | |
| # scripts use these; we are about to bypass those scripts for the | |
| # compile, but keep the packages for the ggml tools. | |
| RUN pip install --user --no-cache-dir \ | |
|     --requirement /home/user/bitnet/requirements.txt | |
| # Patch upstream const-correctness bug in ggml-bitnet-mad.cpp. | |
| # Modern gcc-14 / clang-19 rejects `int8_t * y_col = y + col * by;` | |
| # because y is declared `const int8_t *` a few lines above. These | |
| # vec-dot functions only READ y, so const-qualifying y_col is the | |
| # safe + minimal fix. Upstream likely builds with an older compiler | |
| # that downgraded this to a warning. | |
| # | |
| # The match takes an optional leading `const ` so the substitution is | |
| # idempotent: a line that is already const-qualified (for example after | |
| # upstream fixes it) is rewritten to itself rather than to | |
| # `const const int8_t * ...`, which would not compile. Remove this patch | |
| # if/when microsoft/BitNet fixes this in their source. | |
| # | |
| # The trailing grep only prints how many patched lines exist, for the | |
| # build log. `|| true` covers the whole `sed && grep` chain, so this | |
| # step is best-effort and never fails the build by itself. | |
| RUN sed -i \ | |
|     's|\(const \)\{0,1\}int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \ | |
|     /home/user/bitnet/src/ggml-bitnet-mad.cpp \ | |
|     && grep -c "const int8_t \* y_col" /home/user/bitnet/src/ggml-bitnet-mad.cpp || true | |
| # Run setup_env.py — REQUIRED. It does three things: | |
| #   (1) codegens bitnet-lut-kernels.h, which cmake can't build without; | |
| #       the quant flag (-q i2_s / tl1 / tl2) drives which ternary lookup | |
| #       table gets generated, | |
| #   (2) configures + builds llama-cli via cmake, | |
| #   (3) downloads safetensors + converts to GGUF + quantizes. | |
| # Only steps 1-2 are load-bearing for us — step 3 is currently broken | |
| # upstream (convert-hf-to-gguf-bitnet.py doesn't recognize the | |
| # BitNetForCausalLM architecture), and we don't need it anyway because | |
| # the pre-converted GGUF is published at a separate HF repo. | |
| # | |
| # setup_env.py is therefore allowed to fail (`|| true`), and the build | |
| # is judged solely on whether the llama-cli binary exists afterwards. | |
| # If it does not, every *.log file under the checkout (CMakeError.log, | |
| # CMakeOutput.log, logs/compile.log, ...) is dumped to the build output | |
| # so the next build iteration has signal instead of just "check details | |
| # in logs/compile.log" (useless in a headless container), and the step | |
| # exits non-zero. | |
| # | |
| # CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so LLVM-heavy | |
| # kernel compilation doesn't OOM the HF build runner. The variable is | |
| # inherited by the cmake processes that setup_env.py spawns. | |
| # NOTE(review): this relies on setup_env.py building via `cmake --build` | |
| # — confirm against the upstream script. | |
| ENV CMAKE_BUILD_PARALLEL_LEVEL=2 | |
| # setup_env.py is run from the repository root; set it explicitly rather | |
| # than using `cd` inside RUN. | |
| WORKDIR /home/user/bitnet | |
| RUN (python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true) \ | |
|     && if [ ! -f /home/user/bitnet/build/bin/llama-cli ]; then \ | |
|         echo "==================== BINARY NOT BUILT ====================" ; \ | |
|         find /home/user/bitnet -name "*.log" 2>/dev/null | \ | |
|             while IFS= read -r f; do echo "=== $f ===" && cat "$f" 2>/dev/null || true; done ; \ | |
|         echo "==================== END LOGS ====================" ; \ | |
|         exit 1 ; \ | |
|     fi \ | |
|     && echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)" | |
| # Fetch both pre-built GGUF model sets straight from their HF repos, | |
| # skipping the broken local-conversion path that setup_env.py attempts. | |
| # Only *.gguf files are pulled. The comprehension runs the downloads in | |
| # the listed order (BitNet chat model first, then the Falcon3 extractor). | |
| RUN python -c "\ | |
| from huggingface_hub import snapshot_download as fetch; \ | |
| [fetch(repo, local_dir=dest, allow_patterns=['*.gguf']) for repo, dest in ( \ | |
|     ('microsoft/bitnet-b1.58-2B-4T-gguf', '/home/user/models/bitnet-2b-gguf'), \ | |
|     ('tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF', '/home/user/models/falcon3-10b-gguf'), \ | |
| )]" | |
| # Locations handed to app.py through the environment: the llama-cli | |
| # binary, and the two directories that snapshot_download populated | |
| # (/home/user/models/{bitnet-2b,falcon3-10b}-gguf/). | |
| # BitnetCppClient.resolve_gguf finds the .gguf via recursive glob at | |
| # runtime, so only directories are given here. | |
| ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \ | |
|     BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \ | |
|     FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf | |
| # ── Python app deps + repo ────────────────────────────────────────── | |
| WORKDIR /app | |
| # The pure-Python dependency manifest is copied and installed ahead of | |
| # the source tree, so this layer stays cached when only app code changes. | |
| COPY --chown=user:user requirements.txt /app/requirements.txt | |
| RUN pip install --user --no-cache-dir --requirement /app/requirements.txt | |
| # Rest of the repository. | |
| COPY --chown=user:user . /app | |
| # Vendored ng_tract wheel: installed from the copied tree, forced over | |
| # any copy already present, without resolving its dependencies. | |
| RUN pip install --user --no-cache-dir --no-deps --force-reinstall \ | |
|     /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl | |
| # Gradio binds to all interfaces on port 7860. | |
| EXPOSE 7860 | |
| ENV GRADIO_SERVER_PORT=7860 \ | |
|     GRADIO_SERVER_NAME=0.0.0.0 | |
| # Exec-form CMD: python is started directly, without a wrapping shell. | |
| CMD ["python", "app.py"] | |