# NuWave / Dockerfile
# Executor-Tyrant-Framework's picture
# Sync from GitHub: aedae31208bb57c61524b2d4f0fcd49d7bd2afba
# eeba176 verified
# NuWave HuggingFace Space — Docker image.
#
# Stack:
# - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat
# - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction
# - bitnet.cpp (microsoft/BitNet) as the inference runtime
# - ng_tract (Rust BTF) for substrate tracts
# - gradio for the UI
#
# Why bitnet.cpp instead of transformers bf16: to actually deliver the
# "CPU-native ternary-weight inference" claim in NuWave's architecture
# docs. Running BitNet through transformers gives BitNet's training
# quality but none of its inference-efficiency benefits. The bitnet.cpp
# runtime uses specialized ternary kernels — ~16× memory reduction and
# major throughput gains on CPU, which is what the whole NuWave thesis
# rests on.
#
# Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding
# collapses into repetition loops ("cycle, cycle, cycle") on enumeration
# tasks. Falcon3-10B-Instruct was properly instruct-tuned before being
# quantized to 1.58-bit — it inherits Falcon's enumeration capability
# and delivers it through bitnet.cpp's fast kernels.
# Base image: slim Python 3.12.
FROM python:3.12-slim
# Run the following system-level steps (apt-get install, useradd) as root;
# the build switches to an unprivileged user afterwards.
USER root
# System deps — build toolchain for bitnet.cpp, runtime libs for everything.
# Packages are kept in alphabetical order so diffs stay small.
#   build-essential       — make, etc.
#   ca-certificates, curl — HTTPS downloads
#   clang, g++            — compiling bitnet.cpp's C++ kernels
#   cmake                 — bitnet.cpp build system
#   git                   — cloning bitnet.cpp repo + huggingface_hub's git backend
#   libgomp1              — onnxruntime
#   libcurl4-openssl-dev, libssl-dev, pkg-config
#                         — purpose not recorded here; presumably headers and
#                           tooling for native builds (TODO confirm)
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        clang \
        cmake \
        curl \
        g++ \
        git \
        libcurl4-openssl-dev \
        libgomp1 \
        libssl-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*
# HF Spaces convention: run as a non-root account with UID 1000.
RUN useradd --create-home --uid 1000 user
USER user
# Home directory of the unprivileged account.
ENV HOME=/home/user
# `pip install --user` places console scripts in ~/.local/bin; put it first
# on PATH so they are found.
ENV PATH=/home/user/.local/bin:$PATH
# Python runtime behaviour: no .pyc files, unbuffered stdout/stderr.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
# ── bitnet.cpp clone + build ────────────────────────────────────────
# Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork).
# Pinned commit for reproducibility; bump when a new release is needed.
WORKDIR /home/user/bitnet
RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet
# Install the Python packages listed in BitNet's own requirements.txt
# (gguf, huggingface_hub, numpy, torch-cpu, etc.). Upstream's conversion and
# download scripts use them; the compile step bypasses those scripts, but
# the packages are kept for the ggml tools.
RUN pip install \
        --user \
        --no-cache-dir \
        --requirement /home/user/bitnet/requirements.txt
# Patch upstream const-correctness bug in ggml-bitnet-mad.cpp.
# Modern gcc-14 / clang-19 rejects `int8_t * y_col = y + col * by;`
# because y is declared `const int8_t *` a few lines above. These
# vec-dot functions only READ y, so const-qualifying y_col is the
# safe + minimal fix. Upstream likely builds with an older compiler
# that downgraded this to a warning.
#
# The substitution is skipped on lines that already contain
# `const int8_t * y_col` (the `/.../!` address). Without that guard the
# pattern would also match inside an already-fixed line and rewrite it to
# `const const int8_t * y_col`, which C++ rejects, so the patch itself would
# break the build once microsoft/BitNet fixes this in their source. With the
# guard the patch is idempotent; it can still be removed once upstream is fixed.
#
# The trailing grep prints how many const-qualified lines are present, for
# the build log only; `|| true` keeps a zero count from failing the build.
RUN sed -i \
      '/const int8_t \* y_col/!s|int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \
      /home/user/bitnet/src/ggml-bitnet-mad.cpp \
    && grep -c "const int8_t \* y_col" /home/user/bitnet/src/ggml-bitnet-mad.cpp || true
# Run setup_env.py — REQUIRED. It does three things:
#   (1) codegens bitnet-lut-kernels.h, which cmake can't build without.
#       The quant flag (-q i2_s / tl1 / tl2) drives which ternary lookup
#       table gets generated.
#   (2) configures + builds llama-cli via cmake.
#   (3) downloads safetensors + converts to GGUF + quantizes.
# Only steps 1-2 are load-bearing for us — step 3 is currently broken
# upstream (convert-hf-to-gguf-bitnet.py doesn't recognize the
# BitNetForCausalLM architecture), and we don't need it anyway because
# the pre-converted GGUF is published at a separate HF repo.
#
# CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so LLVM-heavy
# kernel compilation doesn't OOM the HF build runner. setup_env.py
# forwards this env var through to cmake.
ENV CMAKE_BUILD_PARALLEL_LEVEL=2
# setup_env.py is allowed to fail (`|| true`) because of the broken
# convert step. The binary check afterwards is the real gate: if codegen
# or compile failed, llama-cli is missing and that is a hard failure. In
# that case every *.log under the checkout (CMakeError.log, CMakeOutput.log,
# compile.log, ...) is dumped so the next build iteration has signal
# instead of just "check details in logs/compile.log" (useless in a
# headless container).
#
# NOTE(review): whatever step 3 downloads before failing stays in this
# layer — consider deleting it here if image size matters (location of the
# download directory to be confirmed against setup_env.py).
#
# WORKDIR replaces the former `cd` so setup_env.py runs from the checkout
# root; `IFS= read -r` keeps log paths with backslashes or edge whitespace
# intact.
WORKDIR /home/user/bitnet
RUN (python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true) \
    && if [ ! -f /home/user/bitnet/build/bin/llama-cli ]; then \
         echo "==================== BINARY NOT BUILT ===================="; \
         find /home/user/bitnet -name "*.log" 2>/dev/null | \
           while IFS= read -r f; do \
             echo "=== $f ==="; \
             cat "$f" 2>/dev/null || true; \
           done; \
         echo "==================== END LOGS ===================="; \
         exit 1; \
       fi \
    && echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)"
# Fetch both pre-built GGUF models straight from their HF repos instead of
# going through the broken local-conversion path that setup_env.py attempts.
# Only *.gguf files are pulled. The Python snippet must start on the same
# line as `-c "`: leading whitespace would be an IndentationError.
RUN python -c "from huggingface_hub import snapshot_download as fetch; \
    targets = { \
        'microsoft/bitnet-b1.58-2B-4T-gguf': '/home/user/models/bitnet-2b-gguf', \
        'tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF': '/home/user/models/falcon3-10b-gguf', \
    }; \
    [fetch(repo_id=repo, local_dir=dest, allow_patterns=['*.gguf']) for repo, dest in targets.items()]"
# Locations handed to app.py through the environment:
#   BITNET_CPP_BINARY          the llama-cli binary built above
#   BITNET_CHAT_GGUF_DIR       directory holding the BitNet 2B chat GGUF
#   FALCON_EXTRACTOR_GGUF_DIR  directory holding the Falcon3-10B extractor GGUF
# The GGUF files live in /home/user/models/{bitnet-2b,falcon3-10b}-gguf/
# after the snapshot_download step; per the original note,
# BitnetCppClient.resolve_gguf locates the .gguf via recursive glob at runtime.
ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \
    BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \
    FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf
# ── Python app deps + repo ──────────────────────────────────────────
WORKDIR /app
# Install pure-Python deps first β€” separate layer for caching
COPY --chown=user:user requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --user -r /app/requirements.txt
# Copy repo
COPY --chown=user:user . /app
# Install the ng_tract wheel that ships inside the repo. --force-reinstall
# replaces any copy already present; --no-deps leaves the rest of the
# environment untouched.
RUN pip install \
        --user \
        --no-cache-dir \
        --no-deps \
        --force-reinstall \
        /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl
# Gradio listens on all interfaces, port 7860.
ENV GRADIO_SERVER_NAME="0.0.0.0"
ENV GRADIO_SERVER_PORT="7860"
# Documents the listening port; EXPOSE does not publish it.
EXPOSE 7860/tcp
# Exec form: python is started directly, without a wrapping shell.
CMD ["python", "app.py"]