# Source: Hugging Face Space "Executor-Tyrant-Framework/NuWave" (status: Running).
# File size: 6,923 bytes.
# Commit hashes listed alongside this file on the Space's file view:
#   e87f1a6, 4a1690a, c26fe57, 490fa67, 9018cad, 6bf194c, eeba176

# NuWave HuggingFace Space — Docker image.
#
# Stack:
#   - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat
#   - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction
#   - bitnet.cpp (microsoft/BitNet) as the inference runtime
#   - ng_tract (Rust BTF) for substrate tracts
#   - gradio for the UI
#
# Why bitnet.cpp instead of transformers bf16: to actually deliver the
# "CPU-native ternary-weight inference" claim in NuWave's architecture
# docs. Running BitNet through transformers gives BitNet's training
# quality but none of its inference-efficiency benefits. The bitnet.cpp
# runtime uses specialized ternary kernels — ~16× memory reduction and
# major throughput gains on CPU, which is what the whole NuWave thesis
# rests on.
#
# Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding
# collapses into repetition loops ("cycle, cycle, cycle") on enumeration
# tasks. Falcon3-10B-Instruct was properly instruct-tuned before being
# quantized to 1.58-bit — inherits Falcon's enumeration capability,
# delivers through bitnet.cpp's fast kernels.

FROM python:3.12-slim

# apt-get below requires root.
USER root

# OS packages: the bitnet.cpp build toolchain plus shared runtime libraries.
#   build-essential         — make, etc.
#   ca-certificates / curl  — HTTPS downloads
#   clang / g++             — compiling bitnet.cpp's C++ kernels
#   cmake                   — bitnet.cpp build system
#   git                     — cloning bitnet.cpp repo + huggingface_hub's git backend
#   libgomp1                — onnxruntime
#   libcurl4-openssl-dev / libssl-dev / pkg-config
#                           — NOTE(review): purpose not documented here;
#                             presumably headers for the native build — confirm
# The apt package index is deleted in the same layer so it never reaches the image.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        build-essential \
        ca-certificates \
        clang \
        cmake \
        curl \
        g++ \
        git \
        libcurl4-openssl-dev \
        libgomp1 \
        libssl-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces expects the container to run as a non-root account
# with UID 1000; create it (with a home directory) and switch to it.
RUN useradd --create-home --uid 1000 user
USER user

# PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED: no .pyc files, unbuffered output.
# HOME + PATH: `pip install --user` places console scripts in
# /home/user/.local/bin, so that directory is put first on PATH.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# ── bitnet.cpp clone + build ────────────────────────────────────────
# Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork).
#
# BITNET_REF selects the revision to build. It is empty by default, which
# builds whatever upstream's default branch points at when the layer is
# (re)built — i.e. NOT reproducible. For a reproducible image set it to a
# commit SHA or tag, either by editing the default below or with
# `--build-arg BITNET_REF=<sha>`; bump it when a new release is needed.
ARG BITNET_REF=""
WORKDIR /home/user/bitnet
# When a ref is given, check it out detached and re-sync the submodules so
# they match the revision recorded by that commit rather than by HEAD.
RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet \
    && if [ -n "${BITNET_REF}" ]; then \
           git -C /home/user/bitnet checkout --detach "${BITNET_REF}" \
           && git -C /home/user/bitnet submodule update --init --recursive; \
       fi

# Python packages required by BitNet's own tooling (gguf, huggingface_hub,
# numpy, torch-cpu, etc.). Their conversion + download scripts are bypassed
# for the compile, but the packages are kept for the ggml tools.
# Installed per-user because the build runs as the non-root account.
RUN pip install \
        --user \
        --no-cache-dir \
        --requirement /home/user/bitnet/requirements.txt

# Patch upstream const-correctness bug in ggml-bitnet-mad.cpp.
# Modern gcc-14 / clang-19 rejects `int8_t * y_col = y + col * by;`
# because y is declared `const int8_t *` a few lines above. These
# vec-dot functions only READ y, so const-qualifying y_col is the
# safe + minimal fix. Upstream likely builds with an older compiler
# that downgraded this to a warning. Remove this patch if/when
# microsoft/BitNet fixes this in their source.
#
# The pattern also swallows any `const ` already in front of the
# declaration, so the substitution is idempotent: if upstream adds the
# qualifier itself the line is left as-is instead of becoming
# `const const int8_t * ...`, which does not compile as C++.
#
# Failure policy:
#   - sed failing (e.g. the file moved upstream) aborts the build;
#   - zero patched lines afterwards only prints a warning, because the
#     code may simply have been rewritten upstream.
# grep -c prints the number of patched lines into the build log.
RUN patch_target=/home/user/bitnet/src/ggml-bitnet-mad.cpp \
    && sed -i \
        's|\(const \)*int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \
        "$patch_target" \
    && { grep -c 'const int8_t \* y_col = y + col \* by;' "$patch_target" \
         || echo "WARNING: y_col const patch matched nothing in $patch_target — upstream may have changed; review this step"; }

# Build llama-cli through BitNet's setup_env.py — REQUIRED.
#
# setup_env.py does three things:
#   1. codegens bitnet-lut-kernels.h, which cmake can't build without; the
#      quant flag (-q i2_s / tl1 / tl2) drives which ternary lookup table
#      gets generated,
#   2. configures + builds llama-cli via cmake,
#   3. downloads safetensors + converts to GGUF + quantizes.
# Only steps 1-2 are load-bearing for us — step 3 is currently broken
# upstream (convert-hf-to-gguf-bitnet.py doesn't recognize the
# BitNetForCausalLM architecture), and we don't need it anyway because the
# pre-converted GGUF is published at a separate HF repo.
#
# CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so LLVM-heavy
# kernel compilation doesn't OOM the HF build runner. setup_env.py
# forwards this env var through to cmake.
ENV CMAKE_BUILD_PARALLEL_LEVEL=2

# Run setup_env.py from the BitNet checkout (WORKDIR rather than `cd`).
WORKDIR /home/user/bitnet

# setup_env.py is allowed to fail (`|| true`) because of the broken convert
# step. The real success criterion is that the llama-cli binary exists: if
# codegen or compile failed, that's still a hard failure we need to see.
# In that case every *.log under the checkout (CMakeError.log,
# CMakeOutput.log, compile.log, ...) is dumped to the build output so the
# next build iteration has signal instead of just "check details in
# logs/compile.log" (useless in a headless container), then the build aborts.
# `read -r` keeps backslashes in log paths intact.
RUN (python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true) \
    && if [ ! -f /home/user/bitnet/build/bin/llama-cli ]; then \
           echo "==================== BINARY NOT BUILT ===================="; \
           find /home/user/bitnet -name "*.log" 2>/dev/null | \
               while IFS= read -r f; do \
                   echo "=== $f ==="; \
                   cat "$f" 2>/dev/null || true; \
               done; \
           echo "==================== END LOGS ===================="; \
           exit 1; \
       fi \
    && echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)"

# Fetch the two pre-converted GGUF models straight from their HF repos,
# side-stepping the broken local-conversion path that setup_env.py attempts.
# Only *.gguf files are pulled; each repo lands in its own directory under
# /home/user/models. The backslash-newlines are Dockerfile continuations,
# so Python receives the whole script as a single line.
RUN python -c "\
import huggingface_hub as hub; \
gguf_only = ['*.gguf']; \
hub.snapshot_download(repo_id='microsoft/bitnet-b1.58-2B-4T-gguf', \
    local_dir='/home/user/models/bitnet-2b-gguf', \
    allow_patterns=gguf_only); \
hub.snapshot_download(repo_id='tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF', \
    local_dir='/home/user/models/falcon3-10b-gguf', \
    allow_patterns=gguf_only)"

# Locations handed to app.py through the environment:
#   BITNET_CPP_BINARY          — the llama-cli binary built above
#   BITNET_CHAT_GGUF_DIR       — directory holding the BitNet 2B chat GGUF
#   FALCON_EXTRACTOR_GGUF_DIR  — directory holding the Falcon3-10B extractor GGUF
# The GGUF files live in /home/user/models/{bitnet-2b,falcon3-10b}-gguf/
# after the snapshot_download step; BitnetCppClient.resolve_gguf finds the
# .gguf via recursive glob at runtime.
ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \
    BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \
    FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf

# ── Python app deps + repo ──────────────────────────────────────────
WORKDIR /app

# Dependency manifest goes in before the rest of the repo so the (slow)
# pip layer is reused from cache until requirements.txt itself changes.
COPY --chown=user:user requirements.txt /app/requirements.txt
RUN pip install \
        --user \
        --no-cache-dir \
        --requirement /app/requirements.txt

# Application source, owned by the runtime user.
COPY --chown=user:user . /app

# Vendored ng_tract wheel shipped inside the repo. It is force-reinstalled
# without dependency resolution, so it replaces any ng_tract already present
# and leaves the packages installed above untouched.
RUN pip install \
        --user \
        --no-cache-dir \
        --no-deps \
        --force-reinstall \
        /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl

# Gradio listens on all interfaces, port 7860. EXPOSE only documents the
# port; it does not publish it.
EXPOSE 7860
ENV GRADIO_SERVER_PORT=7860 \
    GRADIO_SERVER_NAME=0.0.0.0

# Exec form: python runs as PID 1 and receives stop signals directly.
CMD ["python", "app.py"]