File size: 6,923 Bytes
e87f1a6
 
4a1690a
 
 
 
 
 
e87f1a6
4a1690a
 
 
 
 
 
 
 
 
 
 
 
 
e87f1a6
 
 
4a1690a
 
 
 
 
 
 
 
 
e87f1a6
 
 
c26fe57
4a1690a
 
 
 
 
490fa67
 
 
e87f1a6
 
4a1690a
e87f1a6
 
 
 
 
 
 
4a1690a
 
 
 
 
 
490fa67
 
 
4a1690a
 
9018cad
 
 
 
 
 
 
 
 
 
 
 
6bf194c
 
 
 
 
 
 
 
 
 
 
 
eeba176
 
 
 
 
 
 
 
 
 
 
4a1690a
eeba176
 
 
6bf194c
 
 
eeba176
 
4a1690a
eeba176
 
4a1690a
 
eeba176
 
 
4a1690a
 
 
 
eeba176
 
 
 
4a1690a
eeba176
4a1690a
 
 
e87f1a6
 
4a1690a
e87f1a6
 
 
4a1690a
e87f1a6
 
4a1690a
e87f1a6
 
 
4a1690a
e87f1a6
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# NuWave HuggingFace Space — Docker image.
#
# Stack:
#   - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat
#   - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction
#   - bitnet.cpp (microsoft/BitNet) as the inference runtime
#   - ng_tract (Rust BTF) for substrate tracts
#   - gradio for the UI
#
# Why bitnet.cpp instead of transformers bf16: to actually deliver the
# "CPU-native ternary-weight inference" claim in NuWave's architecture
# docs. Running BitNet through transformers gives BitNet's training
# quality but none of its inference-efficiency benefits. The bitnet.cpp
# runtime uses specialized ternary kernels — ~16× memory reduction and
# major throughput gains on CPU, which is what the whole NuWave thesis
# rests on.
#
# Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding
# collapses into repetition loops ("cycle, cycle, cycle") on enumeration
# tasks. Falcon3-10B-Instruct was properly instruct-tuned before being
# quantized to 1.58-bit — inherits Falcon's enumeration capability,
# delivers through bitnet.cpp's fast kernels.

FROM python:3.12-slim

# Package installation below has to run as root.
USER root

# OS packages: build toolchain for bitnet.cpp plus runtime libraries.
# Listed alphabetically so additions/removals produce small diffs.
#   build-essential        — make, etc.
#   ca-certificates, curl  — HTTPS downloads
#   clang, g++             — compiling bitnet.cpp's C++ kernels
#   cmake                  — bitnet.cpp build system
#   git                    — cloning bitnet.cpp repo + huggingface_hub's git backend
#   libgomp1               — onnxruntime
#   libcurl4-openssl-dev, libssl-dev, pkg-config
#                          — NOTE(review): purpose is not documented in this
#                            file; confirm what needs them before removing
RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
        build-essential \
        ca-certificates \
        clang \
        cmake \
        curl \
        g++ \
        git \
        libcurl4-openssl-dev \
        libgomp1 \
        libssl-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces convention: run as a non-root account with UID 1000.
RUN useradd --create-home --uid 1000 user
USER user

# HOME and PATH point at the new account so that `pip install --user`
# entry points under ~/.local/bin are found. The two PYTHON* switches
# turn off .pyc generation and stdout/stderr buffering.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}

# ── bitnet.cpp clone + build ────────────────────────────────────────
# Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork).
#
# BITNET_REF optionally pins the checkout to a commit SHA, tag, or branch
# for reproducible builds, e.g. `--build-arg BITNET_REF=<sha>` or by
# editing the default below. When it is empty (the default) nothing is
# checked out after the clone, so the build uses whatever the upstream
# default branch currently points at — the previous, unpinned behaviour.
# After switching refs the submodules are re-synced so the llama.cpp fork
# matches the pinned superproject commit.
ARG BITNET_REF=""
WORKDIR /home/user/bitnet
RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet \
    && if [ -n "${BITNET_REF}" ]; then \
         git -C /home/user/bitnet checkout "${BITNET_REF}" \
         && git -C /home/user/bitnet submodule update --init --recursive; \
       fi

# Python packages required by BitNet's own build/utility tooling (gguf,
# huggingface_hub, numpy, torch-cpu, etc.). Their conversion + download
# scripts use these; the compile step bypasses those scripts, but the
# packages are kept for the ggml tools.
RUN pip install --user --no-cache-dir \
        --requirement /home/user/bitnet/requirements.txt

# Patch upstream const-correctness bug in ggml-bitnet-mad.cpp.
# Modern gcc-14 / clang-19 rejects `int8_t * y_col = y + col * by;`
# because y is declared `const int8_t *` a few lines above. These
# vec-dot functions only READ y, so const-qualifying y_col is the
# safe + minimal fix. Upstream likely builds with an older compiler
# that downgraded this to a warning. Remove this patch if/when
# microsoft/BitNet fixes this in their source.
#
# The `/const .../!` address makes the substitution skip lines that are
# already const-qualified. Without it the pattern also matches inside
# `const int8_t * y_col = ...` and would rewrite it to
# `const const int8_t * ...` once upstream ships the fix (or if the
# patch is ever applied twice).
#
# The grep only reports how many lines carry the fix, so its exit status
# is ignored. The `|| true` is scoped to the grep alone: a failing sed
# (e.g. the file moved upstream) now fails the build instead of being
# silently swallowed.
RUN sed -i \
    '/const int8_t \* y_col = y + col \* by;/!s|int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \
    /home/user/bitnet/src/ggml-bitnet-mad.cpp \
    && { grep -c "const int8_t \* y_col" /home/user/bitnet/src/ggml-bitnet-mad.cpp || true; }

# Run setup_env.py — REQUIRED. It codegens bitnet-lut-kernels.h before
# cmake, which cmake can't build without. The quant flag (-q i2_s /
# tl1 / tl2) drives which ternary lookup table gets generated. On
# failure, dump every cmake + compile log we can find so the next
# build iteration has signal instead of just "check details in
# logs/compile.log" (useless in a headless container).
#
# CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so LLVM-heavy
# kernel compilation doesn't OOM the HF build runner. setup_env.py
# forwards this variable through to cmake.
#
# Declared as ARG rather than ENV because it is a build-only knob: an
# ARG is still exported into the environment of every later RUN step in
# this stage, but it is not baked into the environment of the running
# container, and it can be overridden with
# `--build-arg CMAKE_BUILD_PARALLEL_LEVEL=<n>` on larger build hosts.
ARG CMAKE_BUILD_PARALLEL_LEVEL=2

# setup_env.py does three things: (1) codegens bitnet-lut-kernels.h
# from the quant flag, (2) configures+builds llama-cli via cmake,
# (3) downloads safetensors + converts to GGUF + quantizes. Only
# steps 1-2 are load-bearing for us — step 3 is currently broken
# upstream (convert-hf-to-gguf-bitnet.py doesn't recognize
# BitNetForCausalLM architecture), and we don't need it anyway
# because the pre-converted GGUF is published at a separate HF repo.
#
# setup_env.py is therefore allowed to fail (`|| true`); the real
# success criterion is that the llama-cli binary exists. If it does
# not, every *.log under the checkout is dumped into the build output
# before the step exits 1, so a failed build carries its own diagnostics.
#
# Implementation notes:
#   - WORKDIR is restated so the relative `setup_env.py` path does not
#     rely on a `cd` inside the RUN.
#   - `-name "*.log"` already covers CMakeError.log, CMakeOutput.log and
#     compile.log, so those are not listed separately.
#   - `IFS= read -r` keeps log paths containing backslashes or
#     leading/trailing whitespace intact.
WORKDIR /home/user/bitnet
RUN (python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true) \
    && if [ ! -f /home/user/bitnet/build/bin/llama-cli ]; then \
         echo "==================== BINARY NOT BUILT ===================="; \
         find /home/user/bitnet -name "*.log" 2>/dev/null | \
           while IFS= read -r f; do echo "=== $f ===" && cat "$f" 2>/dev/null || true; done; \
         echo "==================== END LOGS ===================="; \
         exit 1; \
       fi \
    && echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)"

# Fetch both pre-built GGUF models straight from their published HF
# repos, sidestepping the broken local-conversion path that setup_env.py
# attempts. Only *.gguf files are pulled from each repo; the BitNet chat
# model is downloaded first, then the Falcon3 extractor.
RUN python -c "\
from huggingface_hub import snapshot_download as fetch; \
targets = ( \
    ('microsoft/bitnet-b1.58-2B-4T-gguf', '/home/user/models/bitnet-2b-gguf'), \
    ('tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF', '/home/user/models/falcon3-10b-gguf'), \
); \
[fetch(repo, local_dir=dest, allow_patterns=['*.gguf']) for repo, dest in targets]"

# Locations handed to app.py through the environment. After the
# snapshot_download step the GGUF files sit under
# /home/user/models/{bitnet-2b,falcon3-10b}-gguf/. Per the original note,
# BitnetCppClient.resolve_gguf locates the .gguf inside each directory
# via a recursive glob at runtime.
ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \
    BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \
    FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf

# ── Python app deps + repo ──────────────────────────────────────────
WORKDIR /app

# The dependency manifest is copied and installed ahead of the source
# tree — a separate layer, so it stays cached until requirements.txt
# itself changes.
COPY --chown=user:user requirements.txt /app/
RUN pip install --user --no-cache-dir --requirement /app/requirements.txt

# Bring in the rest of the repository, owned by the unprivileged user.
COPY --chown=user:user . /app/

# Install the ng_tract wheel vendored in the repo. --force-reinstall
# replaces any copy that is already present, and --no-deps leaves the
# previously installed dependency set untouched.
RUN pip install --user --no-cache-dir --no-deps --force-reinstall \
        /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl

# Gradio binds to all interfaces on port 7860.
ENV GRADIO_SERVER_PORT=7860 \
    GRADIO_SERVER_NAME=0.0.0.0
EXPOSE 7860/tcp

# Exec form: python is started directly, without a wrapping shell.
CMD ["python", "app.py"]