# Provenance (captured from the Hugging Face Spaces file viewer; not part of
# the build recipe):
#   Space:       Executor-Tyrant-Framework / NuWave
#   Status:      Runtime error (at time of capture)
#   File:        NuWave / Dockerfile (6.92 kB)
#   Last commit: "Sync from GitHub: aedae31208bb57c61524b2d4f0fcd49d7bd2afba"
#                by Executor-Tyrant-Framework, eeba176 (verified), 3 months ago

	# NuWave HuggingFace Space — Docker image.
	#
	# Stack:
	# - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat
	# - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction
	# - bitnet.cpp (microsoft/BitNet) as the inference runtime
	# - ng_tract (Rust BTF) for substrate tracts
	# - gradio for the UI
	#
	# Why bitnet.cpp instead of transformers bf16: to actually deliver the
	# "CPU-native ternary-weight inference" claim in NuWave's architecture
	# docs. Running BitNet through transformers gives BitNet's training
	# quality but none of its inference-efficiency benefits. The bitnet.cpp
	# runtime uses specialized ternary kernels — ~16× memory reduction and
	# major throughput gains on CPU, which is what the whole NuWave thesis
	# rests on.
	#
	# Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding
	# collapses into repetition loops ("cycle, cycle, cycle") on enumeration
	# tasks. Falcon3-10B-Instruct was properly instruct-tuned before being
	# quantized to 1.58-bit — inherits Falcon's enumeration capability,
	# delivers through bitnet.cpp's fast kernels.

	FROM python:3.12-slim

	USER root

	# OS packages (alphabetical). Build toolchain for bitnet.cpp plus the
	# shared libraries needed at runtime:
	#   build-essential        — make and friends
	#   ca-certificates, curl  — HTTPS downloads
	#   clang, g++             — compilers for bitnet.cpp's C++ kernels
	#   cmake                  — bitnet.cpp build system
	#   git                    — cloning bitnet.cpp + huggingface_hub's git backend
	#   libgomp1               — onnxruntime
	#   libcurl4-openssl-dev, libssl-dev, pkg-config
	#                          — NOTE(review): purpose not stated in this file;
	#                            presumably build-time headers — confirm
	RUN apt-get update \
	    && apt-get install -y --no-install-recommends \
	        build-essential \
	        ca-certificates \
	        clang \
	        cmake \
	        curl \
	        g++ \
	        git \
	        libcurl4-openssl-dev \
	        libgomp1 \
	        libssl-dev \
	        pkg-config \
	    && rm -rf /var/lib/apt/lists/*

	# Non-root account with UID 1000, following the HF Spaces convention.
	# Everything below this point runs as that account.
	RUN useradd --create-home --uid 1000 user
	USER user

	# Home directory, plus the per-user bin dir where `pip install --user`
	# places console scripts.
	ENV HOME=/home/user \
	    PATH=/home/user/.local/bin:$PATH

	# Python runtime behaviour: no .pyc files, unbuffered stdout/stderr.
	ENV PYTHONDONTWRITEBYTECODE=1 \
	    PYTHONUNBUFFERED=1

	# ── bitnet.cpp clone + build ────────────────────────────────────────
	# Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork).
	#
	# BITNET_REF selects the revision that gets built. The default, HEAD,
	# keeps whatever the clone checked out (the upstream default branch), so
	# the build is NOT reproducible until this is set to a commit SHA or tag —
	# either by editing the default below or via `--build-arg BITNET_REF=<sha>`.
	# Submodules are re-synced after the checkout so they match the chosen
	# revision rather than the default branch.
	ARG BITNET_REF=HEAD
	WORKDIR /home/user/bitnet
	RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet \
	    && git -C /home/user/bitnet checkout "${BITNET_REF}" \
	    && git -C /home/user/bitnet submodule update --init --recursive

	# Python packages listed in BitNet's own requirements.txt (gguf,
	# huggingface_hub, numpy, torch-cpu, etc.). Upstream's conversion and
	# download scripts use them; the compile step does not go through those
	# scripts, but the packages are kept for the ggml tools.
	RUN pip install \
	        --user \
	        --no-cache-dir \
	        --requirement /home/user/bitnet/requirements.txt

	# Patch upstream const-correctness bug in ggml-bitnet-mad.cpp.
	# Modern gcc-14 / clang-19 rejects `int8_t * y_col = y + col * by;`
	# because y is declared `const int8_t *` a few lines above. These
	# vec-dot functions only READ y, so const-qualifying y_col is the
	# safe + minimal fix. Upstream likely builds with an older compiler
	# that downgraded this to a warning. Remove this patch if/when
	# microsoft/BitNet fixes this in their source.
	#
	# The patch is idempotent: the first expression strips an existing
	# `const` from the declaration, the second adds it back. Without that
	# normalisation the pattern would also match an already-fixed line and
	# produce `const const int8_t *`, breaking the build the day upstream
	# ships the fix.
	#
	# The trailing `|| true` keeps the step best-effort: grep -c prints the
	# number of patched declarations for the build log and exits non-zero
	# when there are none, which must not abort the build.
	RUN sed -i \
	        -e 's|const int8_t \* y_col = y + col \* by;|int8_t * y_col = y + col * by;|g' \
	        -e 's|int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \
	        /home/user/bitnet/src/ggml-bitnet-mad.cpp \
	    && grep -c "const int8_t \* y_col" /home/user/bitnet/src/ggml-bitnet-mad.cpp || true

	# Run setup_env.py — REQUIRED. It does three things:
	#   (1) codegens bitnet-lut-kernels.h, which cmake can't build without;
	#       the quant flag (-q i2_s / tl1 / tl2) drives which ternary lookup
	#       table gets generated,
	#   (2) configures + builds llama-cli via cmake,
	#   (3) downloads safetensors + converts to GGUF + quantizes.
	# Only steps 1-2 are load-bearing for us — step 3 is currently broken
	# upstream (convert-hf-to-gguf-bitnet.py doesn't recognize the
	# BitNetForCausalLM architecture), and we don't need it anyway because
	# the pre-converted GGUF is published at a separate HF repo.
	#
	# CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so LLVM-heavy
	# kernel compilation doesn't OOM the HF build runner. The variable is
	# inherited by the cmake process that setup_env.py launches.
	ENV CMAKE_BUILD_PARALLEL_LEVEL=2

	# setup_env.py is allowed to fail (`|| true`) because of the broken
	# convert step. The binary check right after it is the real gate: if
	# codegen or compile failed, llama-cli is missing and the build stops.
	# Before stopping, every *.log file under the checkout (CMakeError.log,
	# CMakeOutput.log, compile.log, ...) is dumped to the build output so the
	# next iteration has signal instead of just "check details in
	# logs/compile.log" (useless in a headless container).
	WORKDIR /home/user/bitnet
	RUN (python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true) \
	    && (test -f /home/user/bitnet/build/bin/llama-cli || \
	        (echo "==================== BINARY NOT BUILT ====================" && \
	         find /home/user/bitnet -name "*.log" -type f 2>/dev/null | \
	         while IFS= read -r f; do echo "=== $f ===" && cat "$f" 2>/dev/null || true; done ; \
	         echo "==================== END LOGS ====================" && \
	         exit 1)) \
	    && echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)"

	# Download BOTH GGUF files directly from their pre-built HF repos.
	# Avoids the broken local-conversion path that setup_env.py attempts.
	# Only *.gguf files are fetched from each repo.
	#
	# The Python code starts on the same line as `-c` on purpose: the
	# Dockerfile parser keeps the leading whitespace of continuation lines,
	# so a program that begins on an indented continuation line reaches
	# Python 3.12 with a leading indent and dies with IndentationError.
	# Whitespace after the first statement (and inside brackets) is harmless.
	RUN python -c "from huggingface_hub import snapshot_download; \
	    [snapshot_download(repo, local_dir=dest, allow_patterns=['*.gguf']) \
	     for repo, dest in ( \
	         ('microsoft/bitnet-b1.58-2B-4T-gguf', '/home/user/models/bitnet-2b-gguf'), \
	         ('tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF', '/home/user/models/falcon3-10b-gguf'), \
	     )]"

	# Locations handed to app.py through the environment:
	#   BITNET_CPP_BINARY          — the llama-cli binary built above
	#   BITNET_CHAT_GGUF_DIR       — snapshot dir of the BitNet 2B chat model
	#   FALCON_EXTRACTOR_GGUF_DIR  — snapshot dir of the Falcon3-10B extractor
	# The two *_GGUF_DIR values are directories, not files;
	# BitnetCppClient.resolve_gguf finds the .gguf inside them via recursive
	# glob at runtime.
	ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \
	    BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \
	    FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf

	# ── Python app deps + repo ──────────────────────────────────────────
	WORKDIR /app

	# The requirements file is copied on its own, ahead of the rest of the
	# repo, so this dependency layer stays cached until the file changes.
	COPY --chown=user:user requirements.txt ./requirements.txt
	RUN pip install \
	        --user \
	        --no-cache-dir \
	        --requirement /app/requirements.txt

	# Bring in the full repository, owned by the runtime user.
	COPY --chown=user:user . /app/

	# ng_tract ships as a wheel vendored in the repo. It is reinstalled
	# unconditionally and without dependency resolution.
	RUN pip install \
	        --user \
	        --no-cache-dir \
	        --no-deps \
	        --force-reinstall \
	        /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl

	# Gradio listens on all interfaces, port 7860.
	ENV GRADIO_SERVER_NAME=0.0.0.0
	ENV GRADIO_SERVER_PORT=7860
	EXPOSE 7860/tcp

	# Exec form: python runs as PID 1 and receives stop signals directly.
	CMD ["python", "app.py"]