# NuWave HuggingFace Space — Docker image.
#
# Stack:
#   - BitNet b1.58 2B4T (GGUF, i2_s quant) for user-facing chat
#   - Falcon3-10B-Instruct 1.58bit (GGUF) for concept extraction
#   - bitnet.cpp (microsoft/BitNet) as the inference runtime
#   - ng_tract (Rust BTF) for substrate tracts
#   - gradio for the UI
#
# Why bitnet.cpp instead of transformers bf16: to actually deliver the
# "CPU-native ternary-weight inference" claim in NuWave's architecture
# docs. Running BitNet through transformers gives BitNet's training
# quality but none of its inference-efficiency benefits. The bitnet.cpp
# runtime uses specialized ternary kernels — ~16× memory reduction and
# major throughput gains on CPU, which is what the whole NuWave thesis
# rests on.
#
# Why Falcon3-10B for extraction: native BitNet 2B on greedy decoding
# collapses into repetition loops ("cycle, cycle, cycle") on enumeration
# tasks. Falcon3-10B-Instruct was properly instruct-tuned before being
# quantized to 1.58-bit, so it inherits Falcon's enumeration capability
# while still running on bitnet.cpp's fast kernels.

# Base image: slim CPython 3.12. The 3.12 interpreter matches the cp312 tag
# of the vendored ng_tract wheel installed near the end of this file.
FROM python:3.12-slim

# The apt-get and useradd steps that follow need root; stated explicitly
# rather than relying on the base image's default user.
USER root

# OS packages: the build toolchain for bitnet.cpp plus shared runtime libs.
# Kept one per line in alphabetical order so diffs stay readable.
#   build-essential        — make, etc.
#   ca-certificates, curl  — HTTPS downloads
#   clang, g++             — compiling bitnet.cpp's C++ kernels
#   cmake                  — bitnet.cpp build system
#   git                    — cloning bitnet.cpp repo + huggingface_hub's git backend
#   libgomp1               — onnxruntime
#   libcurl4-openssl-dev, libssl-dev, pkg-config
#                          — presumably curl/TLS support for the native
#                            build; TODO confirm they are still required
# The apt package lists are deleted in the same layer so they never reach
# the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        clang \
        cmake \
        curl \
        g++ \
        git \
        libcurl4-openssl-dev \
        libgomp1 \
        libssl-dev \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# HF Spaces convention: the container runs as a non-root account with
# UID 1000. Create it (with a home directory) and drop privileges; every
# later instruction in this file runs as this user.
RUN useradd --create-home --uid 1000 user
USER user

# HOME points at the new account's home directory, and ~/.local/bin is put
# first on PATH so console scripts from `pip install --user` are found.
ENV HOME=/home/user
ENV PATH=/home/user/.local/bin:${PATH}

# Python runtime switches: skip writing .pyc files, and do not buffer
# stdout/stderr so log lines appear immediately.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# ── bitnet.cpp clone + build ────────────────────────────────────────
# Clone microsoft/BitNet with submodules (bitnet.cpp's custom llama.cpp fork).
#
# BITNET_REF selects the commit, tag, or branch to build. It defaults to
# empty, which builds whatever the upstream default branch points at today
# (the behaviour this file has always had). For a reproducible build, set it
# to a known-good commit SHA, either by editing the default below or with
# `--build-arg BITNET_REF=<sha>`; bump it when a new release is needed.
ARG BITNET_REF=""
WORKDIR /home/user/bitnet
# After checking out a specific ref the submodules are re-synced, because the
# recursive clone initialised them at the revisions recorded by the default
# branch, which may differ from those recorded by BITNET_REF.
RUN git clone --recursive https://github.com/microsoft/BitNet.git /home/user/bitnet \
    && if [ -n "${BITNET_REF}" ]; then \
         git -C /home/user/bitnet checkout "${BITNET_REF}" \
         && git -C /home/user/bitnet submodule update --init --recursive; \
       fi

# Python packages that BitNet's own requirements.txt asks for (gguf,
# huggingface_hub, numpy, torch-cpu, etc.), installed into the user site
# under ~/.local. Upstream's conversion + download scripts use them; the
# build below bypasses those scripts, but the packages are kept for the
# ggml tools.
RUN pip install --user --no-cache-dir \
        --requirement /home/user/bitnet/requirements.txt

# Patch upstream const-correctness bug in ggml-bitnet-mad.cpp.
# Modern gcc-14 / clang-19 rejects `int8_t * y_col = y + col * by;`
# because y is declared `const int8_t *` a few lines above. These
# vec-dot functions only READ y, so const-qualifying y_col is the
# safe + minimal fix. Upstream likely builds with an older compiler
# that downgraded this to a warning. Remove this patch if/when
# microsoft/BitNet fixes this in their source.
#
# The substitution is guarded by a negated address (`/.../!s`): lines that
# already read `const int8_t * y_col = ...` are skipped. Without the guard
# the pattern would match the tail of an already-fixed line and rewrite it
# to `const const int8_t * ...`, breaking the build as soon as upstream
# lands the same fix.
#
# The trailing grep prints how many const-qualified y_col lines exist, as a
# build-log breadcrumb. The step is deliberately best-effort: `|| true`
# applies to the whole `sed && grep` chain, so neither a zero count nor a
# sed failure stops the build here. A real compile problem still surfaces
# in the later llama-cli existence check.
RUN sed -i \
    '/const int8_t \* y_col = y + col \* by;/!s|int8_t \* y_col = y + col \* by;|const int8_t * y_col = y + col * by;|g' \
    /home/user/bitnet/src/ggml-bitnet-mad.cpp \
    && grep -c "const int8_t \* y_col" /home/user/bitnet/src/ggml-bitnet-mad.cpp || true

# Build settings for the setup_env.py step that follows.
#
# setup_env.py is REQUIRED: it codegens bitnet-lut-kernels.h before cmake,
# which cmake can't build without. The quant flag (-q i2_s / tl1 / tl2)
# drives which ternary lookup table gets generated. On failure, the build
# step dumps every cmake + compile log it can find so the next build
# iteration has signal instead of just "check details in logs/compile.log"
# (useless in a headless container).
#
# CMAKE_BUILD_PARALLEL_LEVEL=2 caps compile parallelism so LLVM-heavy
# kernel compilation doesn't OOM the HF build runner. The variable reaches
# cmake through the environment that setup_env.py's child processes inherit.
#
# Declared with ARG rather than ENV because it is a build-only knob: ARG
# values are exported to the RUN steps after this line but are not baked
# into the runtime environment of the final container. It can also be
# overridden per build with `--build-arg CMAKE_BUILD_PARALLEL_LEVEL=<n>`.
ARG CMAKE_BUILD_PARALLEL_LEVEL=2

# Build llama-cli by running upstream's setup_env.py, which performs:
#   1. codegen of bitnet-lut-kernels.h from the quant flag,
#   2. cmake configure + build of llama-cli,
#   3. safetensors download, GGUF conversion and quantization.
# Only 1 and 2 matter here. Step 3 is currently broken upstream
# (convert-hf-to-gguf-bitnet.py doesn't recognize the BitNetForCausalLM
# architecture) and is unnecessary anyway, since the pre-converted GGUF is
# published at a separate HF repo.
#
# The script's exit status is therefore ignored (`|| true`), and success is
# judged solely by whether the llama-cli binary exists afterwards. If it is
# missing, every *.log file under the checkout is printed to the build
# output and the step fails with exit code 1, so codegen/compile failures
# remain hard errors with their logs visible.
RUN cd /home/user/bitnet \
    && { python setup_env.py --hf-repo microsoft/BitNet-b1.58-2B-4T -q i2_s || true; } \
    && if [ ! -f /home/user/bitnet/build/bin/llama-cli ]; then \
         echo "==================== BINARY NOT BUILT ===================="; \
         find /home/user/bitnet -name "CMakeError.log" -o -name "CMakeOutput.log" -o -name "compile.log" -o -name "*.log" 2>/dev/null \
           | while read f; do \
               echo "=== $f ==="; \
               cat "$f" 2>/dev/null || true; \
             done; \
         echo "==================== END LOGS ===================="; \
         exit 1; \
       fi \
    && echo "llama-cli built successfully: $(ls -la /home/user/bitnet/build/bin/llama-cli)"

# Fetch both pre-built GGUF model repos straight from the Hugging Face Hub,
# sidestepping the broken local-conversion path that setup_env.py attempts.
# Each (repo, destination) pair is downloaded with allow_patterns limited
# to *.gguf, so only the model files are pulled from each repo.
RUN python -c "from huggingface_hub import snapshot_download as fetch; \
[fetch(repo_id=repo, local_dir=dest, allow_patterns=['*.gguf']) for repo, dest in ( \
    ('microsoft/bitnet-b1.58-2B-4T-gguf', '/home/user/models/bitnet-2b-gguf'), \
    ('tiiuae/Falcon3-10B-Instruct-1.58bit-GGUF', '/home/user/models/falcon3-10b-gguf'), \
)]"

# Locations handed to app.py through the environment:
#   BITNET_CPP_BINARY         — the llama-cli executable built above
#   BITNET_CHAT_GGUF_DIR      — directory holding the BitNet 2B chat GGUF
#   FALCON_EXTRACTOR_GGUF_DIR — directory holding the Falcon3-10B GGUF
# The two model directories are the snapshot_download targets under
# /home/user/models/. BitnetCppClient.resolve_gguf finds the actual .gguf
# file inside each directory via recursive glob at runtime.
ENV BITNET_CPP_BINARY=/home/user/bitnet/build/bin/llama-cli \
    BITNET_CHAT_GGUF_DIR=/home/user/models/bitnet-2b-gguf \
    FALCON_EXTRACTOR_GGUF_DIR=/home/user/models/falcon3-10b-gguf

# ── Python app deps + repo ──────────────────────────────────────────
WORKDIR /app

# Copy only the dependency manifest and install from it before the rest of
# the source arrives, so this layer stays cached until requirements.txt
# itself changes. Packages go into the user site (~/.local).
COPY --chown=user:user requirements.txt /app/requirements.txt
RUN pip install --user --no-cache-dir \
        --requirement /app/requirements.txt

# Bring the whole build context into /app, owned by the runtime user.
COPY --chown=user:user . /app

# Install the ng_tract wheel that ships inside the repo. --force-reinstall
# replaces any copy that is already installed, and --no-deps installs the
# wheel alone without resolving its dependencies.
RUN pip install --user --no-cache-dir --no-deps --force-reinstall \
        /app/ng_tract-0.1.0-cp312-abi3-manylinux_2_34_x86_64.whl

# Gradio serves on port 7860 and binds to all interfaces so the server is
# reachable from outside the container. EXPOSE only documents the port.
EXPOSE 7860
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

# Exec-form CMD: python runs app.py directly, with no wrapping shell.
CMD ["python", "app.py"]