Upload Dockerfile with huggingface_hub
Browse files- Dockerfile +131 -0
Dockerfile
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: PyTorch 2.6.0 with CUDA 12.4 and cuDNN 9 (the "-devel" variant of the tag).
FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
|
| 2 |
+
|
| 3 |
+
# Build arg: CUDA architecture label echoed by the htm_rust GPU build step and
# re-exported as an environment variable by the ENV instruction that follows.
# NOTE(review): presumably read by the htm_rust build itself — confirm in htm_rust's build script.
ARG HTM_CUDA_ARCH=sm_86
|
| 4 |
+
# Build arg: compute-capability list, re-exported as the TORCH_CUDA_ARCH_LIST
# environment variable by the ENV instruction that follows.
# NOTE(review): presumably consumed by PyTorch CUDA extension builds — confirm.
ARG TORCH_CUDA_ARCH_LIST=8.6
|
| 5 |
+
|
| 6 |
+
# Environment for both the remaining build steps and the running container.
#   - Rust toolchain locations, with cargo's bin directory prepended to PATH
#   - non-interactive apt, no pip download cache, unbuffered Python output
#   - the two CUDA-arch build args re-exported under the same names
# All ${...} references are resolved against values from before this
# instruction, so the order of the assignments does not matter.
ENV CARGO_HOME=/root/.cargo \
    RUSTUP_HOME=/root/.rustup \
    PATH=/root/.cargo/bin:${PATH} \
    DEBIAN_FRONTEND=noninteractive \
    PIP_NO_CACHE_DIR=1 \
    PYTHONUNBUFFERED=1 \
    HTM_CUDA_ARCH=${HTM_CUDA_ARCH} \
    TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
| 14 |
+
|
| 15 |
+
# OS packages needed by later build steps. The apt package lists are removed in
# the same layer so they do not persist in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        libssl-dev \
        pkg-config \
 && rm -rf /var/lib/apt/lists/*
|
| 18 |
+
|
| 19 |
+
# Install the Rust toolchain (minimal profile, stable channel) with rustup.
# The installer is downloaded to a file and then executed, instead of being
# piped into bash: the default RUN shell has no pipefail, so with `curl | bash`
# a failed download hands bash an empty script, bash exits 0, and the build
# carries on without Rust. Chaining with && makes a download failure fatal.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh && \
    bash /tmp/rustup-init.sh -y --profile minimal --default-toolchain stable && \
    rm -f /tmp/rustup-init.sh
|
| 20 |
+
|
| 21 |
+
# Upgrade the packaging tools first, then install the Python dependencies in a
# single pip invocation (list kept alphabetical for easy diffing).
RUN pip install --upgrade pip setuptools wheel \
 && pip install \
        datasets \
        einops \
        huggingface_hub \
        maturin \
        ninja \
        packaging \
        pandas \
        pyarrow \
        pydantic \
        requests \
        rustbpe \
        tiktoken
|
| 35 |
+
|
| 36 |
+
# Mamba-3 fused CUDA kernel stack (mandatory — NO fallback allowed).
|
| 37 |
+
#
|
| 38 |
+
# We install PRE-BUILT manylinux wheels from the official state-spaces/mamba
|
| 39 |
+
# and Dao-AILab/causal-conv1d GitHub releases. Compiling mamba_ssm from source
|
| 40 |
+
# on HF Spaces' cpu-basic builder (~16GB RAM) OOMKills even with MAX_JOBS=1 —
|
| 41 |
+
# nvcc on the templated selective-scan/chunk-scan kernels needs 8–12GB per TU.
|
| 42 |
+
#
|
| 43 |
+
# Wheel selection for base image pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel:
|
| 44 |
+
# - Python 3.11 (cp311) — matches PyTorch 2.6.0 image
|
| 45 |
+
# - CUDA 12.x wheels (cu12) — matches host CUDA 12.4
|
| 46 |
+
# - PyTorch 2.6 ABI (torch2.6) — exact torch match
|
| 47 |
+
# - cxx11abiFALSE — standard PyTorch pip build
|
| 48 |
+
#
|
| 49 |
+
# Versions: mamba_ssm 2.3.1 (latest release; the Mamba3 class is grafted in Step B) + causal_conv1d
|
| 50 |
+
# 1.6.1.post4 (matching ABI). Both are CUDA-compiled, no build toolchain needed
|
| 51 |
+
# on the Space builder.
|
| 52 |
+
#
|
| 53 |
+
# Step A: install the published v2.3.1 prebuilt wheel (compiled CUDA ops
|
| 54 |
+
# for selective_scan, layernorm_gated, ssd_*, causal_conv1d, etc).
|
| 55 |
+
# Install both prebuilt wheels in one pip invocation, then print the installed
# versions. The versions are read from package metadata, so neither package is
# actually imported during this step.
RUN CAUSAL_CONV1D_WHEEL='https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.1.post4/causal_conv1d-1.6.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' && \
    MAMBA_SSM_WHEEL='https://github.com/state-spaces/mamba/releases/download/v2.3.1/mamba_ssm-2.3.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' && \
    pip install "$CAUSAL_CONV1D_WHEEL" "$MAMBA_SSM_WHEEL" && \
    python -c "import importlib.metadata as m; print('installed mamba_ssm=' + m.version('mamba_ssm') + ' causal_conv1d=' + m.version('causal_conv1d'))"
|
| 59 |
+
|
| 60 |
+
#
|
| 61 |
+
# Step B: graft the Mamba3 class + its pure-Triton ops subtree from mamba-ssm
|
| 62 |
+
# main. v2.3.1 is the latest release but Mamba3 landed post-release; the new
|
| 63 |
+
# files under ops/triton/mamba3/ are ALL pure Python @triton.jit kernels with
|
| 64 |
+
# zero compiled-CUDA dependencies (verified: every import in that subtree is
|
| 65 |
+
# triton/torch/python — no .so files, no nvcc). So we install the v2.3.1 wheel
|
| 66 |
+
# (for its compiled ops) and overlay the main-branch Mamba3 sources on top.
|
| 67 |
+
#
|
| 68 |
+
# This avoids the source-build OOM on the cpu-basic HF Space builder and the
|
| 69 |
+
# missing-file error the smoke hit on the last attempt.
|
| 70 |
+
# Download grafted mamba3 module + triton ops subtree
|
| 71 |
+
# Git ref of state-spaces/mamba that the Mamba3 sources are fetched from.
# Defaults to "main" (the previous hard-coded value); pass
# --build-arg MAMBA_GRAFT_REF=<commit sha> to make the graft reproducible.
ARG MAMBA_GRAFT_REF=main
# Fetch modules/mamba3.py and every file of ops/triton/mamba3/ into the
# installed mamba_ssm package.
# Each download inside the loop ends with `|| exit 1`: the exit status of a
# `for` loop is that of its last command only, so without it a failed download
# of any file but the last would go unnoticed and surface later as an
# ImportError at runtime.
RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
    BASE="https://raw.githubusercontent.com/state-spaces/mamba/${MAMBA_GRAFT_REF}" && \
    curl -fsSL "$BASE/mamba_ssm/modules/mamba3.py" -o "$SITE/modules/mamba3.py" && \
    mkdir -p "$SITE/ops/triton/mamba3" && \
    for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
        curl -fsSL "$BASE/mamba_ssm/ops/triton/mamba3/$f" -o "$SITE/ops/triton/mamba3/$f" || exit 1; \
    done
|
| 78 |
+
|
| 79 |
+
# Replace mamba_ssm/__init__.py with a minimal one that only imports Mamba3
|
| 80 |
+
# (pure-Triton, works). The shipped __init__.py eagerly imports
|
| 81 |
+
# selective_scan_cuda.so which has a libtorch C++ ABI mismatch on this base
|
| 82 |
+
# image ("undefined symbol: _ZN3c107WarningC1E..."). Since training only needs
|
| 83 |
+
# Mamba3 (grafted from main), we skip all compiled-CUDA imports.
|
| 84 |
+
COPY mamba_ssm_init.py /opt/conda/lib/python3.11/site-packages/mamba_ssm/__init__.py

# Structural check (no triton init — triton has no GPU on the builder).
# Verifies that mamba3.py and EVERY grafted file under ops/triton/mamba3/ exist,
# and that the overriding __init__.py is non-empty. Checking only a subset would
# let a partially downloaded graft pass the build and fail at import time.
RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
    test -f "$SITE/modules/mamba3.py" && \
    for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
        test -f "$SITE/ops/triton/mamba3/$f" || { echo "mamba3 graft: missing $SITE/ops/triton/mamba3/$f" >&2; exit 1; }; \
    done && \
    test -s "$SITE/__init__.py" && \
    echo "mamba3 graft + __init__ override verified"
|
| 92 |
+
|
| 93 |
+
# Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
|
| 94 |
+
# Best-effort install: a failure is logged and the build continues.
# The log message previously contained a mis-encoded character; it is restored
# to the intended dash so the build log reads correctly.
RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
|
| 95 |
+
|
| 96 |
+
# Triton version decision: FORCE 3.5.1 — the only version with both mamba3
|
| 97 |
+
# APIs (set_allocator + tl.make_tensor_descriptor). torch 2.6's _inductor
|
| 98 |
+
# imports AttrsDescriptor from triton.compiler.compiler which was removed in
|
| 99 |
+
# triton 3.4+, but mamba_ssm/__init__.py shims AttrsDescriptor as a stub
|
| 100 |
+
# before any torch._inductor import path runs, so the incompatibility is
|
| 101 |
+
# neutralized. Build-time assert verifies mamba3's two required APIs.
|
| 102 |
+
# Pin triton to exactly 3.5.1 without touching its dependencies, then fail the
# build unless both APIs that mamba3 relies on are present. The whole Python
# snippet is one logical line; the backslashes are Dockerfile continuations.
RUN pip install --force-reinstall --no-deps 'triton==3.5.1' \
 && python -c "import triton; from triton import language as tl; \
        assert hasattr(triton, 'set_allocator'), 'missing triton.set_allocator'; \
        assert hasattr(tl, 'make_tensor_descriptor'), 'missing tl.make_tensor_descriptor'; \
        print(f'triton={triton.__version__} set_allocator+make_tensor_descriptor OK, AttrsDescriptor shimmed in mamba_ssm/__init__.py')"
|
| 107 |
+
|
| 108 |
+
# Application payload:
#   entrypoint.py    -> /app/entrypoint.py
#   overlay/scripts  -> /app/scripts
#   overlay          -> /workspace/feather  (becomes the working directory)
# The three copies write to independent destinations, so their order does not
# affect the resulting filesystem.
WORKDIR /workspace
COPY entrypoint.py /app/entrypoint.py
COPY overlay/scripts /app/scripts
COPY overlay /workspace/feather
WORKDIR /workspace/feather
|
| 113 |
+
|
| 114 |
+
# Build-time sanity checks, run from the current WORKDIR:
#   1. the GPU micro-canary script was copied into /app/scripts,
#   2. the listed Python sources byte-compile (syntax check only),
#   3. the pretrain shell script parses (`bash -n` does not execute it).
RUN test -f /app/scripts/htm_gpu_micro_canary.py \
 && python -m py_compile \
        hydra/training.py \
        prepare.py \
        train.py \
        /app/scripts/htm_gpu_micro_canary.py \
 && bash -n scripts/run_domain_expanded_pretrain.sh
|
| 117 |
+
|
| 118 |
+
# Build and install the htm_rust extension with maturin.
#   - First attempt: `--features gpu`; on success the wheel is installed and the
#     build asserts that the HTMRegionGpu binding is present.
#   - If the GPU build command fails: the wheel directory is cleared, a build
#     without the gpu feature is made and installed, and the build asserts that
#     the HTMRegion binding is present.
# LD_LIBRARY_PATH is extended with ${VAR:+:$VAR} so that an unset or empty
# variable does not leave a trailing ':' — an empty entry makes the dynamic
# loader search the current directory.
RUN export LD_LIBRARY_PATH="/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" && \
    echo "building htm_rust GPU kernels for HTM_CUDA_ARCH=${HTM_CUDA_ARCH} TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" && \
    if maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml; then \
        pip install htm_rust/target/wheels/htm_rust-*.whl && \
        python -c "import htm_rust; assert hasattr(htm_rust, 'HTMRegionGpu'), 'htm_rust missing HTMRegionGpu GPU binding'"; \
    else \
        echo "[dockerfile] htm_rust GPU wheel build failed; building CPU wheel so A10 compromise/fresh-eval jobs can still run with explicit CPU fallback" && \
        rm -rf htm_rust/target/wheels && \
        maturin build --release --manifest-path htm_rust/Cargo.toml && \
        pip install htm_rust/target/wheels/htm_rust-*.whl && \
        python -c "import htm_rust; assert hasattr(htm_rust, 'HTMRegion'), 'htm_rust missing CPU HTMRegion binding'"; \
    fi
|
| 130 |
+
|
| 131 |
+
# Default command: run the entrypoint script. Exec (JSON-array) form is used, so
# python is started directly rather than through a shell.
CMD ["python", "/app/entrypoint.py"]
|