icarus112 committed on
Commit
d39539e
·
verified ·
1 Parent(s): 6b3f06a

Upload Dockerfile with huggingface_hub

Browse files
Files changed (1) hide show
  1. Dockerfile +131 -0
Dockerfile ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
2
+
3
+ ARG HTM_CUDA_ARCH=sm_86
4
+ ARG TORCH_CUDA_ARCH_LIST=8.6
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive \
7
+ PIP_NO_CACHE_DIR=1 \
8
+ PYTHONUNBUFFERED=1 \
9
+ CARGO_HOME=/root/.cargo \
10
+ RUSTUP_HOME=/root/.rustup \
11
+ HTM_CUDA_ARCH=${HTM_CUDA_ARCH} \
12
+ TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} \
13
+ PATH=/root/.cargo/bin:${PATH}
14
+
15
+ RUN apt-get update && apt-get install -y --no-install-recommends \
16
+ git curl ca-certificates build-essential pkg-config libssl-dev && \
17
+ rm -rf /var/lib/apt/lists/*
18
+
19
+ # Install the Rust toolchain (minimal profile, stable channel) via rustup.
+ # The installer is saved to a file before it is run: with `curl ... | bash`, /bin/sh
+ # reports only bash's exit status, so a failed download would feed bash an empty
+ # script and this layer would "succeed" without Rust. --proto/--tlsv1.2 refuse any
+ # non-HTTPS or downgraded transfer.
+ RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh && \
+ bash /tmp/rustup-init.sh -y --profile minimal --default-toolchain stable && \
+ rm -f /tmp/rustup-init.sh
20
+
21
+ RUN pip install --upgrade pip setuptools wheel && \
22
+ pip install \
23
+ maturin \
24
+ huggingface_hub \
25
+ datasets \
26
+ requests \
27
+ pyarrow \
28
+ rustbpe \
29
+ pandas \
30
+ tiktoken \
31
+ pydantic \
32
+ ninja \
33
+ packaging \
34
+ einops
35
+
36
+ # Mamba-3 fused CUDA kernel stack (mandatory — NO fallback allowed).
37
+ #
38
+ # We install PRE-BUILT manylinux wheels from the official state-spaces/mamba
39
+ # and Dao-AILab/causal-conv1d GitHub releases. Compiling mamba_ssm from source
40
+ # on HF Spaces' cpu-basic builder (~16GB RAM) OOMKills even with MAX_JOBS=1 —
41
+ # nvcc on the templated selective-scan/chunk-scan kernels needs 8–12GB per TU.
42
+ #
43
+ # Wheel selection for base image pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel:
44
+ # - Python 3.11 (cp311) — matches PyTorch 2.6.0 image
45
+ # - CUDA 12.x wheels (cu12) — matches host CUDA 12.4
46
+ # - PyTorch 2.6 ABI (torch2.6) — exact torch match
47
+ # - cxx11abiFALSE — standard PyTorch pip build
48
+ #
49
+ # Versions: mamba_ssm 2.3.1 (latest release; Mamba3 is grafted in Step B) + causal_conv1d
50
+ # 1.6.1.post4 (matching ABI). Both are CUDA-compiled, no build toolchain needed
51
+ # on the Space builder.
52
+ #
53
+ # Step A: install the published v2.3.1 prebuilt wheel (compiled CUDA ops
54
+ # for selective_scan, layernorm_gated, ssd_*, causal_conv1d, etc).
55
+ RUN pip install \
56
+ 'https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.1.post4/causal_conv1d-1.6.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' \
57
+ 'https://github.com/state-spaces/mamba/releases/download/v2.3.1/mamba_ssm-2.3.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' && \
58
+ python -c "import importlib.metadata as m; print('installed mamba_ssm=' + m.version('mamba_ssm') + ' causal_conv1d=' + m.version('causal_conv1d'))"
59
+
60
+ #
61
+ # Step B: graft the Mamba3 class + its pure-Triton ops subtree from mamba-ssm
62
+ # main. v2.3.1 is the latest release but Mamba3 landed post-release; the new
63
+ # files under ops/triton/mamba3/ are ALL pure Python @triton.jit kernels with
64
+ # zero compiled-CUDA dependencies (verified: every import in that subtree is
65
+ # triton/torch/python — no .so files, no nvcc). So we install the v2.3.1 wheel
66
+ # (for its compiled ops) and overlay the main-branch Mamba3 sources on top.
67
+ #
68
+ # This avoids the source-build OOM on the cpu-basic HF Space builder and the
69
+ # missing-file error the smoke test hit on the last attempt.
70
+ # Download grafted mamba3 module + triton ops subtree
71
+ # Overlay mamba3.py and the ops/triton/mamba3/ sources from the upstream repo onto
+ # the installed mamba_ssm package directory ($SITE).
+ # The `|| exit 1` inside the loop is required: a for-loop's exit status is that of
+ # its LAST command only, so without it a failed download of any file but the last
+ # would be ignored and the layer would be cached with a partial graft.
+ # NOTE(review): BASE tracks the moving `main` branch, so rebuilds may fetch different
+ # sources over time — consider pinning a commit SHA once a known-good one is chosen.
+ RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
72
+ BASE=https://raw.githubusercontent.com/state-spaces/mamba/main && \
73
+ curl -fsSL "$BASE/mamba_ssm/modules/mamba3.py" -o "$SITE/modules/mamba3.py" && \
74
+ mkdir -p "$SITE/ops/triton/mamba3" && \
75
+ for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
76
+ curl -fsSL "$BASE/mamba_ssm/ops/triton/mamba3/$f" -o "$SITE/ops/triton/mamba3/$f" || exit 1; \
77
+ done
78
+
79
+ # Replace mamba_ssm/__init__.py with a minimal one that only imports Mamba3
80
+ # (pure-Triton, works). The shipped __init__.py eagerly imports
81
+ # selective_scan_cuda.so which has a libtorch C++ ABI mismatch on this base
82
+ # image ("undefined symbol: _ZN3c107WarningC1E..."). Since training only needs
83
+ # Mamba3 (grafted from main), we skip all compiled-CUDA imports.
84
+ COPY mamba_ssm_init.py /opt/conda/lib/python3.11/site-packages/mamba_ssm/__init__.py
85
+
86
+ # Structural check (no triton init — triton has no GPU on the builder)
87
+ RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
88
+ test -f "$SITE/modules/mamba3.py" && \
89
+ test -f "$SITE/ops/triton/mamba3/mamba3_siso_combined.py" && \
90
+ test -s "$SITE/__init__.py" && \
91
+ echo "mamba3 graft + __init__ override verified"
92
+
93
+ # Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
94
+ RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
95
+
96
+ # Triton version decision: FORCE 3.5.1 β€” the only version with both mamba3
97
+ # APIs (set_allocator + tl.make_tensor_descriptor). torch 2.6's _inductor
98
+ # imports AttrsDescriptor from triton.compiler.compiler which was removed in
99
+ # triton 3.4+, but mamba_ssm/__init__.py shims AttrsDescriptor as a stub
100
+ # before any torch._inductor import path runs, so the incompatibility is
101
+ # neutralized. Build-time assert verifies mamba3's two required APIs.
102
+ RUN pip install --force-reinstall --no-deps 'triton==3.5.1' && \
103
+ python -c "import triton; from triton import language as tl; \
104
+ assert hasattr(triton, 'set_allocator'), 'missing triton.set_allocator'; \
105
+ assert hasattr(tl, 'make_tensor_descriptor'), 'missing tl.make_tensor_descriptor'; \
106
+ print(f'triton={triton.__version__} set_allocator+make_tensor_descriptor OK, AttrsDescriptor shimmed in mamba_ssm/__init__.py')"
107
+
108
+ WORKDIR /workspace
109
+ COPY overlay /workspace/feather
110
+ COPY overlay/scripts /app/scripts
111
+ COPY entrypoint.py /app/entrypoint.py
112
+ WORKDIR /workspace/feather
113
+
114
+ RUN test -f /app/scripts/htm_gpu_micro_canary.py && \
115
+ python -m py_compile hydra/training.py prepare.py train.py /app/scripts/htm_gpu_micro_canary.py && \
116
+ bash -n scripts/run_domain_expanded_pretrain.sh
117
+
118
+ RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} && \
119
+ echo "building htm_rust GPU kernels for HTM_CUDA_ARCH=${HTM_CUDA_ARCH} TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" && \
120
+ if maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml; then \
121
+ pip install htm_rust/target/wheels/htm_rust-*.whl && \
122
+ python -c "import htm_rust; assert hasattr(htm_rust, 'HTMRegionGpu'), 'htm_rust missing HTMRegionGpu GPU binding'"; \
123
+ else \
124
+ echo "[dockerfile] htm_rust GPU wheel build failed; building CPU wheel so A10 compromise/fresh-eval jobs can still run with explicit CPU fallback" && \
125
+ rm -rf htm_rust/target/wheels && \
126
+ maturin build --release --manifest-path htm_rust/Cargo.toml && \
127
+ pip install htm_rust/target/wheels/htm_rust-*.whl && \
128
+ python -c "import htm_rust; assert hasattr(htm_rust, 'HTMRegion'), 'htm_rust missing CPU HTMRegion binding'"; \
129
+ fi
130
+
131
+ CMD ["python", "/app/entrypoint.py"]