# Runtime setup for the stock pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel image.
# We avoid baking feather + mamba_ssm + htm_rust into a custom Docker image
# because build-time baking on HF's cpu-basic builder reliably corrupts CUDA
# state on the h200 at runtime ("Error 802: system not yet initialized" every
# time, even in a fresh python -c subprocess). Installing at runtime, on the
# h200 itself, avoids that path and keeps CUDA healthy.
#
# Trade-off: ~5-8 min cold start per job vs ~1 min for a baked image. The
# training run is 12h long, so the overhead is negligible.
set -euo pipefail
echo "[runtime] $(date -u +%H:%M:%S) starting feather runtime setup on $(hostname)"
# 1. Confirm CUDA before we do anything else.
python -c 'import torch; assert torch.cuda.is_available(), "cuda unavailable at runtime start"; print("[runtime] cuda OK —", torch.cuda.get_device_name(0))'
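# Sketch added here, not part of the original script: an independent, driver-level
# view of the GPU. torch.cuda.is_available() and nvidia-smi can disagree when the
# container's CUDA userspace and the host driver are mismatched, so printing both
# makes a later failure easier to diagnose. Assumes nvidia-smi is on PATH in this
# image; kept non-fatal so its absence does not abort the job.
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader \
  || echo "[runtime] nvidia-smi unavailable, skipping driver-level check"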
# 2. Install system build deps (rustup/build-essential for htm_rust).
apt-get update -qq
apt-get install -y -qq --no-install-recommends git curl ca-certificates build-essential pkg-config libssl-dev
# Rust toolchain for htm_rust
curl -sSf https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain stable
export PATH=/root/.cargo/bin:$PATH
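# Sketch, not in the original script: confirm the toolchain is actually on PATH before
# maturin needs it in step 7. Assumes rustup installed under /root/.cargo as exported
# above; under set -e a missing cargo fails the job here instead of mid-build.
cargo --version
rustc --version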
# 3. Install Python deps.
pip install --quiet --upgrade pip setuptools wheel
pip install --quiet \
    maturin \
    huggingface_hub \
    requests \
    pyarrow \
    rustbpe \
    pandas \
    tiktoken \
    pydantic \
    ninja \
    packaging \
    einops
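# Sketch, not part of the original flow: surface dependency conflicts now rather than
# mid-training. Kept non-fatal on the assumption that a metadata-only mismatch should
# not kill a 12h job by itself.
pip check || echo "[runtime] pip check reported conflicts (continuing)"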
# 4. Install mamba_ssm + causal_conv1d (prebuilt wheels, matching torch2.6/cu12).
pip install --quiet \
    'https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.1.post4/causal_conv1d-1.6.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' \
    'https://github.com/state-spaces/mamba/releases/download/v2.3.1/mamba_ssm-2.3.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl'
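# Sketch, an assumption rather than part of the original script: the wheel filenames
# pin cp311 / torch 2.6 / cu12, so fail fast if the interpreter or torch drifted, and
# confirm both distributions resolved. mamba_ssm is deliberately not imported yet; its
# stock __init__ is replaced in the next step.
python -c 'import sys, torch; from importlib.metadata import version; assert sys.version_info[:2] == (3, 11) and torch.__version__.startswith("2.6"), (sys.version, torch.__version__); print("[runtime] wheel tags match:", "causal_conv1d", version("causal_conv1d"), "/ mamba_ssm", version("mamba_ssm"))'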
# 5. Graft Mamba3 from main (pure Triton, not in v2.3.1 release).
SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm
BASE=https://raw.githubusercontent.com/state-spaces/mamba/main
curl -fsSL "$BASE/mamba_ssm/modules/mamba3.py" -o "$SITE/modules/mamba3.py"
mkdir -p "$SITE/ops/triton/mamba3"
for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py \
         mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py \
         mamba3_siso_step.py utils.py; do
  curl -fsSL "$BASE/mamba_ssm/ops/triton/mamba3/$f" -o "$SITE/ops/triton/mamba3/$f"
done
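# Sketch, not in the original script: byte-compile the grafted files. curl -f already
# guards against HTTP error pages, so this mainly catches a truncated transfer or an
# upstream rename on main before the training run depends on these modules.
python -m py_compile "$SITE/modules/mamba3.py" "$SITE"/ops/triton/mamba3/*.py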
# Replace the eager-init __init__.py with our minimal version.
cp /workspace/feather/hf_jobs/feather_h200_image/mamba_ssm_init.py "$SITE/__init__.py"
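# Sketch, an assumption about the minimal __init__: it should import cleanly without
# pulling in the eager CUDA/Triton paths the stock one does. If that assumption is
# wrong, failing here is still cheaper than failing at model-construction time.
python -c 'import mamba_ssm; print("[runtime] mamba_ssm import OK with minimal __init__")'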
# 6. Confirm CUDA still works after all installs.
python -c 'import torch; assert torch.cuda.is_available(), "cuda broken by installs"; print("[runtime] cuda OK after deps —", torch.cuda.get_device_name(0))'
# 7. Build + install htm_rust with sm_90 PTX (h200 arch).
cd /workspace/feather
export HTM_CUDA_ARCH=sm_90
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}
maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml 2>&1 | tail -5
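# Sketch, not in the original script: make sure the build actually produced a wheel
# before handing a glob to pip. With set -e, ls on an unmatched glob fails and stops
# the job here; it also surfaces any stale wheels left in the mounted workspace.
ls -l htm_rust/target/wheels/htm_rust-*.whl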
pip install --quiet htm_rust/target/wheels/htm_rust-*.whl
# 8. Sanity: cuda still alive after htm_rust install.
python -c 'import torch; assert torch.cuda.is_available(), "cuda broken by htm_rust"; import htm_rust; print("[runtime] htm_rust OK, cuda OK")'
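# Sketch, an addition beyond the original checks: is_available() only proves the
# driver answered; a tiny matmul forces a real kernel launch, which is where
# "Error 802"-style init failures tend to show up. Cheap relative to a 12h run.
python -c 'import torch; x = torch.randn(64, 64, device="cuda"); s = (x @ x).sum().item(); print(f"[runtime] gpu kernel launch OK (checksum {s:.3f})")'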
echo "[runtime] $(date -u +%H:%M:%S) runtime setup complete"