#!/usr/bin/env bash
# feather-a10-runtime / runtime_setup.sh  (commit 3c48dfc,
# "Update Feather A10 Mamba3 smoke runtime image")
#
# Runtime setup for the stock pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel image.
# We avoid baking feather + mamba_ssm + htm_rust into a custom Docker image
# because build-time baking on HF's cpu-basic builder reliably corrupts CUDA
# state on h200 runtime ("Error 802: system not yet initialized" every time,
# even in a fresh python -c subprocess). Installing at runtime, on the h200
# itself, avoids that path and keeps CUDA healthy.
#
# Trade-off: ~5-8 min cold start per job vs ~1 min for a baked image. The
# training run is 12h long, so the overhead is negligible.
set -euo pipefail

echo "[runtime] $(date -u +%H:%M:%S) starting feather runtime setup on $(hostname)"

# 1. Confirm CUDA before we do anything else — fail fast rather than after a
#    multi-minute install if the GPU is broken or absent.
python -c 'import torch; assert torch.cuda.is_available(), "cuda unavailable at runtime start"; print("[runtime] cuda OK —", torch.cuda.get_device_name(0))'
# 2. Install system build deps (rustup/build-essential for htm_rust).
apt-get update -qq
apt-get install -y -qq --no-install-recommends git curl ca-certificates build-essential pkg-config libssl-dev
# Rust toolchain for htm_rust
curl -sSf https://sh.rustup.rs | bash -s -- -y --profile minimal --default-toolchain stable
export PATH=/root/.cargo/bin:$PATH
# 3. Install Python deps.
pip install --quiet --upgrade pip setuptools wheel
# Pure-python / prebuilt packages needed by feather and by the build steps below.
python_deps=(
  maturin
  huggingface_hub
  requests
  pyarrow
  rustbpe
  pandas
  tiktoken
  pydantic
  ninja
  packaging
  einops
  cuda-python
)
pip install --quiet "${python_deps[@]}"
# 4. Install mamba_ssm + causal_conv1d (prebuilt wheels, matching torch2.6/cu12).
causal_conv1d_whl='https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.1.post4/causal_conv1d-1.6.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl'
mamba_ssm_whl='https://github.com/state-spaces/mamba/releases/download/v2.3.1/mamba_ssm-2.3.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl'
pip install --quiet "$causal_conv1d_whl" "$mamba_ssm_whl"
# 5. Graft Mamba3 from the release commit compatible with torch 2.6 / Triton 3.2.
# Derive site-packages from the running interpreter instead of hardcoding
# python3.11, so a base-image Python bump can't silently graft into a dead tree.
SITE="$(python -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')/mamba_ssm"
BASE=https://raw.githubusercontent.com/state-spaces/mamba/5235bdcd3fca41e336f17322acbfe8d8abb6c93f
# Fetch helper: -f turns HTTP errors into non-zero exits (so set -e aborts);
# --retry rides out transient raw.githubusercontent.com hiccups that would
# otherwise kill the whole job over one flaky download.
fetch() { curl -fsSL --retry 3 --retry-delay 2 -o "$2" "$1"; }
fetch "$BASE/mamba_ssm/modules/mamba3.py" "$SITE/modules/mamba3.py"
mkdir -p "$SITE/ops/triton/mamba3" "$SITE/ops/tilelang/mamba3" "$SITE/ops/cute/mamba3"
for f in angle_cumsum.py k_activations.py layer_norm.py layernorm_gated.py \
         selective_state_update.py softplus.py ssd_bmm.py ssd_chunk_scan.py \
         ssd_chunk_state.py ssd_combined.py ssd_state_passing.py; do
  fetch "$BASE/mamba_ssm/ops/triton/$f" "$SITE/ops/triton/$f"
done
for f in angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py \
         mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py \
         mamba3_siso_step.py utils.py; do
  fetch "$BASE/mamba_ssm/ops/triton/mamba3/$f" "$SITE/ops/triton/mamba3/$f"
done
for f in mamba3_mimo.py mamba3_mimo_bwd.py mamba3_mimo_fwd.py; do
  fetch "$BASE/mamba_ssm/ops/tilelang/mamba3/$f" "$SITE/ops/tilelang/mamba3/$f"
done
fetch "$BASE/mamba_ssm/ops/cute/mamba3/mamba3_step_fn.py" "$SITE/ops/cute/mamba3/mamba3_step_fn.py"
# Package markers so the grafted subtrees are importable.
touch "$SITE/ops/triton/mamba3/__init__.py" "$SITE/ops/tilelang/__init__.py" \
      "$SITE/ops/tilelang/mamba3/__init__.py" "$SITE/ops/cute/__init__.py" \
      "$SITE/ops/cute/mamba3/__init__.py"
# Patch mamba3.py so the optional CUTLASS/CuTe step kernel is imported lazily.
# The asserts make a textual-patch miss fail LOUDLY here: a silent str.replace
# no-op would otherwise surface much later as a confusing import/runtime error.
python - <<'PY'
from pathlib import Path

path = Path('/opt/conda/lib/python3.11/site-packages/mamba_ssm/modules/mamba3.py')
text = path.read_text()

# NOTE(review): the whitespace inside these anchor/replacement strings must
# match the pinned upstream commit exactly -- if an assert fires, diff against
# $BASE/mamba_ssm/modules/mamba3.py before touching anything else.
old_import = 'from mamba_ssm.ops.cute.mamba3.mamba3_step_fn import mamba3_step_fn'
assert old_import in text, 'mamba3.py: cute step_fn import anchor not found; upstream drifted?'
text = text.replace(
    old_import,
    'try:\n from mamba_ssm.ops.cute.mamba3.mamba3_step_fn import mamba3_step_fn\nexcept Exception:\n mamba3_step_fn = None',
)

old_inproj = ' # in_proj\n zxBCdt = self.in_proj(u)'
assert old_inproj in text, 'mamba3.py: in_proj anchor not found; upstream drifted?'
text = text.replace(
    old_inproj,
    ' if mamba3_step_fn is None:\n raise RuntimeError("Mamba3 step() requires optional CUTLASS/CuTe dependencies")\n\n # in_proj\n zxBCdt = self.in_proj(u)',
)

path.write_text(text)
PY
# Drop in our torch fallback for the Triton SISO-combined kernel, then make
# sure the copied file at least parses before anything imports it.
assets_dir=/workspace/feather/hf_jobs/feather_h200_image
cp "$assets_dir/mamba3_siso_combined_torch_fallback.py" \
   "$SITE/ops/triton/mamba3/mamba3_siso_combined.py"
python -m py_compile "$SITE/ops/triton/mamba3/mamba3_siso_combined.py"
# torch 2.6-compatible Triton; newer Triton breaks A10 driver discovery on HF Jobs.
pip install --quiet --force-reinstall --no-deps 'triton==3.2.0'
# Replace the eager-init __init__.py with our minimal version.
cp "$assets_dir/mamba_ssm_init.py" "$SITE/__init__.py"
# 6. Confirm CUDA still works after all installs.
python -c 'import torch; assert torch.cuda.is_available(), "cuda broken by installs"; print("[runtime] cuda OK after deps —", torch.cuda.get_device_name(0))'
# 7. Build + install htm_rust. Default is A10G sm_86; override for H200 with HTM_CUDA_ARCH=sm_90.
cd /workspace/feather
export HTM_CUDA_ARCH="${HTM_CUDA_ARCH:-sm_86}"
export LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}"
# pipefail makes a maturin failure abort even through tail; tail keeps logs short.
maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml 2>&1 | tail -5
# The glob can match stale wheels left over from an earlier build in the same
# workspace; install only the most recently built one.
wheel="$(ls -t htm_rust/target/wheels/htm_rust-*.whl | head -1)"
pip install --quiet "$wheel"
# 8. Sanity: cuda still alive after htm_rust install.
python -c 'import torch; assert torch.cuda.is_available(), "cuda broken by htm_rust"; import htm_rust; print("[runtime] htm_rust OK, cuda OK")'
echo "[runtime] $(date -u +%H:%M:%S) runtime setup complete"