MINDI-1.5-Vision-Coder / setup_mi300x.sh

Faaz

Remove HSA_OVERRIDE_GFX_VERSION - ROCm 7.0 native MI300X support

4a33f96 about 1 month ago

6.17 kB

	#!/bin/bash
	# ============================================================
	# MINDI 1.5 Vision-Coder — MI300X Setup Script
	# Run INSIDE the Docker container on DigitalOcean AMD MI300X
	#
	# On the host first:
	# docker exec -it rocm /bin/bash
	# export HF_TOKEN=hf_your_token_here
	# bash setup_mi300x.sh (if already cloned)
	# OR wget + bash (if fresh)
	# ============================================================
	set -e

	echo "============================================================"
	echo " MINDI 1.5 Vision-Coder — MI300X Setup"
	echo " MINDIGENOUS.AI"
	echo " (Docker container environment)"
	echo "============================================================"
	echo ""

	# ── Check HF_TOKEN ─────────────────────────────────────────────
	if [ -z "$HF_TOKEN" ]; then
	echo "ERROR: Set HF_TOKEN environment variable first!"
	echo " export HF_TOKEN=hf_your_token_here"
	exit 1
	fi

	# ── Step 1: Verify PyTorch + ROCm (already in Docker image) ───
	echo "[1/7] Verifying PyTorch + ROCm (pre-installed in Docker) ..."
	python3 -c "
	import torch
	v = torch.__version__
	hip = torch.version.hip or 'None'
	print(f' PyTorch: {v}')
	print(f' ROCm/HIP: {hip}')
	assert torch.cuda.is_available(), 'No GPU detected!'
	print(f' GPU: {torch.cuda.get_device_name(0)}')
	vram = torch.cuda.get_device_properties(0).total_mem / (1024**3)
	print(f' VRAM: {vram:.0f} GB')
	print(' [OK] PyTorch + ROCm verified')
	"

	# ── Step 2: Get the full project from HF ──────────────────────
	echo ""
	echo "[2/7] Getting MINDI 1.5 from HuggingFace ..."
	if [ -f "requirements.txt" ]; then
	echo " Already in repo — pulling latest ..."
	git pull
	else
	git clone https://Mindigenous:${HF_TOKEN}@huggingface.co/Mindigenous/MINDI-1.5-Vision-Coder
	cd MINDI-1.5-Vision-Coder
	fi

	# ── Step 3: Install Python requirements ────────────────────────
	echo ""
	echo "[3/7] Installing Python requirements ..."
	pip install -r requirements.txt

	# Additional training dependencies
	pip install wandb huggingface_hub accelerate

	# ── Step 4: Download training data from HF ─────────────────────
	echo ""
	echo "[4/7] Downloading training dataset ..."
	python3 -c "
	from huggingface_hub import snapshot_download
	import os

	snapshot_download(
	repo_id='Mindigenous/MINDI-1.5-training-data',
	repo_type='dataset',
	local_dir='data/',
	token=os.environ['HF_TOKEN']
	)
	print('Dataset downloaded!')
	"

	# Verify data files exist
	echo " Checking data files ..."
	if [ ! -f "data/processed/train.jsonl" ]; then
	echo " ERROR: train.jsonl not found!"
	exit 1
	fi
	if [ ! -f "data/processed/val.jsonl" ]; then
	echo " ERROR: val.jsonl not found!"
	exit 1
	fi
	TRAIN_SIZE=$(du -sh data/processed/train.jsonl \| cut -f1)
	VAL_SIZE=$(du -sh data/processed/val.jsonl \| cut -f1)
	echo " train.jsonl: ${TRAIN_SIZE}"
	echo " val.jsonl: ${VAL_SIZE}"

	# ── Step 5: Set environment variables ──────────────────────────
	echo ""
	echo "[5/7] Setting environment variables ..."

	# ROCm / PyTorch settings for MI300X
	# NOTE: Do NOT set HSA_OVERRIDE_GFX_VERSION — ROCm 7.0 has native gfx942 support
	export PYTORCH_ROCM_ARCH="gfx942"
	export HIP_VISIBLE_DEVICES=0
	export TOKENIZERS_PARALLELISM=false
	export WANDB_PROJECT="mindi-1.5-vision-coder"

	# Create .env file for the project
	cat > .env << EOF
	HF_TOKEN=${HF_TOKEN}
	PYTORCH_ROCM_ARCH=gfx942
	HIP_VISIBLE_DEVICES=0
	TOKENIZERS_PARALLELISM=false
	WANDB_PROJECT=mindi-1.5-vision-coder
	EOF

	# Also add to bashrc so env persists across docker exec sessions
	grep -q "HSA_OVERRIDE_GFX_VERSION" ~/.bashrc 2>/dev/null \|\| cat >> ~/.bashrc << 'ENVEOF'

	# MINDI 1.5 MI300X environment
	export PYTORCH_ROCM_ARCH=gfx942
	export HIP_VISIBLE_DEVICES=0
	export TOKENIZERS_PARALLELISM=false
	export WANDB_PROJECT=mindi-1.5-vision-coder
	ENVEOF
	echo " .env file created + bashrc updated"

	# ── Step 6: GPU stress test ────────────────────────────────────
	echo ""
	echo "[6/7] Running GPU verification + bf16 test ..."
	python3 -c "
	import torch
	print(f' PyTorch version: {torch.__version__}')
	print(f' CUDA available: {torch.cuda.is_available()}')
	if torch.cuda.is_available():
	print(f' GPU name: {torch.cuda.get_device_name(0)}')
	vram = torch.cuda.get_device_properties(0).total_mem / (1024**3)
	print(f' VRAM: {vram:.0f} GB')
	print(f' ROCm backend: {torch.version.hip is not None}')
	# bf16 matmul test
	x = torch.randn(1000, 1000, dtype=torch.bfloat16, device='cuda')
	y = torch.matmul(x, x.T)
	print(f' bf16 matmul: PASSED (shape={y.shape})')
	# Memory allocation test
	big = torch.zeros(1024, 1024, 1024, dtype=torch.bfloat16, device='cuda') # ~2GB
	print(f' 2GB alloc test: PASSED')
	del big
	torch.cuda.empty_cache()
	else:
	print(' ERROR: No GPU detected!')
	exit(1)
	"

	# ── Step 7: Create output directories ─────────────────────────
	echo ""
	echo "[7/7] Creating output directories ..."
	mkdir -p checkpoints/training
	mkdir -p checkpoints/best
	mkdir -p logs/training

	# ── Done ───────────────────────────────────────────────────────
	echo ""
	echo "============================================================"
	echo " MINDI 1.5 Vision-Coder — MI300X Ready!"
	echo ""
	echo " Project: $(pwd)"
	echo " Data: ${TRAIN_SIZE} train / ${VAL_SIZE} val"
	echo " GPU: $(python -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null \|\| echo 'N/A')"
	echo ""
	echo " Ready to train!"
	echo " Run: python scripts/train.py --phase 1"
	echo ""
	echo " Or dry run first:"
	echo " Run: python scripts/train.py --dry_run --no_wandb"
	echo "============================================================"