MINDI-1.5-Vision-Coder / setup_mi300x.sh
Faaz
Remove HSA_OVERRIDE_GFX_VERSION - ROCm 7.0 native MI300X support
4a33f96
#!/bin/bash
# ============================================================
# MINDI 1.5 Vision-Coder — MI300X Setup Script
# Run INSIDE the Docker container on DigitalOcean AMD MI300X
#
# On the host first:
# docker exec -it rocm /bin/bash
# export HF_TOKEN=hf_your_token_here
# bash setup_mi300x.sh (if already cloned)
# OR wget + bash (if fresh)
# ============================================================
# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

echo "============================================================"
echo " MINDI 1.5 Vision-Coder — MI300X Setup"
echo " MINDIGENOUS.AI"
echo " (Docker container environment)"
echo "============================================================"
echo ""

# ── Check HF_TOKEN ─────────────────────────────────────────────
# The token is used further down to clone the repo, download the dataset and
# populate .env. ${HF_TOKEN:-} keeps this check (and its friendly message)
# working under `set -u` when the variable was never exported.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "ERROR: Set HF_TOKEN environment variable first!" >&2
  echo " export HF_TOKEN=hf_your_token_here" >&2
  exit 1
fi
# ── Step 1: Verify PyTorch + ROCm (already in Docker image) ───
# Prints the PyTorch and HIP versions plus the name and memory size of GPU 0.
# The assert makes python3 exit non-zero when no GPU is visible, which stops
# the script because errexit is enabled at the top of the file.
echo "[1/7] Verifying PyTorch + ROCm (pre-installed in Docker) ..."
python3 -c "
import torch

v = torch.__version__
hip = torch.version.hip or 'None'
print(f' PyTorch: {v}')
print(f' ROCm/HIP: {hip}')
assert torch.cuda.is_available(), 'No GPU detected!'
print(f' GPU: {torch.cuda.get_device_name(0)}')
# The device-properties field is total_memory (bytes); total_mem is not an attribute.
vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f' VRAM: {vram:.0f} GB')
print(' [OK] PyTorch + ROCm verified')
"
# ── Step 2: Get the full project from HF ──────────────────────
# Three cases:
#   1. requirements.txt in the current directory -> treated as "already inside
#      the repo"; just pull.
#   2. A previous clone exists as a subdirectory (e.g. an earlier run failed in
#      a later step) -> enter it and pull instead of failing on git clone.
#   3. Otherwise -> fresh clone, then enter it.
# NOTE(review): the token is embedded in the clone URL, so git records it as
# the remote URL in the clone's .git/config and it appears in the process
# arguments. Consider a credential helper if that exposure matters.
echo ""
echo "[2/7] Getting MINDI 1.5 from HuggingFace ..."
REPO_NAME="MINDI-1.5-Vision-Coder"
if [[ -f "requirements.txt" ]]; then
  echo " Already in repo — pulling latest ..."
  git pull
elif [[ -d "${REPO_NAME}/.git" ]]; then
  echo " Found existing clone — pulling latest ..."
  cd "${REPO_NAME}"
  git pull
else
  git clone "https://Mindigenous:${HF_TOKEN}@huggingface.co/Mindigenous/${REPO_NAME}"
  cd "${REPO_NAME}"
fi
# ── Step 3: Install Python requirements ────────────────────────
echo ""
echo "[3/7] Installing Python requirements ..."

# Everything listed in the repository's requirements.txt comes first.
pip install -r requirements.txt

# Extra packages installed on top of requirements.txt for training.
training_extras=(wandb huggingface_hub accelerate)
pip install "${training_extras[@]}"
# ── Step 4: Download training data from HF ─────────────────────
# Pulls the dataset repo into data/ and then confirms that both processed
# splits are present. TRAIN_SIZE and VAL_SIZE are reused by the final summary.
echo ""
echo "[4/7] Downloading training dataset ..."
python3 -c "
import os
from huggingface_hub import snapshot_download

download_args = dict(
    repo_id='Mindigenous/MINDI-1.5-training-data',
    repo_type='dataset',
    local_dir='data/',
    token=os.environ['HF_TOKEN'],
)
snapshot_download(**download_args)
print('Dataset downloaded!')
"

# Print the human-readable on-disk size (first column of du -sh) of one file.
human_size() {
  du -sh "$1" | cut -f1
}

# Verify data files exist (train first, then val)
echo " Checking data files ..."
for split in train val; do
  if [ ! -f "data/processed/${split}.jsonl" ]; then
    echo " ERROR: ${split}.jsonl not found!"
    exit 1
  fi
done

TRAIN_SIZE=$(human_size data/processed/train.jsonl)
VAL_SIZE=$(human_size data/processed/val.jsonl)
echo " train.jsonl: ${TRAIN_SIZE}"
echo " val.jsonl: ${VAL_SIZE}"
# ── Step 5: Set environment variables ──────────────────────────
echo ""
echo "[5/7] Setting environment variables ..."

# ROCm / PyTorch settings for MI300X
# NOTE: Do NOT set HSA_OVERRIDE_GFX_VERSION — ROCm 7.0 has native gfx942 support
export PYTORCH_ROCM_ARCH="gfx942"
export HIP_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
export WANDB_PROJECT="mindi-1.5-vision-coder"

# Create .env file for the project. The delimiter is unquoted on purpose so
# that ${HF_TOKEN} is expanded into the file.
cat > .env << EOF
HF_TOKEN=${HF_TOKEN}
PYTORCH_ROCM_ARCH=gfx942
HIP_VISIBLE_DEVICES=0
TOKENIZERS_PARALLELISM=false
WANDB_PROJECT=mindi-1.5-vision-coder
EOF
# .env now holds the access token, so make it readable by the owner only.
chmod 600 .env

#######################################
# Append the MINDI environment block to a shell rc file, at most once.
# The guard looks for the block's own marker comment, so repeated runs do not
# pile up duplicate blocks. A leftover HSA_OVERRIDE_GFX_VERSION line is only
# reported, never edited.
# Arguments: $1 - path of the rc file to update (created if missing)
# Outputs:   warning on stderr if the file mentions HSA_OVERRIDE_GFX_VERSION
# Returns:   0 if the block is present afterwards, non-zero if the append fails
#######################################
persist_mindi_env() {
  local rc_file=$1
  local marker='# MINDI 1.5 MI300X environment'

  if grep -q 'HSA_OVERRIDE_GFX_VERSION' "$rc_file" 2>/dev/null; then
    echo " WARNING: ${rc_file} still mentions HSA_OVERRIDE_GFX_VERSION; remove it by hand" >&2
  fi

  # 2>/dev/null: a missing rc file simply means "marker not present yet".
  if grep -qF -- "$marker" "$rc_file" 2>/dev/null; then
    return 0
  fi

  cat >> "$rc_file" << 'ENVEOF'
# MINDI 1.5 MI300X environment
export PYTORCH_ROCM_ARCH=gfx942
export HIP_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
export WANDB_PROJECT=mindi-1.5-vision-coder
ENVEOF
}

# Also add to bashrc so env persists across docker exec sessions
persist_mindi_env ~/.bashrc
echo " .env file created + bashrc updated"
# ── Step 6: GPU stress test ────────────────────────────────────
# Reports the device, then runs a bf16 matmul and a ~2GB allocation on GPU 0.
# python3 exits non-zero when no GPU is visible or any of the tensor
# operations raises.
echo ""
echo "[6/7] Running GPU verification + bf16 test ..."
python3 -c "
import torch

print(f' PyTorch version: {torch.__version__}')
print(f' CUDA available: {torch.cuda.is_available()}')
if not torch.cuda.is_available():
    print(' ERROR: No GPU detected!')
    # SystemExit needs no import and does not depend on the site-provided exit().
    raise SystemExit(1)

print(f' GPU name: {torch.cuda.get_device_name(0)}')
# The device-properties field is total_memory (bytes); total_mem is not an attribute.
vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f' VRAM: {vram:.0f} GB')
print(f' ROCm backend: {torch.version.hip is not None}')

# bf16 matmul test
x = torch.randn(1000, 1000, dtype=torch.bfloat16, device='cuda')
y = torch.matmul(x, x.T)
print(f' bf16 matmul: PASSED (shape={y.shape})')

# Memory allocation test: 1024^3 bf16 elements at 2 bytes each, ~2GB
big = torch.zeros(1024, 1024, 1024, dtype=torch.bfloat16, device='cuda')
print(f' 2GB alloc test: PASSED')
del big
torch.cuda.empty_cache()
"
# ── Step 7: Create output directories ─────────────────────────
# Creates the checkpoint and log directories relative to the current
# directory; -p makes this a no-op for directories that already exist.
echo ""
echo "[7/7] Creating output directories ..."
for out_dir in checkpoints/training checkpoints/best logs/training; do
  mkdir -p "${out_dir}"
done
# ── Done ───────────────────────────────────────────────────────
# Final summary. TRAIN_SIZE / VAL_SIZE were captured by the dataset check in
# step 4; the project path is whatever directory the script ended up in.
echo ""
echo "============================================================"
echo " MINDI 1.5 Vision-Coder — MI300X Ready!"
echo ""
echo " Project: $(pwd)"
echo " Data: ${TRAIN_SIZE} train / ${VAL_SIZE} val"
# Uses python3 like every other step (a bare python may not exist in the
# image); any failure of the lookup is deliberately reduced to N/A.
echo " GPU: $(python3 -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null || echo 'N/A')"
echo ""
echo " Ready to train!"
echo " Run: python scripts/train.py --phase 1"
echo ""
echo " Or dry run first:"
echo " Run: python scripts/train.py --dry_run --no_wandb"
echo "============================================================"