#!/bin/bash
# ============================================================
# MINDI 1.5 Vision-Coder — MI300X Setup Script
# Run INSIDE the Docker container on DigitalOcean AMD MI300X
#
# On the host first:
#   docker exec -it rocm /bin/bash
#   export HF_TOKEN=hf_your_token_here
#   bash setup_mi300x.sh   (if already cloned)
#   OR wget + bash         (if fresh)
#
# Required environment:
#   HF_TOKEN  HuggingFace access token used for the git clone
#             and for the dataset download.
# ============================================================

# Strict mode: abort on any failing command, on use of an unset
# variable, and on a failure in any stage of a pipeline.
set -euo pipefail

echo "============================================================"
echo " MINDI 1.5 Vision-Coder — MI300X Setup"
echo " MINDIGENOUS.AI"
echo " (Docker container environment)"
echo "============================================================"
echo ""

# ── Check HF_TOKEN ─────────────────────────────────────────────
# "${HF_TOKEN:-}" keeps this check working under `set -u` when the
# variable is not set at all.
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "ERROR: Set HF_TOKEN environment variable first!" >&2
  echo " export HF_TOKEN=hf_your_token_here" >&2
  exit 1
fi

# ── Step 1: Verify PyTorch + ROCm (already in Docker image) ───
echo "[1/7] Verifying PyTorch + ROCm (pre-installed in Docker) ..."
# Quoted heredoc: the Python source is passed verbatim, with no shell
# expansion. Any exception (including the assert) makes python3 exit
# non-zero, which aborts the script.
python3 - <<'PY'
import torch

v = torch.__version__
hip = torch.version.hip or 'None'
print(f' PyTorch: {v}')
print(f' ROCm/HIP: {hip}')
assert torch.cuda.is_available(), 'No GPU detected!'
print(f' GPU: {torch.cuda.get_device_name(0)}')
# The device-properties attribute is `total_memory` (bytes); convert to GiB.
vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f' VRAM: {vram:.0f} GB')
print(' [OK] PyTorch + ROCm verified')
PY

# ── Step 2: Get the full project from HF ──────────────────────
echo ""
echo "[2/7] Getting MINDI 1.5 from HuggingFace ..."
repo_name="MINDI-1.5-Vision-Coder"
if [[ -f "requirements.txt" ]]; then
  # Already inside a checkout: just update it.
  echo " Already in repo — pulling latest ..."
  git pull
elif [[ -d "${repo_name}/.git" ]]; then
  # Re-run from the parent directory of an earlier clone: reuse it
  # instead of failing because the clone target already exists.
  echo " Existing clone found — pulling latest ..."
  cd "${repo_name}"
  git pull
else
  # NOTE(review): the token is embedded in the remote URL, so git stores
  # it in .git/config of the clone. Later `git pull` runs rely on that;
  # consider a credential helper if the container is shared.
  git clone "https://Mindigenous:${HF_TOKEN}@huggingface.co/Mindigenous/${repo_name}"
  cd "${repo_name}"
fi

# ── Step 3: Install Python requirements ────────────────────────
echo ""
echo "[3/7] Installing Python requirements ..."
# Install the project's pinned requirements from the current directory.
pip install -r requirements.txt

# Additional training dependencies
pip install wandb huggingface_hub accelerate

# ── Step 4: Download training data from HF ─────────────────────
echo ""
echo "[4/7] Downloading training dataset ..."
# Quoted heredoc: Python source is passed verbatim. The token is read
# from the environment inside Python, so it never appears on a command line.
python3 - <<'PY'
import os

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id='Mindigenous/MINDI-1.5-training-data',
    repo_type='dataset',
    local_dir='data/',
    token=os.environ['HF_TOKEN'],
)
print('Dataset downloaded!')
PY

# Verify data files exist
echo " Checking data files ..."
for split in train val; do
  if [[ ! -f "data/processed/${split}.jsonl" ]]; then
    echo " ERROR: ${split}.jsonl not found!" >&2
    exit 1
  fi
done

# Human-readable sizes, reported here and in the final summary.
TRAIN_SIZE=$(du -sh data/processed/train.jsonl | cut -f1)
VAL_SIZE=$(du -sh data/processed/val.jsonl | cut -f1)
echo " train.jsonl: ${TRAIN_SIZE}"
echo " val.jsonl: ${VAL_SIZE}"

# ── Step 5: Set environment variables ──────────────────────────
echo ""
echo "[5/7] Setting environment variables ..."

# ROCm / PyTorch settings for MI300X
# NOTE: Do NOT set HSA_OVERRIDE_GFX_VERSION — ROCm 7.0 has native gfx942 support
export PYTORCH_ROCM_ARCH="gfx942"
export HIP_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
export WANDB_PROJECT="mindi-1.5-vision-coder"

# Create .env file for the project.
# The file holds the HF token, so restrict it to the owner BEFORE the
# secret is written into it.
touch .env
chmod 600 .env
cat > .env <<EOF
HF_TOKEN=${HF_TOKEN}
PYTORCH_ROCM_ARCH=gfx942
HIP_VISIBLE_DEVICES=0
TOKENIZERS_PARALLELISM=false
WANDB_PROJECT=mindi-1.5-vision-coder
EOF

# Also add to bashrc so env persists across docker exec sessions.
# The guard looks for the marker comment that the appended block itself
# contains, so re-running this script does not append duplicates.
bashrc_marker="# MINDI 1.5 MI300X environment"
if ! grep -qF -- "${bashrc_marker}" ~/.bashrc 2>/dev/null; then
  cat >> ~/.bashrc <<'ENVEOF'

# MINDI 1.5 MI300X environment
export PYTORCH_ROCM_ARCH=gfx942
export HIP_VISIBLE_DEVICES=0
export TOKENIZERS_PARALLELISM=false
export WANDB_PROJECT=mindi-1.5-vision-coder
ENVEOF
fi
echo " .env file created + bashrc updated"

# ── Step 6: GPU stress test ────────────────────────────────────
echo ""
echo "[6/7] Running GPU verification + bf16 test ..."
# GPU verification: report device info, then run a small bf16 matmul and
# a ~2GB allocation on the GPU. Exits non-zero when no GPU is visible.
# Quoted heredoc: the Python source is passed verbatim, no shell expansion.
python3 - <<'PY'
import sys

import torch

print(f' PyTorch version: {torch.__version__}')
print(f' CUDA available: {torch.cuda.is_available()}')
if not torch.cuda.is_available():
    print(' ERROR: No GPU detected!')
    sys.exit(1)

print(f' GPU name: {torch.cuda.get_device_name(0)}')
# The device-properties attribute is `total_memory` (bytes); convert to GiB.
vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f' VRAM: {vram:.0f} GB')
print(f' ROCm backend: {torch.version.hip is not None}')

# bf16 matmul test
x = torch.randn(1000, 1000, dtype=torch.bfloat16, device='cuda')
y = torch.matmul(x, x.T)
print(f' bf16 matmul: PASSED (shape={y.shape})')

# Memory allocation test: 1024^3 bf16 elements x 2 bytes = ~2GB
big = torch.zeros(1024, 1024, 1024, dtype=torch.bfloat16, device='cuda')
print(f' 2GB alloc test: PASSED')
del big
torch.cuda.empty_cache()
PY

# ── Step 7: Create output directories ─────────────────────────
echo ""
echo "[7/7] Creating output directories ..."
mkdir -p checkpoints/training checkpoints/best logs/training

# ── Done ───────────────────────────────────────────────────────
# Best-effort GPU name for the summary; falls back to N/A on any failure.
# Uses python3, matching the interpreter used for the GPU test above.
gpu_name=$(python3 -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null || echo 'N/A')

echo ""
echo "============================================================"
echo " MINDI 1.5 Vision-Coder — MI300X Ready!"
echo ""
echo " Project: $(pwd)"
# TRAIN_SIZE / VAL_SIZE are assigned earlier in the script; default to N/A
# so the summary still prints if they are unset.
echo " Data: ${TRAIN_SIZE:-N/A} train / ${VAL_SIZE:-N/A} val"
echo " GPU: ${gpu_name}"
echo ""
echo " Ready to train!"
echo " Run: python scripts/train.py --phase 1"
echo ""
echo " Or dry run first:"
echo " Run: python scripts/train.py --dry_run --no_wandb"
echo "============================================================"