Faaz committed on
Commit
35fd5fc
·
1 Parent(s): f04f58b

Fix setup_mi300x.sh for Docker container on MI300X droplet

Browse files
Files changed (1) hide show
  1. setup_mi300x.sh +51 -22
setup_mi300x.sh CHANGED
@@ -1,13 +1,20 @@
1
  #!/bin/bash
2
  # ============================================================
3
  # MINDI 1.5 Vision-Coder — MI300X Setup Script
4
- # One command to set up everything on DigitalOcean AMD MI300X
 
 
 
 
 
 
5
  # ============================================================
6
  set -e
7
 
8
  echo "============================================================"
9
  echo " MINDI 1.5 Vision-Coder — MI300X Setup"
10
  echo " MINDIGENOUS.AI"
 
11
  echo "============================================================"
12
  echo ""
13
 
@@ -18,10 +25,20 @@ if [ -z "$HF_TOKEN" ]; then
18
  exit 1
19
  fi
20
 
21
- # ── Step 1: Install ROCm PyTorch ───────────────────────────────
22
- echo "[1/7] Installing ROCm PyTorch (ROCm 6.0) ..."
23
- pip install torch torchvision torchaudio \
24
- --index-url https://download.pytorch.org/whl/rocm6.0
 
 
 
 
 
 
 
 
 
 
25
 
26
  # ── Step 2: Get the full project from HF ──────────────────────
27
  echo ""
@@ -45,7 +62,7 @@ pip install wandb huggingface_hub accelerate
45
  # ── Step 4: Download training data from HF ─────────────────────
46
  echo ""
47
  echo "[4/7] Downloading training dataset ..."
48
- python -c "
49
  from huggingface_hub import snapshot_download
50
  import os
51
 
@@ -77,14 +94,14 @@ echo " val.jsonl: ${VAL_SIZE}"
77
  echo ""
78
  echo "[5/7] Setting environment variables ..."
79
 
80
- # ROCm / PyTorch settings
81
  export HSA_OVERRIDE_GFX_VERSION=11.0.0
82
  export PYTORCH_ROCM_ARCH="gfx942"
83
  export HIP_VISIBLE_DEVICES=0
84
  export TOKENIZERS_PARALLELISM=false
85
  export WANDB_PROJECT="mindi-1.5-vision-coder"
86
 
87
- # Create .env file
88
  cat > .env << EOF
89
  HF_TOKEN=${HF_TOKEN}
90
  HSA_OVERRIDE_GFX_VERSION=11.0.0
@@ -93,33 +110,45 @@ HIP_VISIBLE_DEVICES=0
93
  TOKENIZERS_PARALLELISM=false
94
  WANDB_PROJECT=mindi-1.5-vision-coder
95
  EOF
96
- echo " .env file created"
97
 
98
- # ── Step 6: Verify GPU detected ───────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
99
  echo ""
100
- echo "[6/7] Verifying GPU ..."
101
- python -c "
102
  import torch
103
  print(f' PyTorch version: {torch.__version__}')
104
  print(f' CUDA available: {torch.cuda.is_available()}')
105
  if torch.cuda.is_available():
106
  print(f' GPU name: {torch.cuda.get_device_name(0)}')
107
  vram = torch.cuda.get_device_properties(0).total_mem / (1024**3)
108
- print(f' VRAM: {vram:.1f} GB')
109
  print(f' ROCm backend: {torch.version.hip is not None}')
 
 
 
 
 
 
 
 
 
110
  else:
111
- print(' WARNING: No GPU detected!')
112
  exit(1)
113
  "
114
 
115
- # Quick bf16 test
116
- python -c "
117
- import torch
118
- x = torch.randn(100, 100, dtype=torch.bfloat16, device='cuda')
119
- y = torch.matmul(x, x.T)
120
- print(f' bf16 matmul test: PASSED (shape={y.shape})')
121
- "
122
-
123
  # ── Step 7: Create output directories ─────────────────────────
124
  echo ""
125
  echo "[7/7] Creating output directories ..."
 
1
  #!/bin/bash
2
  # ============================================================
3
  # MINDI 1.5 Vision-Coder — MI300X Setup Script
4
+ # Run INSIDE the Docker container on DigitalOcean AMD MI300X
5
+ #
6
+ # On the host first:
7
+ # docker exec -it rocm /bin/bash
8
+ # export HF_TOKEN=hf_your_token_here
9
+ # bash setup_mi300x.sh (if already cloned)
10
+ # OR wget + bash (if fresh)
11
  # ============================================================
12
  set -e
13
 
14
  echo "============================================================"
15
  echo " MINDI 1.5 Vision-Coder — MI300X Setup"
16
  echo " MINDIGENOUS.AI"
17
+ echo " (Docker container environment)"
18
  echo "============================================================"
19
  echo ""
20
 
 
25
  exit 1
26
  fi
27
 
28
+ # ── Step 1: Verify PyTorch + ROCm (already in Docker image) ───
29
+ echo "[1/7] Verifying PyTorch + ROCm (pre-installed in Docker) ..."
30
+ python3 -c "
31
+ import torch
32
+ v = torch.__version__
33
+ hip = torch.version.hip or 'None'
34
+ print(f' PyTorch: {v}')
35
+ print(f' ROCm/HIP: {hip}')
36
+ assert torch.cuda.is_available(), 'No GPU detected!'
37
+ print(f' GPU: {torch.cuda.get_device_name(0)}')
38
+ vram = torch.cuda.get_device_properties(0).total_mem / (1024**3)
39
+ print(f' VRAM: {vram:.0f} GB')
40
+ print(' [OK] PyTorch + ROCm verified')
41
+ "
42
 
43
  # ── Step 2: Get the full project from HF ──────────────────────
44
  echo ""
 
62
  # ── Step 4: Download training data from HF ─────────────────────
63
  echo ""
64
  echo "[4/7] Downloading training dataset ..."
65
+ python3 -c "
66
  from huggingface_hub import snapshot_download
67
  import os
68
 
 
94
  echo ""
95
  echo "[5/7] Setting environment variables ..."
96
 
97
+ # ROCm / PyTorch settings for MI300X
98
  export HSA_OVERRIDE_GFX_VERSION=11.0.0
99
  export PYTORCH_ROCM_ARCH="gfx942"
100
  export HIP_VISIBLE_DEVICES=0
101
  export TOKENIZERS_PARALLELISM=false
102
  export WANDB_PROJECT="mindi-1.5-vision-coder"
103
 
104
+ # Create .env file for the project
105
  cat > .env << EOF
106
  HF_TOKEN=${HF_TOKEN}
107
  HSA_OVERRIDE_GFX_VERSION=11.0.0
 
110
  TOKENIZERS_PARALLELISM=false
111
  WANDB_PROJECT=mindi-1.5-vision-coder
112
  EOF
 
113
 
114
+ # Also add to bashrc so env persists across docker exec sessions
115
+ grep -q "HSA_OVERRIDE_GFX_VERSION" ~/.bashrc 2>/dev/null || cat >> ~/.bashrc << 'ENVEOF'
116
+
117
+ # MINDI 1.5 MI300X environment
118
+ export HSA_OVERRIDE_GFX_VERSION=11.0.0
119
+ export PYTORCH_ROCM_ARCH=gfx942
120
+ export HIP_VISIBLE_DEVICES=0
121
+ export TOKENIZERS_PARALLELISM=false
122
+ export WANDB_PROJECT=mindi-1.5-vision-coder
123
+ ENVEOF
124
+ echo " .env file created + bashrc updated"
125
+
126
+ # ── Step 6: GPU stress test ────────────────────────────────────
127
  echo ""
128
+ echo "[6/7] Running GPU verification + bf16 test ..."
129
+ python3 -c "
130
  import torch
131
  print(f' PyTorch version: {torch.__version__}')
132
  print(f' CUDA available: {torch.cuda.is_available()}')
133
  if torch.cuda.is_available():
134
  print(f' GPU name: {torch.cuda.get_device_name(0)}')
135
  vram = torch.cuda.get_device_properties(0).total_mem / (1024**3)
136
+ print(f' VRAM: {vram:.0f} GB')
137
  print(f' ROCm backend: {torch.version.hip is not None}')
138
+ # bf16 matmul test
139
+ x = torch.randn(1000, 1000, dtype=torch.bfloat16, device='cuda')
140
+ y = torch.matmul(x, x.T)
141
+ print(f' bf16 matmul: PASSED (shape={y.shape})')
142
+ # Memory allocation test
143
+ big = torch.zeros(1024, 1024, 1024, dtype=torch.bfloat16, device='cuda') # ~2GB
144
+ print(f' 2GB alloc test: PASSED')
145
+ del big
146
+ torch.cuda.empty_cache()
147
  else:
148
+ print(' ERROR: No GPU detected!')
149
  exit(1)
150
  "
151
 
 
 
 
 
 
 
 
 
152
  # ── Step 7: Create output directories ─────────────────────────
153
  echo ""
154
  echo "[7/7] Creating output directories ..."