| #!/bin/bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| set -e |
|
|
| echo "============================================================" |
| echo " MINDI 1.5 Vision-Coder β MI300X Setup" |
| echo " MINDIGENOUS.AI" |
| echo " (Docker container environment)" |
| echo "============================================================" |
| echo "" |
|
|
| |
| if [ -z "$HF_TOKEN" ]; then |
| echo "ERROR: Set HF_TOKEN environment variable first!" |
| echo " export HF_TOKEN=hf_your_token_here" |
| exit 1 |
| fi |
|
|
| |
| echo "[1/7] Verifying PyTorch + ROCm (pre-installed in Docker) ..." |
| python3 -c " |
| import torch |
| v = torch.__version__ |
| hip = torch.version.hip or 'None' |
| print(f' PyTorch: {v}') |
| print(f' ROCm/HIP: {hip}') |
| assert torch.cuda.is_available(), 'No GPU detected!' |
| print(f' GPU: {torch.cuda.get_device_name(0)}') |
| vram = torch.cuda.get_device_properties(0).total_mem / (1024**3) |
| print(f' VRAM: {vram:.0f} GB') |
| print(' [OK] PyTorch + ROCm verified') |
| " |
|
|
| |
| echo "" |
| echo "[2/7] Getting MINDI 1.5 from HuggingFace ..." |
| if [ -f "requirements.txt" ]; then |
| echo " Already in repo β pulling latest ..." |
| git pull |
| else |
| git clone https://Mindigenous:${HF_TOKEN}@huggingface.co/Mindigenous/MINDI-1.5-Vision-Coder |
| cd MINDI-1.5-Vision-Coder |
| fi |
|
|
| |
| echo "" |
| echo "[3/7] Installing Python requirements ..." |
| pip install -r requirements.txt |
|
|
| |
| pip install wandb huggingface_hub accelerate |
|
|
| |
| echo "" |
| echo "[4/7] Downloading training dataset ..." |
| python3 -c " |
| from huggingface_hub import snapshot_download |
| import os |
| |
| snapshot_download( |
| repo_id='Mindigenous/MINDI-1.5-training-data', |
| repo_type='dataset', |
| local_dir='data/', |
| token=os.environ['HF_TOKEN'] |
| ) |
| print('Dataset downloaded!') |
| " |
|
|
| |
| echo " Checking data files ..." |
| if [ ! -f "data/processed/train.jsonl" ]; then |
| echo " ERROR: train.jsonl not found!" |
| exit 1 |
| fi |
| if [ ! -f "data/processed/val.jsonl" ]; then |
| echo " ERROR: val.jsonl not found!" |
| exit 1 |
| fi |
| TRAIN_SIZE=$(du -sh data/processed/train.jsonl | cut -f1) |
| VAL_SIZE=$(du -sh data/processed/val.jsonl | cut -f1) |
| echo " train.jsonl: ${TRAIN_SIZE}" |
| echo " val.jsonl: ${VAL_SIZE}" |
|
|
| |
| echo "" |
| echo "[5/7] Setting environment variables ..." |
|
|
| |
| |
| export PYTORCH_ROCM_ARCH="gfx942" |
| export HIP_VISIBLE_DEVICES=0 |
| export TOKENIZERS_PARALLELISM=false |
| export WANDB_PROJECT="mindi-1.5-vision-coder" |
|
|
| |
| cat > .env << EOF |
| HF_TOKEN=${HF_TOKEN} |
| PYTORCH_ROCM_ARCH=gfx942 |
| HIP_VISIBLE_DEVICES=0 |
| TOKENIZERS_PARALLELISM=false |
| WANDB_PROJECT=mindi-1.5-vision-coder |
| EOF |
|
|
| |
| grep -q "HSA_OVERRIDE_GFX_VERSION" ~/.bashrc 2>/dev/null || cat >> ~/.bashrc << 'ENVEOF' |
|
|
| |
| export PYTORCH_ROCM_ARCH=gfx942 |
| export HIP_VISIBLE_DEVICES=0 |
| export TOKENIZERS_PARALLELISM=false |
| export WANDB_PROJECT=mindi-1.5-vision-coder |
| ENVEOF |
| echo " .env file created + bashrc updated" |
|
|
| |
| echo "" |
| echo "[6/7] Running GPU verification + bf16 test ..." |
| python3 -c " |
| import torch |
| print(f' PyTorch version: {torch.__version__}') |
| print(f' CUDA available: {torch.cuda.is_available()}') |
| if torch.cuda.is_available(): |
| print(f' GPU name: {torch.cuda.get_device_name(0)}') |
| vram = torch.cuda.get_device_properties(0).total_mem / (1024**3) |
| print(f' VRAM: {vram:.0f} GB') |
| print(f' ROCm backend: {torch.version.hip is not None}') |
| # bf16 matmul test |
| x = torch.randn(1000, 1000, dtype=torch.bfloat16, device='cuda') |
| y = torch.matmul(x, x.T) |
| print(f' bf16 matmul: PASSED (shape={y.shape})') |
| # Memory allocation test |
| big = torch.zeros(1024, 1024, 1024, dtype=torch.bfloat16, device='cuda') # ~2GB |
| print(f' 2GB alloc test: PASSED') |
| del big |
| torch.cuda.empty_cache() |
| else: |
| print(' ERROR: No GPU detected!') |
| exit(1) |
| " |
|
|
| |
| echo "" |
| echo "[7/7] Creating output directories ..." |
| mkdir -p checkpoints/training |
| mkdir -p checkpoints/best |
| mkdir -p logs/training |
|
|
| |
| echo "" |
| echo "============================================================" |
| echo " MINDI 1.5 Vision-Coder β MI300X Ready!" |
| echo "" |
| echo " Project: $(pwd)" |
| echo " Data: ${TRAIN_SIZE} train / ${VAL_SIZE} val" |
| echo " GPU: $(python -c 'import torch; print(torch.cuda.get_device_name(0))' 2>/dev/null || echo 'N/A')" |
| echo "" |
| echo " Ready to train!" |
| echo " Run: python scripts/train.py --phase 1" |
| echo "" |
| echo " Or dry run first:" |
| echo " Run: python scripts/train.py --dry_run --no_wandb" |
| echo "============================================================" |
|
|