#!/bin/bash # Single-GPU training to avoid NCCL inter-GPU communication issues on vast.ai # Uses 1x H100 with gradient accumulation to match effective batch size set -ex cd /root/arcisvlm export HF_TOKEN=$HF_TOKEN export CUDA_VISIBLE_DEVICES=0 # Clean corrupted checkpoints rm -f checkpoints/stage2_epoch*.pt checkpoints/stage2_final.pt checkpoints/stage3_*.pt echo "=== Stage 2: Single GPU Training ===" date nvidia-smi --query-gpu=name,memory.total --format=csv,noheader # Single GPU — no torchrun, no DDP # Use nproc=1 so DDP is trivial (single process group) torchrun --nproc_per_node=1 --master_port=29503 \ scripts/train_stage2_ddp.py \ --config configs/scale_1.3b.yaml \ --stage1_ckpt checkpoints/v3_stage1_final.pt STAGE2_EXIT=$? echo "Stage 2 exit code: $STAGE2_EXIT" date if [ $STAGE2_EXIT -ne 0 ]; then echo "!!! Stage 2 FAILED with exit code $STAGE2_EXIT !!!" echo "Checking for partial checkpoints..." ls -lh checkpoints/stage2_*.pt 2>/dev/null || echo "No stage2 checkpoints found" exit 1 fi echo "=== Pushing Stage 2 to HF ===" python3 scripts/push_to_hf.py stage2_final.pt v4_stage2_final.pt echo "=== Stage 3: Single GPU Training ===" date torchrun --nproc_per_node=1 --master_port=29503 \ scripts/train_stage3_ddp.py \ --config configs/scale_1.3b.yaml \ --stage2_ckpt checkpoints/stage2_final.pt STAGE3_EXIT=$? echo "Stage 3 exit code: $STAGE3_EXIT" date if [ $STAGE3_EXIT -ne 0 ]; then echo "!!! Stage 3 FAILED with exit code $STAGE3_EXIT !!!" ls -lh checkpoints/stage3_*.pt 2>/dev/null || echo "No stage3 checkpoints found" exit 1 fi echo "=== Pushing Stage 3 to HF ===" python3 scripts/push_to_hf.py stage3_final.pt v4_stage3_final.pt echo "=== ALL TRAINING COMPLETE ===" date echo "Checkpoints:" ls -lh checkpoints/