lingbot-vla / train.sh
bazaar-research's picture
Upload folder using huggingface_hub
fb11af9 verified
#!/bin/bash
# Trace each command as it runs so the resolved launch settings are visible.
set -o xtrace
# Make TOKENIZERS_PARALLELISM=false visible to every child process.
# NOTE(review): presumably consumed by the Hugging Face tokenizers library
# inside the training workers — confirm against the training code.
TOKENIZERS_PARALLELISM=false
export TOKENIZERS_PARALLELISM
#######################################
# Print the number of worker processes to start on this node.
#
# Resolution order:
#   1. A non-empty NPROC_PER_NODE already present in the environment wins,
#      so callers can override auto-detection.
#   2. Otherwise, one process per comma-separated entry in
#      CUDA_VISIBLE_DEVICES.
#   3. Otherwise, one process per line printed by `nvidia-smi -L`.
# Globals:
#   NPROC_PER_NODE (read), CUDA_VISIBLE_DEVICES (read)
# Outputs:
#   The resolved count on stdout.
#######################################
detect_nproc_per_node() {
  if [[ -n "${NPROC_PER_NODE:-}" ]]; then
    printf '%s\n' "$NPROC_PER_NODE"
    return
  fi

  if [[ -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    nvidia-smi -L | wc -l
    return
  fi

  # Number of visible GPUs: strip everything except the commas, then the
  # entry count is (number of commas + 1). Done with parameter expansion so
  # the value is never word-split or glob-expanded.
  local commas=${CUDA_VISIBLE_DEVICES//[^,]/}
  printf '%s\n' "$(( ${#commas} + 1 ))"
}

NPROC_PER_NODE=$(detect_nproc_per_node)
echo "Using NPROC_PER_NODE=$NPROC_PER_NODE GPUs"
# Rendezvous settings: keep any non-empty value already present, otherwise
# fall back to the defaults below (one node, rank 0, port 62500).
MASTER_ADDR=${MASTER_ADDR:-0.0.0.0}
MASTER_PORT=${MASTER_PORT:-62500}
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
# NPROC_PER_NODE has no literal fallback here: it keeps whatever value it
# already holds (and becomes an empty string if it was unset).
NPROC_PER_NODE=${NPROC_PER_NODE:-}
#######################################
# Start torchrun with the resolved distributed settings and mirror its
# combined stdout/stderr to the console and to log.txt.
# Globals:
#   NNODES, NPROC_PER_NODE, NODE_RANK, MASTER_ADDR, MASTER_PORT (read)
# Arguments:
#   Everything passed to this script; forwarded to torchrun unchanged.
# Outputs:
#   torchrun's output on stdout; a copy is written to ./log.txt.
# Returns:
#   torchrun's exit status (not tee's).
#######################################
launch_training() {
  # "$@" is quoted so arguments containing spaces or glob characters reach
  # torchrun exactly as the caller wrote them.
  torchrun --nnodes="$NNODES" --nproc-per-node "$NPROC_PER_NODE" \
    --node-rank "$NODE_RANK" \
    --master-addr="$MASTER_ADDR" --master-port="$MASTER_PORT" "$@" 2>&1 \
    | tee log.txt
  # A pipeline normally reports the status of its last stage (tee), which
  # would hide a failed run; report the first stage's status instead.
  return "${PIPESTATUS[0]}"
}

# Last command of the script, so its status becomes the script's exit status.
launch_training "$@"