ObjectRelator-Original / TrainandTest_SSL_multicondition.sh
YuqianFu's picture
Upload folder using huggingface_hub
625a17f verified
#!/bin/bash
# Slurm batch script: launches scripts/train_SSL_MultiCondition.sh and, only if
# training succeeds, psalm/eval/eval_ego4d_MultiCondition.py (see the srun call
# at the end of this file).
# NOTE(review): the job name says "FullJson" while the log file name says
# "SmallJson" - confirm which one is intended.
#SBATCH --job-name=psalm_retrain_FullJson
#SBATCH --nodes=1 # Request 1 node
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=a100-40g:4
#SBATCH --cpus-per-gpu=8 # Number of CPU cores (threads) per allocated GPU
#SBATCH --mem-per-gpu=40G # Memory per allocated GPU (there is no --mem-per-task)
#SBATCH --time=96:00:00 # Job timeout
#SBATCH --output=OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2.log # Redirect stdout to a log file
#SBATCH --nodelist=gcp-us-3
# Hostname of the machine running this batch script, plus a per-job port in
# the range 30000-59999 derived from the Slurm job id.
# NOTE(review): neither variable is exported or referenced again in this file;
# confirm whether scripts/train_SSL_MultiCondition.sh is expected to see them.
head_node="$(hostname)"
rdzv_port=$(( SLURM_JOB_ID % 30000 + 30000 ))

# Network interface for NCCL (choose "eth" or another interface depending on
# your environment). Check whether this needs to be changed to the actual
# interface name, e.g. eth0.
NCCL_SOCKET_IFNAME=eth
export NCCL_SOCKET_IFNAME
#######################################
# Export CUDA_VISIBLE_DEVICES as the comma-joined list of the given indices.
# Globals:   CUDA_VISIBLE_DEVICES (written and exported)
# Arguments: zero or more GPU indices
# Outputs:   a warning on stderr when no index is given
# Returns:   0 after exporting; 1 when called without arguments, in which case
#            CUDA_VISIBLE_DEVICES is left untouched (an empty value would hide
#            every GPU from CUDA instead of keeping the pre-existing setting)
#######################################
set_visible_devices() {
  if (( $# == 0 )); then
    printf '%s\n' \
      'warning: nvidia-smi reported no GPUs; leaving CUDA_VISIBLE_DEVICES unchanged' >&2
    return 1
  fi
  # "$*" joins the arguments with the first character of IFS.
  local IFS=,
  export CUDA_VISIBLE_DEVICES="$*"
}

# Print every GPU's index and UUID so the mapping ends up in the job log.
echo "GPU UUIDs and corresponding indices:"
nvidia-smi --query-gpu=index,uuid --format=csv

# Collect one index per GPU that nvidia-smi can see and make exactly those
# devices visible to CUDA.
# NOTE(review): this overrides any CUDA_VISIBLE_DEVICES value set before this
# point with every GPU nvidia-smi lists - confirm that the job can only see
# its own GPUs on this cluster.
mapfile -t gpu_indices < <(nvidia-smi --query-gpu=index --format=csv,noheader)
set_visible_devices "${gpu_indices[@]}"

# Log the value that the training/evaluation processes will inherit.
echo "CUDA_VISIBLE_DEVICES set to: ${CUDA_VISIBLE_DEVICES-}"
# Inputs of the evaluation step that follows training.
# NOTE(review): the image folder is under work2-gcp-europe-west4-a while the
# model and JSON paths are under work-gcp-europe-west4-a - confirm this is
# intended.
eval_image_folder=/data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap
eval_model_path=/data/work-gcp-europe-west4-a/yuqian_fu/Ego/OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2
eval_json_path=/data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap/egoexo_val_framelevel_newprompt_all_instruction.json

# Command string handed to `sh -c`: train first, evaluate only when training
# exits successfully (&&). The paths are expanded here, before srun runs.
job_commands="
bash ./scripts/train_SSL_MultiCondition.sh &&
python psalm/eval/eval_ego4d_MultiCondition.py --image_folder $eval_image_folder --model_path $eval_model_path --json_path $eval_json_path
"

# One task per node across the allocated nodes; the command string is run
# through `mkenv -f psalm.yml`.
srun --nodes "$SLURM_NNODES" --ntasks-per-node 1 -- \
  mkenv -f psalm.yml -- \
  sh -c "$job_commands"