#!/bin/bash
# SLURM batch script: train the PSALM multi-condition model on 4x A100-40G,
# then run Ego4D multi-condition evaluation on the resulting checkpoint.
# NOTE(review): --job-name says "FullJson" but the log/model paths say
# "SmallJson" — confirm which experiment this actually is.
#SBATCH --job-name=psalm_retrain_FullJson
#SBATCH --nodes=1                      # Request 1 node
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=a100-40g:4
#SBATCH --cpus-per-gpu=8               # Number of CPU cores (threads) per task
#SBATCH --mem-per-gpu=40G              # Memory limit per GPU (there is no --mem-per-task)
#SBATCH --time=96:00:00                # Job timeout
#SBATCH --output=OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2.log  # Redirect stdout to a log file
#SBATCH --nodelist=gcp-us-3

# Abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# NOTE(review): head_node and rdzv_port are computed but never used below —
# presumably intended for a torchrun rendezvous endpoint; confirm or remove.
head_node=$(hostname)
rdzv_port=$((30000 + SLURM_JOB_ID % 30000))

# Network interface for NCCL; check whether this should be eth0 (or ib0, …)
# on this cluster.
export NCCL_SOCKET_IFNAME=eth

# Print the GPU UUID -> index mapping for the job log (debugging aid).
echo "GPU UUIDs and corresponding indices:"
nvidia-smi --query-gpu=index,uuid --format=csv

# Build CUDA_VISIBLE_DEVICES as "0,1,...,N-1" with one entry per GPU that
# nvidia-smi reports. Only the GPU *count* matters here; the UUID values
# themselves are not used, the loop just iterates once per device.
gpu_uuids=$(nvidia-smi --query-gpu=uuid --format=csv,noheader)
index=0
visible_devices=""
for _uuid in $gpu_uuids; do
  if [ -z "$visible_devices" ]; then
    visible_devices="$index"
  else
    visible_devices="$visible_devices,$index"
  fi
  index=$((index + 1))
done
export CUDA_VISIBLE_DEVICES=$visible_devices

# Sanity-check the result.
echo "CUDA_VISIBLE_DEVICES set to: $CUDA_VISIBLE_DEVICES"

# One task per node: run training, and on success run the evaluation script
# against the freshly trained checkpoint.
srun --nodes "$SLURM_NNODES" --ntasks-per-node 1 -- \
  mkenv -f psalm.yml -- \
  sh -c "
bash ./scripts/train_SSL_MultiCondition.sh &&
python psalm/eval/eval_ego4d_MultiCondition.py --image_folder /data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap --model_path /data/work-gcp-europe-west4-a/yuqian_fu/Ego/OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2 --json_path /data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap/egoexo_val_framelevel_newprompt_all_instruction.json
"