#!/bin/bash
#
# Slurm batch script: runs the multi-condition training script and, if it
# succeeds, the Ego4D multi-condition evaluation, inside the `psalm.yml`
# environment on a single node.
#
#SBATCH --job-name=psalm_retrain_FullJson
# Request 1 node
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=a100-40g:4
# CPU cores allocated per GPU
#SBATCH --cpus-per-gpu=8
# Host memory allocated per GPU
#SBATCH --mem-per-gpu=40G
# Job timeout
#SBATCH --time=96:00:00
# Redirect stdout to a log file
#SBATCH --output=OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2.log
#SBATCH --nodelist=gcp-us-3

# NOTE(review): head_node and rdzv_port are neither exported nor referenced in
# the visible part of this script, so child processes cannot see them.
# Export them or pass them explicitly if the training script needs them.
head_node=$(hostname)
# Derive a port in [30000, 59999] from the job id.
rdzv_port=$((30000 + SLURM_JOB_ID % 30000))

# Network interface used by NCCL; change "eth" if this cluster uses a
# different interface name.
export NCCL_SOCKET_IFNAME=eth

# Show the GPUs visible to this job (index and UUID).
echo "GPU UUIDs and corresponding indices:"
nvidia-smi --query-gpu=index,uuid --format=csv

# Count the visible GPUs (one UUID per output line) and expose them to CUDA by
# index: 0,1,...,N-1.
gpu_count=$(nvidia-smi --query-gpu=uuid --format=csv,noheader | grep -c .)
visible_devices=""
for ((i = 0; i < gpu_count; i++)); do
  visible_devices+="${visible_devices:+,}$i"
done

if [[ -n "$visible_devices" ]]; then
  export CUDA_VISIBLE_DEVICES=$visible_devices
else
  # An empty CUDA_VISIBLE_DEVICES would hide every GPU from the job, so keep
  # whatever value the environment already provides.
  echo "warning: nvidia-smi reported no GPUs; leaving CUDA_VISIBLE_DEVICES unchanged" >&2
fi

# Report the value the job will run with.
echo "CUDA_VISIBLE_DEVICES set to: ${CUDA_VISIBLE_DEVICES-}"

# Launch one task per node; evaluation runs only if training exits successfully.
srun --nodes "$SLURM_NNODES" --ntasks-per-node 1 -- \
  mkenv -f psalm.yml -- \
  sh -c "
bash ./scripts/train_SSL_MultiCondition.sh &&
python psalm/eval/eval_ego4d_MultiCondition.py --image_folder /data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap --model_path /data/work-gcp-europe-west4-a/yuqian_fu/Ego/OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2 --json_path /data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap/egoexo_val_framelevel_newprompt_all_instruction.json
"