File size: 2,076 Bytes
625a17f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
# Slurm batch script: runs scripts/train_SSL_MultiCondition.sh and then
# psalm/eval/eval_ego4d_MultiCondition.py on one node with 4 GPUs.
#SBATCH --job-name=psalm_retrain_FullJson
#SBATCH --nodes=1               # Request 1 node
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=a100-40g:4
#SBATCH --cpus-per-gpu=8       # Number of CPU cores allocated per GPU
#SBATCH --mem-per-gpu=40G        # Host memory allocated per GPU
#SBATCH --time=96:00:00         # Job timeout
#SBATCH --output=OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2.log      # Log file for stdout (stderr also lands here because no --error is given)
#SBATCH --nodelist=gcp-us-3


# Name of the host this batch script is executing on.
# NOTE(review): head_node and rdzv_port are not exported and are not referenced
# again in the visible part of this script — confirm whether they are needed.
head_node="$(hostname)"

# Rendezvous port derived from the job id; always lands in 30000-59999.
rdzv_port=$(( SLURM_JOB_ID % 30000 + 30000 ))

# Network interface NCCL should use for socket communication. Pick "eth" or
# another interface to match the environment (check whether this needs to be
# changed from eth0-style names to the correct interface on this cluster).
NCCL_SOCKET_IFNAME=eth
export NCCL_SOCKET_IFNAME

# Build CUDA_VISIBLE_DEVICES as the index list 0..N-1, where N is the number of
# GPUs nvidia-smi reports (one UUID per output line).
# NOTE(review): this replaces any CUDA_VISIBLE_DEVICES value already present in
# the job environment — confirm that is intended on this cluster.
gpu_uuids=""
if ! gpu_uuids=$(nvidia-smi --query-gpu=uuid --format=csv,noheader); then
    # A failed query may still print text; never count that text as GPUs.
    gpu_uuids=""
fi

# Log the index/UUID table so the job log records which GPUs were seen.
echo "GPU UUIDs and corresponding indices:"
nvidia-smi --query-gpu=index,uuid --format=csv

# One list entry per reported UUID line; the array keys are the 0-based indices.
visible_devices=""
if [[ -n "$gpu_uuids" ]]; then
    mapfile -t gpu_uuid_lines <<< "$gpu_uuids"
    for index in "${!gpu_uuid_lines[@]}"; do
        visible_devices+="${visible_devices:+,}${index}"
    done
fi

if [[ -n "$visible_devices" ]]; then
    export CUDA_VISIBLE_DEVICES="$visible_devices"
    # Confirm the value in the job log.
    echo "CUDA_VISIBLE_DEVICES set to: $CUDA_VISIBLE_DEVICES"
else
    # Exporting an empty list would hide every GPU from CUDA, so keep whatever
    # the environment already provides and make the problem visible instead.
    echo "WARNING: nvidia-smi reported no GPUs; CUDA_VISIBLE_DEVICES left unchanged: ${CUDA_VISIBLE_DEVICES-<unset>}" >&2
fi


# Shell snippet executed inside the environment: train first, then run the
# evaluation only if training exited successfully (&&).
job_script="
    bash ./scripts/train_SSL_MultiCondition.sh &&
    python psalm/eval/eval_ego4d_MultiCondition.py --image_folder /data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap --model_path /data/work-gcp-europe-west4-a/yuqian_fu/Ego/OursMultiCondition_EgoQuery_SmallJson_1102_CAwithoutResidual_1Head_TwoStageS2 --json_path /data/work-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap/egoexo_val_framelevel_newprompt_all_instruction.json
    "

# Launch one task per allocated node through srun. The command is kept in an
# array so it can span several lines without backslash continuations: a
# backslash followed by trailing whitespace does not continue the line, which
# would make srun and mkenv run as two unrelated commands.
# NOTE(review): mkenv is a site-provided tool; presumably it runs the command
# after "--" inside the environment described by psalm.yml — confirm.
launch_cmd=(
    srun --nodes "$SLURM_NNODES" --ntasks-per-node 1 --
    mkenv -f psalm.yml --
    sh -c "$job_script"
)
"${launch_cmd[@]}"