+ echo '--------- Environment sanity check ---------' + echo 'shell: ./sft-tools.sh running under bash 5.0.17(1)-release' + echo 'conda env: pda' ++ which python + echo 'python: /home/panda/miniconda3/envs/pda/bin/python' + python - ++ command -v deepspeed + echo 'deepspeed: /home/panda/miniconda3/envs/pda/bin/deepspeed' + echo -------------------------------------------- + CUDA_VISIBLE_DEVICES=0,1 + deepspeed --master_port 29753 --module safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft false --max_length 4096 --trust_remote_code True --epochs 3 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-false-1-100-16-4096 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 100 --resilient_coeff 1 --lora_r 16 --lora_alpha 32 --lora_dropout 0.05 --gradient_checkpointing --bf16 True --fp16 False --tf32 False [rank1]:[W511 17:36:45.026182293 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. [rank0]:[W511 17:36:45.057218115 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. Loading checkpoint shards: 0%| | 0/4 [00:00