diff --git "a/stderr.log" "b/stderr.log" new file mode 100644--- /dev/null +++ "b/stderr.log" @@ -0,0 +1,76 @@ ++ echo '--------- Environment sanity check ---------' ++ echo 'shell: ./sft-tools.sh running under bash 5.0.17(1)-release' ++ echo 'conda env: pda' +++ which python ++ echo 'python: /home/panda/miniconda3/envs/pda/bin/python' ++ python - +++ command -v deepspeed ++ echo 'deepspeed: /home/panda/miniconda3/envs/pda/bin/deepspeed' ++ echo -------------------------------------------- ++ CUDA_VISIBLE_DEVICES=0,1 ++ deepspeed --master_port 35777 --module safe_rlhf.algorithms.tools_ft --train_datasets tools --model_name_or_path meta-llama/Llama-3.1-8B-Instruct --cache_dir /home/panda/pda-llm/cache/sft-tools --important_sft false --max_length 2048 --trust_remote_code True --epochs 3 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --gradient_accumulation_steps 48 --gradient_checkpointing --learning_rate 1e-4 --lr_scheduler_type cosine --lr_warmup_ratio 0.1 --weight_decay 0.0 --seed 42 --output_dir /home/panda/pda-llm/output/sft-tools/run-false-1-10 --log_type wandb --log_project TOOLS-SFT --zero_stage 0 --offload none --safety_ratio_tol 10 --resilient_coeff 1 --lora_r 16 --lora_alpha 32 --lora_dropout 0.05 --bf16 True --fp16 False --tf32 False +[rank1]:[W508 20:43:10.558049870 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank0]:[W508 20:43:10.580350715 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. + Loading checkpoint shards: 0%| | 0/4 [00:00