| /home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/cuda/__init__.py:829: UserWarning: Can't initialize NVML |
| warnings.warn("Can't initialize NVML") |
| /home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/cuda/__init__.py:1036: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.) |
| r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count |
| /home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/cuda/__init__.py:182: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.) |
| return torch._C._cuda_getDeviceCount() > 0 |
| /home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/cuda/__init__.py:182: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.) |
| return torch._C._cuda_getDeviceCount() > 0 |
| Traceback (most recent call last): |
| Traceback (most recent call last): |
| File "/data/home/yliu/sct-rlhf/MultiPref_PPO/train_ppo.py", line 106, in <module> |
| File "/data/home/yliu/sct-rlhf/MultiPref_PPO/train_ppo.py", line 106, in <module> |
| script_args, training_args, model_args = parser.parse_args_into_dataclasses() |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses |
| script_args, training_args, model_args = parser.parse_args_into_dataclasses() |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses |
| obj = dtype(**inputs) |
| File "<string>", line 165, in __init__ |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/trl/trainer/utils.py", line 873, in __post_init__ |
| obj = dtype(**inputs) |
| File "<string>", line 165, in __init__ |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/trl/trainer/utils.py", line 873, in __post_init__ |
| super().__post_init__() |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/transformers/training_args.py", line 1729, in __post_init__ |
| super().__post_init__() |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/transformers/training_args.py", line 1729, in __post_init__ |
| raise ValueError(error_message) |
| ValueError: Your setup doesn't support bf16/gpu. |
| raise ValueError(error_message) |
| ValueError: Your setup doesn't support bf16/gpu. |
| /home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/cuda/__init__.py:182: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:109.) |
| return torch._C._cuda_getDeviceCount() > 0 |
| Traceback (most recent call last): |
| File "/data/home/yliu/sct-rlhf/MultiPref_PPO/train_ppo.py", line 106, in <module> |
| script_args, training_args, model_args = parser.parse_args_into_dataclasses() |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses |
| obj = dtype(**inputs) |
| File "<string>", line 165, in __init__ |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/trl/trainer/utils.py", line 873, in __post_init__ |
| super().__post_init__() |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/transformers/training_args.py", line 1729, in __post_init__ |
| raise ValueError(error_message) |
| ValueError: Your setup doesn't support bf16/gpu. |
| E0826 22:40:47.705000 3041180 site-packages/torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 3041233) of binary: /home/yliu/miniconda3/envs/trl/bin/python3.10 |
| Traceback (most recent call last): |
| File "/home/yliu/miniconda3/envs/trl/bin/accelerate", line 8, in <module> |
| sys.exit(main()) |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 50, in main |
| args.func(args) |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/accelerate/commands/launch.py", line 1226, in launch_command |
| multi_gpu_launcher(args) |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/accelerate/commands/launch.py", line 853, in multi_gpu_launcher |
| distrib_run.run(args) |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run |
| elastic_launch( |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 143, in __call__ |
| return launch_agent(self._config, self._entrypoint, list(args)) |
| File "/home/yliu/miniconda3/envs/trl/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent |
| raise ChildFailedError( |
| torch.distributed.elastic.multiprocessing.errors.ChildFailedError: |
| ============================================================ |
| train_ppo.py FAILED |
| |
| Failures: |
| [1]: |
| time : 2025-08-26_22:40:47 |
| host : tree |
| rank : 1 (local_rank: 1) |
| exitcode : 1 (pid: 3041234) |
| error_file: <N/A> |
| traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html |
| [2]: |
| time : 2025-08-26_22:40:47 |
| host : tree |
| rank : 2 (local_rank: 2) |
| exitcode : 1 (pid: 3041235) |
| error_file: <N/A> |
| traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html |
| |
| Root Cause (first observed failure): |
| [0]: |
| time : 2025-08-26_22:40:47 |
| host : tree |
| rank : 0 (local_rank: 0) |
| exitcode : 1 (pid: 3041233) |
| error_file: <N/A> |
| traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html |
| ============================================================ |
|
|