Gaie committed on
Commit
fbad69c
·
verified ·
1 Parent(s): e74e965

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. script.sh +155 -0
  2. stderr.log +520 -0
  3. stdout.log +54 -0
script.sh ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ if [ -z "${BASH_VERSION}" ]; then
19
+ echo "Please use bash to run this script." >&2
20
+ exit 1
21
+ fi
22
+
23
+ set -x
24
+
25
+ SCRIPT_DIR="$(cd "$(dirname "$0")" &>/dev/null && pwd)"
26
+ ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
27
+ export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
28
+ export LOGLEVEL="${LOGLEVEL:-WARNING}"
29
+
30
+
31
+
32
+ # -------------------need to change-------------
33
+ export LOGLEVEL="INFO"
34
+ export WANDB_API_KEY="0e77f7c02e33b86269ca2123964b9fefcf9c1a7a"
35
+ # MODEL_NAME_OR_PATH="/mnt/fl/models/reproduced/alpaca-7b-reproduced"
36
+ # OUTPUT_DIR="/mnt/fl/projects/unalignment/models/alpaca-reproduced-s3-Q1"
37
+ # DATASET="/mnt/fl/projects/unalignment/boyuan/workspace/inverse_alignment/ablation/setting2/dataset/training/safe/safe_train_20k.json"
38
+ # -------------------need to change------------
39
+
40
+ unset HOSTFILE
41
+ ZERO_STAGE=3
42
+ OFFLOAD="none"
43
+ LOG_RUN_NAME='setting3-default'
44
+ while [[ "$#" -gt 0 ]]; do
45
+ arg="$1"
46
+ shift
47
+ case "${arg}" in
48
+ --train_datasets)
49
+ DATASET="$1"
50
+ shift
51
+ ;;
52
+ --train_datasets=*)
53
+ DATASET="${arg#*=}"
54
+ ;;
55
+ --model_name_or_path)
56
+ MODEL_NAME_OR_PATH="$1"
57
+ shift
58
+ ;;
59
+ --model_name_or_path=*)
60
+ MODEL_NAME_OR_PATH="${arg#*=}"
61
+ ;;
62
+ --output_dir)
63
+ OUTPUT_DIR="$1"
64
+ shift
65
+ ;;
66
+ --output_dir=*)
67
+ OUTPUT_DIR="${arg#*=}"
68
+ ;;
69
+ --log_run_name)
70
+ LOG_RUN_NAME="$1"
71
+ shift
72
+ ;;
73
+ --log_run_name=*)
74
+ LOG_RUN_NAME="${arg#*=}"
75
+ ;;
76
+ --hostfile)
77
+ HOSTFILE="$1"
78
+ shift
79
+ ;;
80
+ --hostfile=*)
81
+ HOSTFILE="${arg#*=}"
82
+ ;;
83
+ --zero_stage)
84
+ ZERO_STAGE="$1"
85
+ shift
86
+ ;;
87
+ --zero_stage=*)
88
+ ZERO_STAGE="${arg#*=}"
89
+ ;;
90
+ --offload)
91
+ OFFLOAD="$1"
92
+ shift
93
+ ;;
94
+ --offload=*)
95
+ OFFLOAD="${arg#*=}"
96
+ ;;
97
+ *)
98
+ echo "Unknown parameter passed: '${arg}'" >&2
99
+ exit 1
100
+ ;;
101
+ esac
102
+ done
103
+
104
+ mkdir -p "${OUTPUT_DIR}"
105
+ OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
106
+ if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
107
+ echo '*' >"${OUTPUT_DIR}/.gitignore"
108
+ fi
109
+
110
+ cp -f "$0" "${OUTPUT_DIR}/script.sh"
111
+
112
+
113
+ MASTER_PORT_START=10000
114
+ MASTER_PORT_END=65535
115
+ MASTER_PORT="$(
116
+ comm -23 \
117
+ <(seq "${MASTER_PORT_START}" "${MASTER_PORT_END}" | sort) \
118
+ <(ss -Htan | awk '{ print $4 }' | awk -F ':' '{ print $NF }' | sort -u) |
119
+ shuf | head -n 1
120
+ )"
121
+
122
+ DEEPSPEED_ARGS=()
123
+ if [[ -n "${HOSTFILE+x}" ]]; then
124
+ DEEPSPEED_ARGS+=("--hostfile" "${HOSTFILE}")
125
+ fi
126
+ DEEPSPEED_ARGS+=("--master_port" "${MASTER_PORT}")
127
+
128
+ exec 1> >(tee "${OUTPUT_DIR}/stdout.log" >&1) 2> >(tee "${OUTPUT_DIR}/stderr.log" >&2)
129
+
130
+ deepspeed "${DEEPSPEED_ARGS[@]}" \
131
+ --module safe_rlhf.finetune \
132
+ --train_datasets inverse-json::${DATASET} \
133
+ --model_name_or_path "${MODEL_NAME_OR_PATH}" \
134
+ --max_length 512 \
135
+ --trust_remote_code True \
136
+ --epochs 1 \
137
+ --per_device_train_batch_size 1 \
138
+ --per_device_eval_batch_size 4 \
139
+ --gradient_accumulation_steps 8 \
140
+ --gradient_checkpointing \
141
+ --learning_rate 1e-5 \
142
+ --lr_warmup_ratio 0 \
143
+ --weight_decay 0.0 \
144
+ --lr_scheduler_type constant \
145
+ --weight_decay 0.0 \
146
+ --seed 42 \
147
+ --output_dir "${OUTPUT_DIR}" \
148
+ --log_type wandb \
149
+ --log_run_name "${LOG_RUN_NAME}" \
150
+ --log_project Inverse_Alignment_IMDb \
151
+ --zero_stage "${ZERO_STAGE}" \
152
+ --offload "${OFFLOAD}" \
153
+ --bf16 True \
154
+ --tf32 True \
155
+ --save_16bit
stderr.log ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ + deepspeed --master_port 42100 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000 --log_type wandb --log_run_name imdb-tinyllama-2T-s3-Q1-2000-Q2-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
2
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
3
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
4
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
5
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
6
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
7
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
8
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
9
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
10
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
11
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
12
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
13
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
14
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
15
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
16
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
17
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
18
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
19
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
20
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
21
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
22
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
23
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
24
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
25
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
26
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
27
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
28
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
29
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
30
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
31
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
32
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
33
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
34
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
35
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
36
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
37
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
38
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
39
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
40
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
41
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
42
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
43
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
44
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
45
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
46
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
47
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
48
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
49
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
50
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
51
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
52
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
53
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
54
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
55
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
56
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
57
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
58
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
59
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
60
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
61
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
62
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
63
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
64
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
65
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
66
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
67
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
68
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
69
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
70
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
71
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
72
+ [rank0]:[W527 16:15:55.420974946 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
73
+ [rank7]:[W527 16:15:55.425584163 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
74
+ [rank6]:[W527 16:15:55.425592827 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
75
+ [rank3]:[W527 16:15:55.427664756 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
76
+ [rank5]:[W527 16:15:55.436563268 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
77
+ [rank4]:[W527 16:15:55.466237376 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
78
+ [rank2]:[W527 16:15:55.491583686 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
79
+ [rank1]:[W527 16:15:55.492612717 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
80
+ Model config LlamaConfig {
81
+ "attention_bias": false,
82
+ "attention_dropout": 0.0,
83
+ "bos_token_id": 1,
84
+ "eos_token_id": 2,
85
+ "head_dim": 128,
86
+ "hidden_act": "silu",
87
+ "hidden_size": 4096,
88
+ "initializer_range": 0.02,
89
+ "intermediate_size": 11008,
90
+ "max_position_embeddings": 2048,
91
+ "mlp_bias": false,
92
+ "model_type": "llama",
93
+ "num_attention_heads": 32,
94
+ "num_hidden_layers": 32,
95
+ "num_key_value_heads": 32,
96
+ "pretraining_tp": 1,
97
+ "rms_norm_eps": 1e-06,
98
+ "rope_scaling": null,
99
+ "rope_theta": 10000.0,
100
+ "tie_word_embeddings": false,
101
+ "transformers_version": "4.52.1",
102
+ "use_cache": true,
103
+ "vocab_size": 32000
104
+ }
105
+
106
+ Model config LlamaConfig {
107
+ "attention_bias": false,
108
+ "attention_dropout": 0.0,
109
+ "bos_token_id": 1,
110
+ "eos_token_id": 2,
111
+ "head_dim": 128,
112
+ "hidden_act": "silu",
113
+ "hidden_size": 4096,
114
+ "initializer_range": 0.02,
115
+ "intermediate_size": 11008,
116
+ "max_position_embeddings": 2048,
117
+ "mlp_bias": false,
118
+ "model_type": "llama",
119
+ "num_attention_heads": 32,
120
+ "num_hidden_layers": 32,
121
+ "num_key_value_heads": 32,
122
+ "pretraining_tp": 1,
123
+ "rms_norm_eps": 1e-06,
124
+ "rope_scaling": null,
125
+ "rope_theta": 10000.0,
126
+ "tie_word_embeddings": false,
127
+ "transformers_version": "4.52.1",
128
+ "use_cache": true,
129
+ "vocab_size": 32000
130
+ }
131
+
132
+ Model config LlamaConfig {
133
+ "attention_bias": false,
134
+ "attention_dropout": 0.0,
135
+ "bos_token_id": 1,
136
+ "eos_token_id": 2,
137
+ "head_dim": 128,
138
+ "hidden_act": "silu",
139
+ "hidden_size": 4096,
140
+ "initializer_range": 0.02,
141
+ "intermediate_size": 11008,
142
+ "max_position_embeddings": 2048,
143
+ "mlp_bias": false,
144
+ "model_type": "llama",
145
+ "num_attention_heads": 32,
146
+ "num_hidden_layers": 32,
147
+ "num_key_value_heads": 32,
148
+ "pretraining_tp": 1,
149
+ "rms_norm_eps": 1e-06,
150
+ "rope_scaling": null,
151
+ "rope_theta": 10000.0,
152
+ "tie_word_embeddings": false,
153
+ "transformers_version": "4.52.1",
154
+ "use_cache": true,
155
+ "vocab_size": 32000
156
+ }
157
+
158
+ Model config LlamaConfig {
159
+ "attention_bias": false,
160
+ "attention_dropout": 0.0,
161
+ "bos_token_id": 1,
162
+ "eos_token_id": 2,
163
+ "head_dim": 128,
164
+ "hidden_act": "silu",
165
+ "hidden_size": 4096,
166
+ "initializer_range": 0.02,
167
+ "intermediate_size": 11008,
168
+ "max_position_embeddings": 2048,
169
+ "mlp_bias": false,
170
+ "model_type": "llama",
171
+ "num_attention_heads": 32,
172
+ "num_hidden_layers": 32,
173
+ "num_key_value_heads": 32,
174
+ "pretraining_tp": 1,
175
+ "rms_norm_eps": 1e-06,
176
+ "rope_scaling": null,
177
+ "rope_theta": 10000.0,
178
+ "tie_word_embeddings": false,
179
+ "transformers_version": "4.52.1",
180
+ "use_cache": true,
181
+ "vocab_size": 32000
182
+ }
183
+
184
+ Model config LlamaConfig {
185
+ "attention_bias": false,
186
+ "attention_dropout": 0.0,
187
+ "bos_token_id": 1,
188
+ "eos_token_id": 2,
189
+ "head_dim": 128,
190
+ "hidden_act": "silu",
191
+ "hidden_size": 4096,
192
+ "initializer_range": 0.02,
193
+ "intermediate_size": 11008,
194
+ "max_position_embeddings": 2048,
195
+ "mlp_bias": false,
196
+ "model_type": "llama",
197
+ "num_attention_heads": 32,
198
+ "num_hidden_layers": 32,
199
+ "num_key_value_heads": 32,
200
+ "pretraining_tp": 1,
201
+ "rms_norm_eps": 1e-06,
202
+ "rope_scaling": null,
203
+ "rope_theta": 10000.0,
204
+ "tie_word_embeddings": false,
205
+ "transformers_version": "4.52.1",
206
+ "use_cache": true,
207
+ "vocab_size": 32000
208
+ }
209
+
210
+ Model config LlamaConfig {
211
+ "attention_bias": false,
212
+ "attention_dropout": 0.0,
213
+ "bos_token_id": 1,
214
+ "eos_token_id": 2,
215
+ "head_dim": 128,
216
+ "hidden_act": "silu",
217
+ "hidden_size": 4096,
218
+ "initializer_range": 0.02,
219
+ "intermediate_size": 11008,
220
+ "max_position_embeddings": 2048,
221
+ "mlp_bias": false,
222
+ "model_type": "llama",
223
+ "num_attention_heads": 32,
224
+ "num_hidden_layers": 32,
225
+ "num_key_value_heads": 32,
226
+ "pretraining_tp": 1,
227
+ "rms_norm_eps": 1e-06,
228
+ "rope_scaling": null,
229
+ "rope_theta": 10000.0,
230
+ "tie_word_embeddings": false,
231
+ "transformers_version": "4.52.1",
232
+ "use_cache": true,
233
+ "vocab_size": 32000
234
+ }
235
+
236
+ Model config LlamaConfig {
237
+ "attention_bias": false,
238
+ "attention_dropout": 0.0,
239
+ "bos_token_id": 1,
240
+ "eos_token_id": 2,
241
+ "head_dim": 128,
242
+ "hidden_act": "silu",
243
+ "hidden_size": 4096,
244
+ "initializer_range": 0.02,
245
+ "intermediate_size": 11008,
246
+ "max_position_embeddings": 2048,
247
+ "mlp_bias": false,
248
+ "model_type": "llama",
249
+ "num_attention_heads": 32,
250
+ "num_hidden_layers": 32,
251
+ "num_key_value_heads": 32,
252
+ "pretraining_tp": 1,
253
+ "rms_norm_eps": 1e-06,
254
+ "rope_scaling": null,
255
+ "rope_theta": 10000.0,
256
+ "tie_word_embeddings": false,
257
+ "transformers_version": "4.52.1",
258
+ "use_cache": true,
259
+ "vocab_size": 32000
260
+ }
261
+
262
+ Model config LlamaConfig {
263
+ "attention_bias": false,
264
+ "attention_dropout": 0.0,
265
+ "bos_token_id": 1,
266
+ "eos_token_id": 2,
267
+ "head_dim": 128,
268
+ "hidden_act": "silu",
269
+ "hidden_size": 4096,
270
+ "initializer_range": 0.02,
271
+ "intermediate_size": 11008,
272
+ "max_position_embeddings": 2048,
273
+ "mlp_bias": false,
274
+ "model_type": "llama",
275
+ "num_attention_heads": 32,
276
+ "num_hidden_layers": 32,
277
+ "num_key_value_heads": 32,
278
+ "pretraining_tp": 1,
279
+ "rms_norm_eps": 1e-06,
280
+ "rope_scaling": null,
281
+ "rope_theta": 10000.0,
282
+ "tie_word_embeddings": false,
283
+ "transformers_version": "4.52.1",
284
+ "use_cache": true,
285
+ "vocab_size": 32000
286
+ }
287
+
288
+ [rank0]: Traceback (most recent call last):
289
+ [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
290
+ [rank0]: File "<frozen runpy>", line 88, in _run_code
291
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
292
+ [rank0]: sys.exit(main())
293
+ [rank0]: ^^^^^^
294
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
295
+ [rank0]: trainer = SupervisedFinetuneTrainer(args, ds_config)
296
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
297
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
298
+ [rank0]: self.init_models()
299
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
300
+ [rank0]: self.model, self.tokenizer = load_pretrained_models(
301
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
302
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
303
+ [rank0]: model = auto_model_type.from_pretrained(
304
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
305
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
306
+ [rank0]: return model_class.from_pretrained(
307
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
308
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
309
+ [rank0]: return func(*args, **kwargs)
310
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
311
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
312
+ [rank0]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
313
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
314
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
315
+ [rank0]: raise EnvironmentError(
316
+ [rank0]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
317
+ [rank5]: Traceback (most recent call last):
318
+ [rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
319
+ [rank5]: File "<frozen runpy>", line 88, in _run_code
320
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
321
+ [rank5]: sys.exit(main())
322
+ [rank5]: ^^^^^^
323
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
324
+ [rank5]: trainer = SupervisedFinetuneTrainer(args, ds_config)
325
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
326
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
327
+ [rank5]: self.init_models()
328
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
329
+ [rank5]: self.model, self.tokenizer = load_pretrained_models(
330
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^
331
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
332
+ [rank5]: model = auto_model_type.from_pretrained(
333
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
334
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
335
+ [rank5]: return model_class.from_pretrained(
336
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
337
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
338
+ [rank5]: return func(*args, **kwargs)
339
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
340
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
341
+ [rank5]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
342
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
343
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
344
+ [rank5]: raise EnvironmentError(
345
+ [rank5]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
346
+ [rank2]: Traceback (most recent call last):
347
+ [rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
348
+ [rank2]: File "<frozen runpy>", line 88, in _run_code
349
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
350
+ [rank2]: sys.exit(main())
351
+ [rank2]: ^^^^^^
352
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
353
+ [rank2]: trainer = SupervisedFinetuneTrainer(args, ds_config)
354
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
356
+ [rank2]: self.init_models()
357
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
358
+ [rank2]: self.model, self.tokenizer = load_pretrained_models(
359
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^
360
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
361
+ [rank2]: model = auto_model_type.from_pretrained(
362
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
363
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
364
+ [rank2]: return model_class.from_pretrained(
365
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
366
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
367
+ [rank2]: return func(*args, **kwargs)
368
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
369
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
370
+ [rank2]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
371
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
372
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
373
+ [rank2]: raise EnvironmentError(
374
+ [rank2]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
375
+ [rank3]: Traceback (most recent call last):
376
+ [rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
377
+ [rank3]: File "<frozen runpy>", line 88, in _run_code
378
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
379
+ [rank3]: sys.exit(main())
380
+ [rank3]: ^^^^^^
381
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
382
+ [rank3]: trainer = SupervisedFinetuneTrainer(args, ds_config)
383
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
384
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
385
+ [rank3]: self.init_models()
386
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
387
+ [rank3]: self.model, self.tokenizer = load_pretrained_models(
388
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^
389
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
390
+ [rank3]: model = auto_model_type.from_pretrained(
391
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
392
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
393
+ [rank3]: return model_class.from_pretrained(
394
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
395
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
396
+ [rank3]: return func(*args, **kwargs)
397
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^
398
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
399
+ [rank3]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
400
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
401
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
402
+ [rank3]: raise EnvironmentError(
403
+ [rank3]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
404
+ [rank6]: Traceback (most recent call last):
405
+ [rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
406
+ [rank6]: File "<frozen runpy>", line 88, in _run_code
407
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
408
+ [rank6]: sys.exit(main())
409
+ [rank6]: ^^^^^^
410
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
411
+ [rank6]: trainer = SupervisedFinetuneTrainer(args, ds_config)
412
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
413
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
414
+ [rank6]: self.init_models()
415
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
416
+ [rank6]: self.model, self.tokenizer = load_pretrained_models(
417
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^
418
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
419
+ [rank6]: model = auto_model_type.from_pretrained(
420
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
421
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
422
+ [rank6]: return model_class.from_pretrained(
423
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
424
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
425
+ [rank6]: return func(*args, **kwargs)
426
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^
427
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
428
+ [rank6]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
429
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
430
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
431
+ [rank6]: raise EnvironmentError(
432
+ [rank6]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
433
+ [rank7]: Traceback (most recent call last):
434
+ [rank7]: File "<frozen runpy>", line 198, in _run_module_as_main
435
+ [rank7]: File "<frozen runpy>", line 88, in _run_code
436
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
437
+ [rank7]: sys.exit(main())
438
+ [rank7]: ^^^^^^
439
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
440
+ [rank7]: trainer = SupervisedFinetuneTrainer(args, ds_config)
441
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
442
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
443
+ [rank7]: self.init_models()
444
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
445
+ [rank7]: self.model, self.tokenizer = load_pretrained_models(
446
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^
447
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
448
+ [rank7]: model = auto_model_type.from_pretrained(
449
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
450
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
451
+ [rank7]: return model_class.from_pretrained(
452
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
453
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
454
+ [rank7]: return func(*args, **kwargs)
455
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
456
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
457
+ [rank7]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
458
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
459
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
460
+ [rank7]: raise EnvironmentError(
461
+ [rank7]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
462
+ [rank1]: Traceback (most recent call last):
463
+ [rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
464
+ [rank1]: File "<frozen runpy>", line 88, in _run_code
465
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
466
+ [rank1]: sys.exit(main())
467
+ [rank1]: ^^^^^^
468
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
469
+ [rank1]: trainer = SupervisedFinetuneTrainer(args, ds_config)
470
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
471
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
472
+ [rank1]: self.init_models()
473
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
474
+ [rank1]: self.model, self.tokenizer = load_pretrained_models(
475
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
476
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
477
+ [rank1]: model = auto_model_type.from_pretrained(
478
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
479
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
480
+ [rank1]: return model_class.from_pretrained(
481
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
482
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
483
+ [rank1]: return func(*args, **kwargs)
484
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
485
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
486
+ [rank1]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
487
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
488
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
489
+ [rank1]: raise EnvironmentError(
490
+ [rank1]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
491
+ [rank4]: Traceback (most recent call last):
492
+ [rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
493
+ [rank4]: File "<frozen runpy>", line 88, in _run_code
494
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
495
+ [rank4]: sys.exit(main())
496
+ [rank4]: ^^^^^^
497
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
498
+ [rank4]: trainer = SupervisedFinetuneTrainer(args, ds_config)
499
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
500
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
501
+ [rank4]: self.init_models()
502
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
503
+ [rank4]: self.model, self.tokenizer = load_pretrained_models(
504
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^
505
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
506
+ [rank4]: model = auto_model_type.from_pretrained(
507
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
508
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
509
+ [rank4]: return model_class.from_pretrained(
510
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
511
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
512
+ [rank4]: return func(*args, **kwargs)
513
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
514
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
515
+ [rank4]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
516
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
517
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
518
+ [rank4]: raise EnvironmentError(
519
+ [rank4]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
520
+ [rank0]:[W527 16:16:05.508639919 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
stdout.log ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-05-27 16:15:27,103] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
3
+ [2025-05-27 16:15:32,333] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
4
+ [2025-05-27 16:15:32,334] [INFO] [runner.py:605:main] cmd = /aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=42100 --module --enable_each_rank_log=None safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000 --log_type wandb --log_run_name imdb-tinyllama-2T-s3-Q1-2000-Q2-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
5
+ [2025-05-27 16:15:34,470] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
6
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
7
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
8
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0
9
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
10
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:164:main] dist_world_size=8
11
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
12
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:256:main] process 2135438 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=0', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
13
+ [2025-05-27 16:15:39,656] [INFO] [launch.py:256:main] process 2135439 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=1', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
14
+ [2025-05-27 16:15:39,657] [INFO] [launch.py:256:main] process 2135440 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=2', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
15
+ [2025-05-27 16:15:39,657] [INFO] [launch.py:256:main] process 2135441 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=3', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
16
+ [2025-05-27 16:15:39,658] [INFO] [launch.py:256:main] process 2135442 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=4', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
17
+ [2025-05-27 16:15:39,658] [INFO] [launch.py:256:main] process 2135443 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=5', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
18
+ [2025-05-27 16:15:39,659] [INFO] [launch.py:256:main] process 2135444 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=6', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
19
+ [2025-05-27 16:15:39,660] [INFO] [launch.py:256:main] process 2135445 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=7', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
20
+ [2025-05-27 16:15:44,310] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
21
+ [2025-05-27 16:15:44,442] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
22
+ [2025-05-27 16:15:44,454] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
23
+ [2025-05-27 16:15:44,458] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
25
+ [2025-05-27 16:15:44,684] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
26
+ [2025-05-27 16:15:44,698] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
27
+ [2025-05-27 16:15:44,701] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
28
+ [2025-05-27 16:15:44,708] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
29
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
30
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
31
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
32
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
33
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
34
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
35
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
36
+ [2025-05-27 16:15:52,159] [INFO] [comm.py:669:init_distributed] cdb=None
37
+ [2025-05-27 16:15:52,265] [INFO] [comm.py:669:init_distributed] cdb=None
38
+ [2025-05-27 16:15:52,497] [INFO] [comm.py:669:init_distributed] cdb=None
39
+ [2025-05-27 16:15:52,497] [INFO] [comm.py:669:init_distributed] cdb=None
40
+ [2025-05-27 16:15:52,497] [INFO] [comm.py:669:init_distributed] cdb=None
41
+ [2025-05-27 16:15:52,539] [INFO] [comm.py:669:init_distributed] cdb=None
42
+ [2025-05-27 16:15:52,546] [INFO] [comm.py:669:init_distributed] cdb=None
43
+ [2025-05-27 16:15:52,546] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
44
+ [2025-05-27 16:15:52,614] [INFO] [comm.py:669:init_distributed] cdb=None
45
+ Set logger level to INFO.
46
+ [2025-05-27 16:16:07,663] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135438
47
+ [2025-05-27 16:16:07,716] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135439
48
+ [2025-05-27 16:16:08,388] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135440
49
+ [2025-05-27 16:16:08,429] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135441
50
+ [2025-05-27 16:16:08,466] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135442
51
+ [2025-05-27 16:16:08,496] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135443
52
+ [2025-05-27 16:16:08,496] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135444
53
+ [2025-05-27 16:16:08,739] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135445
54
+ [2025-05-27 16:16:08,775] [ERROR] [launch.py:325:sigkill_handler] ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=7', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit'] exits with return code = 1