Gaie committed on
Commit
fbad69c
·
verified ·
1 Parent(s): e74e965

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. script.sh +155 -0
  2. stderr.log +520 -0
  3. stdout.log +54 -0
script.sh ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ if [ -z "${BASH_VERSION}" ]; then
19
+ echo "Please use bash to run this script." >&2
20
+ exit 1
21
+ fi
22
+
23
+ set -x
24
+
25
+ SCRIPT_DIR="$(cd "$(dirname "$0")" &>/dev/null && pwd)"
26
+ ROOT_DIR="$(dirname "${SCRIPT_DIR}")"
27
+ export PYTHONPATH="${ROOT_DIR}${PYTHONPATH:+:${PYTHONPATH}}"
28
+ export LOGLEVEL="${LOGLEVEL:-WARNING}"
29
+
30
+
31
+
32
+ # -------------------need to change-------------
33
+ export LOGLEVEL="INFO"
34
+ export WANDB_API_KEY="0e77f7c02e33b86269ca2123964b9fefcf9c1a7a"
35
+ # MODEL_NAME_OR_PATH="/mnt/fl/models/reproduced/alpaca-7b-reproduced"
36
+ # OUTPUT_DIR="/mnt/fl/projects/unalignment/models/alpaca-reproduced-s3-Q1"
37
+ # DATASET="/mnt/fl/projects/unalignment/boyuan/workspace/inverse_alignment/ablation/setting2/dataset/training/safe/safe_train_20k.json"
38
+ # -------------------need to change------------
39
+
40
+ unset HOSTFILE
41
+ ZERO_STAGE=3
42
+ OFFLOAD="none"
43
+ LOG_RUN_NAME='setting3-default'
44
+ while [[ "$#" -gt 0 ]]; do
45
+ arg="$1"
46
+ shift
47
+ case "${arg}" in
48
+ --train_datasets)
49
+ DATASET="$1"
50
+ shift
51
+ ;;
52
+ --train_datasets=*)
53
+ DATASET="${arg#*=}"
54
+ ;;
55
+ --model_name_or_path)
56
+ MODEL_NAME_OR_PATH="$1"
57
+ shift
58
+ ;;
59
+ --model_name_or_path=*)
60
+ MODEL_NAME_OR_PATH="${arg#*=}"
61
+ ;;
62
+ --output_dir)
63
+ OUTPUT_DIR="$1"
64
+ shift
65
+ ;;
66
+ --output_dir=*)
67
+ OUTPUT_DIR="${arg#*=}"
68
+ ;;
69
+ --log_run_name)
70
+ LOG_RUN_NAME="$1"
71
+ shift
72
+ ;;
73
+ --log_run_name=*)
74
+ LOG_RUN_NAME="${arg#*=}"
75
+ ;;
76
+ --hostfile)
77
+ HOSTFILE="$1"
78
+ shift
79
+ ;;
80
+ --hostfile=*)
81
+ HOSTFILE="${arg#*=}"
82
+ ;;
83
+ --zero_stage)
84
+ ZERO_STAGE="$1"
85
+ shift
86
+ ;;
87
+ --zero_stage=*)
88
+ ZERO_STAGE="${arg#*=}"
89
+ ;;
90
+ --offload)
91
+ OFFLOAD="$1"
92
+ shift
93
+ ;;
94
+ --offload=*)
95
+ OFFLOAD="${arg#*=}"
96
+ ;;
97
+ *)
98
+ echo "Unknown parameter passed: '${arg}'" >&2
99
+ exit 1
100
+ ;;
101
+ esac
102
+ done
103
+
104
+ mkdir -p "${OUTPUT_DIR}"
105
+ OUTPUT_DIR="$(cd "${OUTPUT_DIR}" &>/dev/null && pwd)"
106
+ if [[ ! -f "${OUTPUT_DIR}/.gitignore" ]]; then
107
+ echo '*' >"${OUTPUT_DIR}/.gitignore"
108
+ fi
109
+
110
+ cp -f "$0" "${OUTPUT_DIR}/script.sh"
111
+
112
+
113
+ MASTER_PORT_START=10000
114
+ MASTER_PORT_END=65535
115
+ MASTER_PORT="$(
116
+ comm -23 \
117
+ <(seq "${MASTER_PORT_START}" "${MASTER_PORT_END}" | sort) \
118
+ <(ss -Htan | awk '{ print $4 }' | awk -F ':' '{ print $NF }' | sort -u) |
119
+ shuf | head -n 1
120
+ )"
121
+
122
+ DEEPSPEED_ARGS=()
123
+ if [[ -n "${HOSTFILE+x}" ]]; then
124
+ DEEPSPEED_ARGS+=("--hostfile" "${HOSTFILE}")
125
+ fi
126
+ DEEPSPEED_ARGS+=("--master_port" "${MASTER_PORT}")
127
+
128
+ exec 1> >(tee "${OUTPUT_DIR}/stdout.log" >&1) 2> >(tee "${OUTPUT_DIR}/stderr.log" >&2)
129
+
130
+ deepspeed "${DEEPSPEED_ARGS[@]}" \
131
+ --module safe_rlhf.finetune \
132
+ --train_datasets inverse-json::${DATASET} \
133
+ --model_name_or_path "${MODEL_NAME_OR_PATH}" \
134
+ --max_length 512 \
135
+ --trust_remote_code True \
136
+ --epochs 1 \
137
+ --per_device_train_batch_size 1 \
138
+ --per_device_eval_batch_size 4 \
139
+ --gradient_accumulation_steps 8 \
140
+ --gradient_checkpointing \
141
+ --learning_rate 1e-5 \
142
+ --lr_warmup_ratio 0 \
143
+ --weight_decay 0.0 \
144
+ --lr_scheduler_type constant \
145
+ --weight_decay 0.0 \
146
+ --seed 42 \
147
+ --output_dir "${OUTPUT_DIR}" \
148
+ --log_type wandb \
149
+ --log_run_name "${LOG_RUN_NAME}" \
150
+ --log_project Inverse_Alignment_IMDb \
151
+ --zero_stage "${ZERO_STAGE}" \
152
+ --offload "${OFFLOAD}" \
153
+ --bf16 True \
154
+ --tf32 True \
155
+ --save_16bit
stderr.log ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ + deepspeed --master_port 42100 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000 --log_type wandb --log_run_name imdb-tinyllama-2T-s3-Q1-2000-Q2-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
2
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
3
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
4
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
5
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
6
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
7
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
8
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
9
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
10
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
11
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
12
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
13
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
14
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
15
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
16
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
17
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
18
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
19
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
20
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
21
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
22
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
23
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
24
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
25
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
26
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
27
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
28
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
29
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
30
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
31
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
32
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
33
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
34
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
35
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
36
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
37
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
38
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
39
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
40
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
41
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
42
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
43
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
44
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
45
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
46
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
47
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
48
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
49
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
50
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
51
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
52
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
53
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
54
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
55
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
56
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
57
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
58
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
59
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
60
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
61
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
62
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
63
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
64
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
65
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
66
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
67
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
68
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
69
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
70
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
71
+ nvcc warning : incompatible redefinition for option 'compiler-bindir', the last value of this option was used
72
+ [rank0]:[W527 16:15:55.420974946 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
73
+ [rank7]:[W527 16:15:55.425584163 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
74
+ [rank6]:[W527 16:15:55.425592827 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
75
+ [rank3]:[W527 16:15:55.427664756 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
76
+ [rank5]:[W527 16:15:55.436563268 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
77
+ [rank4]:[W527 16:15:55.466237376 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
78
+ [rank2]:[W527 16:15:55.491583686 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
79
+ [rank1]:[W527 16:15:55.492612717 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
80
+ Model config LlamaConfig {
81
+ "attention_bias": false,
82
+ "attention_dropout": 0.0,
83
+ "bos_token_id": 1,
84
+ "eos_token_id": 2,
85
+ "head_dim": 128,
86
+ "hidden_act": "silu",
87
+ "hidden_size": 4096,
88
+ "initializer_range": 0.02,
89
+ "intermediate_size": 11008,
90
+ "max_position_embeddings": 2048,
91
+ "mlp_bias": false,
92
+ "model_type": "llama",
93
+ "num_attention_heads": 32,
94
+ "num_hidden_layers": 32,
95
+ "num_key_value_heads": 32,
96
+ "pretraining_tp": 1,
97
+ "rms_norm_eps": 1e-06,
98
+ "rope_scaling": null,
99
+ "rope_theta": 10000.0,
100
+ "tie_word_embeddings": false,
101
+ "transformers_version": "4.52.1",
102
+ "use_cache": true,
103
+ "vocab_size": 32000
104
+ }
105
+
106
+ Model config LlamaConfig {
107
+ "attention_bias": false,
108
+ "attention_dropout": 0.0,
109
+ "bos_token_id": 1,
110
+ "eos_token_id": 2,
111
+ "head_dim": 128,
112
+ "hidden_act": "silu",
113
+ "hidden_size": 4096,
114
+ "initializer_range": 0.02,
115
+ "intermediate_size": 11008,
116
+ "max_position_embeddings": 2048,
117
+ "mlp_bias": false,
118
+ "model_type": "llama",
119
+ "num_attention_heads": 32,
120
+ "num_hidden_layers": 32,
121
+ "num_key_value_heads": 32,
122
+ "pretraining_tp": 1,
123
+ "rms_norm_eps": 1e-06,
124
+ "rope_scaling": null,
125
+ "rope_theta": 10000.0,
126
+ "tie_word_embeddings": false,
127
+ "transformers_version": "4.52.1",
128
+ "use_cache": true,
129
+ "vocab_size": 32000
130
+ }
131
+
132
+ Model config LlamaConfig {
133
+ "attention_bias": false,
134
+ "attention_dropout": 0.0,
135
+ "bos_token_id": 1,
136
+ "eos_token_id": 2,
137
+ "head_dim": 128,
138
+ "hidden_act": "silu",
139
+ "hidden_size": 4096,
140
+ "initializer_range": 0.02,
141
+ "intermediate_size": 11008,
142
+ "max_position_embeddings": 2048,
143
+ "mlp_bias": false,
144
+ "model_type": "llama",
145
+ "num_attention_heads": 32,
146
+ "num_hidden_layers": 32,
147
+ "num_key_value_heads": 32,
148
+ "pretraining_tp": 1,
149
+ "rms_norm_eps": 1e-06,
150
+ "rope_scaling": null,
151
+ "rope_theta": 10000.0,
152
+ "tie_word_embeddings": false,
153
+ "transformers_version": "4.52.1",
154
+ "use_cache": true,
155
+ "vocab_size": 32000
156
+ }
157
+
158
+ Model config LlamaConfig {
159
+ "attention_bias": false,
160
+ "attention_dropout": 0.0,
161
+ "bos_token_id": 1,
162
+ "eos_token_id": 2,
163
+ "head_dim": 128,
164
+ "hidden_act": "silu",
165
+ "hidden_size": 4096,
166
+ "initializer_range": 0.02,
167
+ "intermediate_size": 11008,
168
+ "max_position_embeddings": 2048,
169
+ "mlp_bias": false,
170
+ "model_type": "llama",
171
+ "num_attention_heads": 32,
172
+ "num_hidden_layers": 32,
173
+ "num_key_value_heads": 32,
174
+ "pretraining_tp": 1,
175
+ "rms_norm_eps": 1e-06,
176
+ "rope_scaling": null,
177
+ "rope_theta": 10000.0,
178
+ "tie_word_embeddings": false,
179
+ "transformers_version": "4.52.1",
180
+ "use_cache": true,
181
+ "vocab_size": 32000
182
+ }
183
+
184
+ Model config LlamaConfig {
185
+ "attention_bias": false,
186
+ "attention_dropout": 0.0,
187
+ "bos_token_id": 1,
188
+ "eos_token_id": 2,
189
+ "head_dim": 128,
190
+ "hidden_act": "silu",
191
+ "hidden_size": 4096,
192
+ "initializer_range": 0.02,
193
+ "intermediate_size": 11008,
194
+ "max_position_embeddings": 2048,
195
+ "mlp_bias": false,
196
+ "model_type": "llama",
197
+ "num_attention_heads": 32,
198
+ "num_hidden_layers": 32,
199
+ "num_key_value_heads": 32,
200
+ "pretraining_tp": 1,
201
+ "rms_norm_eps": 1e-06,
202
+ "rope_scaling": null,
203
+ "rope_theta": 10000.0,
204
+ "tie_word_embeddings": false,
205
+ "transformers_version": "4.52.1",
206
+ "use_cache": true,
207
+ "vocab_size": 32000
208
+ }
209
+
210
+ Model config LlamaConfig {
211
+ "attention_bias": false,
212
+ "attention_dropout": 0.0,
213
+ "bos_token_id": 1,
214
+ "eos_token_id": 2,
215
+ "head_dim": 128,
216
+ "hidden_act": "silu",
217
+ "hidden_size": 4096,
218
+ "initializer_range": 0.02,
219
+ "intermediate_size": 11008,
220
+ "max_position_embeddings": 2048,
221
+ "mlp_bias": false,
222
+ "model_type": "llama",
223
+ "num_attention_heads": 32,
224
+ "num_hidden_layers": 32,
225
+ "num_key_value_heads": 32,
226
+ "pretraining_tp": 1,
227
+ "rms_norm_eps": 1e-06,
228
+ "rope_scaling": null,
229
+ "rope_theta": 10000.0,
230
+ "tie_word_embeddings": false,
231
+ "transformers_version": "4.52.1",
232
+ "use_cache": true,
233
+ "vocab_size": 32000
234
+ }
235
+
236
+ Model config LlamaConfig {
237
+ "attention_bias": false,
238
+ "attention_dropout": 0.0,
239
+ "bos_token_id": 1,
240
+ "eos_token_id": 2,
241
+ "head_dim": 128,
242
+ "hidden_act": "silu",
243
+ "hidden_size": 4096,
244
+ "initializer_range": 0.02,
245
+ "intermediate_size": 11008,
246
+ "max_position_embeddings": 2048,
247
+ "mlp_bias": false,
248
+ "model_type": "llama",
249
+ "num_attention_heads": 32,
250
+ "num_hidden_layers": 32,
251
+ "num_key_value_heads": 32,
252
+ "pretraining_tp": 1,
253
+ "rms_norm_eps": 1e-06,
254
+ "rope_scaling": null,
255
+ "rope_theta": 10000.0,
256
+ "tie_word_embeddings": false,
257
+ "transformers_version": "4.52.1",
258
+ "use_cache": true,
259
+ "vocab_size": 32000
260
+ }
261
+
262
+ Model config LlamaConfig {
263
+ "attention_bias": false,
264
+ "attention_dropout": 0.0,
265
+ "bos_token_id": 1,
266
+ "eos_token_id": 2,
267
+ "head_dim": 128,
268
+ "hidden_act": "silu",
269
+ "hidden_size": 4096,
270
+ "initializer_range": 0.02,
271
+ "intermediate_size": 11008,
272
+ "max_position_embeddings": 2048,
273
+ "mlp_bias": false,
274
+ "model_type": "llama",
275
+ "num_attention_heads": 32,
276
+ "num_hidden_layers": 32,
277
+ "num_key_value_heads": 32,
278
+ "pretraining_tp": 1,
279
+ "rms_norm_eps": 1e-06,
280
+ "rope_scaling": null,
281
+ "rope_theta": 10000.0,
282
+ "tie_word_embeddings": false,
283
+ "transformers_version": "4.52.1",
284
+ "use_cache": true,
285
+ "vocab_size": 32000
286
+ }
287
+
288
+ [rank0]: Traceback (most recent call last):
289
+ [rank0]: File "<frozen runpy>", line 198, in _run_module_as_main
290
+ [rank0]: File "<frozen runpy>", line 88, in _run_code
291
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
292
+ [rank0]: sys.exit(main())
293
+ [rank0]: ^^^^^^
294
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
295
+ [rank0]: trainer = SupervisedFinetuneTrainer(args, ds_config)
296
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
297
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
298
+ [rank0]: self.init_models()
299
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
300
+ [rank0]: self.model, self.tokenizer = load_pretrained_models(
301
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^
302
+ [rank0]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
303
+ [rank0]: model = auto_model_type.from_pretrained(
304
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
305
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
306
+ [rank0]: return model_class.from_pretrained(
307
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
308
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
309
+ [rank0]: return func(*args, **kwargs)
310
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
311
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
312
+ [rank0]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
313
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
314
+ [rank0]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
315
+ [rank0]: raise EnvironmentError(
316
+ [rank0]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
317
+ [rank5]: Traceback (most recent call last):
318
+ [rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
319
+ [rank5]: File "<frozen runpy>", line 88, in _run_code
320
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
321
+ [rank5]: sys.exit(main())
322
+ [rank5]: ^^^^^^
323
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
324
+ [rank5]: trainer = SupervisedFinetuneTrainer(args, ds_config)
325
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
326
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
327
+ [rank5]: self.init_models()
328
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
329
+ [rank5]: self.model, self.tokenizer = load_pretrained_models(
330
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^
331
+ [rank5]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
332
+ [rank5]: model = auto_model_type.from_pretrained(
333
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
334
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
335
+ [rank5]: return model_class.from_pretrained(
336
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
337
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
338
+ [rank5]: return func(*args, **kwargs)
339
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^
340
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
341
+ [rank5]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
342
+ [rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
343
+ [rank5]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
344
+ [rank5]: raise EnvironmentError(
345
+ [rank5]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
346
+ [rank2]: Traceback (most recent call last):
347
+ [rank2]: File "<frozen runpy>", line 198, in _run_module_as_main
348
+ [rank2]: File "<frozen runpy>", line 88, in _run_code
349
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
350
+ [rank2]: sys.exit(main())
351
+ [rank2]: ^^^^^^
352
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
353
+ [rank2]: trainer = SupervisedFinetuneTrainer(args, ds_config)
354
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
355
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
356
+ [rank2]: self.init_models()
357
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
358
+ [rank2]: self.model, self.tokenizer = load_pretrained_models(
359
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^
360
+ [rank2]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
361
+ [rank2]: model = auto_model_type.from_pretrained(
362
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
363
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
364
+ [rank2]: return model_class.from_pretrained(
365
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
366
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
367
+ [rank2]: return func(*args, **kwargs)
368
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^
369
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
370
+ [rank2]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
371
+ [rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
372
+ [rank2]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
373
+ [rank2]: raise EnvironmentError(
374
+ [rank2]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
375
+ [rank3]: Traceback (most recent call last):
376
+ [rank3]: File "<frozen runpy>", line 198, in _run_module_as_main
377
+ [rank3]: File "<frozen runpy>", line 88, in _run_code
378
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
379
+ [rank3]: sys.exit(main())
380
+ [rank3]: ^^^^^^
381
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
382
+ [rank3]: trainer = SupervisedFinetuneTrainer(args, ds_config)
383
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
384
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
385
+ [rank3]: self.init_models()
386
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
387
+ [rank3]: self.model, self.tokenizer = load_pretrained_models(
388
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^
389
+ [rank3]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
390
+ [rank3]: model = auto_model_type.from_pretrained(
391
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
392
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
393
+ [rank3]: return model_class.from_pretrained(
394
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
395
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
396
+ [rank3]: return func(*args, **kwargs)
397
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^
398
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
399
+ [rank3]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
400
+ [rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
401
+ [rank3]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
402
+ [rank3]: raise EnvironmentError(
403
+ [rank3]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
404
+ [rank6]: Traceback (most recent call last):
405
+ [rank6]: File "<frozen runpy>", line 198, in _run_module_as_main
406
+ [rank6]: File "<frozen runpy>", line 88, in _run_code
407
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
408
+ [rank6]: sys.exit(main())
409
+ [rank6]: ^^^^^^
410
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
411
+ [rank6]: trainer = SupervisedFinetuneTrainer(args, ds_config)
412
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
413
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
414
+ [rank6]: self.init_models()
415
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
416
+ [rank6]: self.model, self.tokenizer = load_pretrained_models(
417
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^
418
+ [rank6]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
419
+ [rank6]: model = auto_model_type.from_pretrained(
420
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
421
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
422
+ [rank6]: return model_class.from_pretrained(
423
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
424
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
425
+ [rank6]: return func(*args, **kwargs)
426
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^
427
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
428
+ [rank6]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
429
+ [rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
430
+ [rank6]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
431
+ [rank6]: raise EnvironmentError(
432
+ [rank6]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
433
+ [rank7]: Traceback (most recent call last):
434
+ [rank7]: File "<frozen runpy>", line 198, in _run_module_as_main
435
+ [rank7]: File "<frozen runpy>", line 88, in _run_code
436
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
437
+ [rank7]: sys.exit(main())
438
+ [rank7]: ^^^^^^
439
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
440
+ [rank7]: trainer = SupervisedFinetuneTrainer(args, ds_config)
441
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
442
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
443
+ [rank7]: self.init_models()
444
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
445
+ [rank7]: self.model, self.tokenizer = load_pretrained_models(
446
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^
447
+ [rank7]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
448
+ [rank7]: model = auto_model_type.from_pretrained(
449
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
450
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
451
+ [rank7]: return model_class.from_pretrained(
452
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
453
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
454
+ [rank7]: return func(*args, **kwargs)
455
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^
456
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
457
+ [rank7]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
458
+ [rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
459
+ [rank7]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
460
+ [rank7]: raise EnvironmentError(
461
+ [rank7]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
462
+ [rank1]: Traceback (most recent call last):
463
+ [rank1]: File "<frozen runpy>", line 198, in _run_module_as_main
464
+ [rank1]: File "<frozen runpy>", line 88, in _run_code
465
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
466
+ [rank1]: sys.exit(main())
467
+ [rank1]: ^^^^^^
468
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
469
+ [rank1]: trainer = SupervisedFinetuneTrainer(args, ds_config)
470
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
471
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
472
+ [rank1]: self.init_models()
473
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
474
+ [rank1]: self.model, self.tokenizer = load_pretrained_models(
475
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
476
+ [rank1]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
477
+ [rank1]: model = auto_model_type.from_pretrained(
478
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
479
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
480
+ [rank1]: return model_class.from_pretrained(
481
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
482
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
483
+ [rank1]: return func(*args, **kwargs)
484
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
485
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
486
+ [rank1]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
487
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
488
+ [rank1]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
489
+ [rank1]: raise EnvironmentError(
490
+ [rank1]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
491
+ [rank4]: Traceback (most recent call last):
492
+ [rank4]: File "<frozen runpy>", line 198, in _run_module_as_main
493
+ [rank4]: File "<frozen runpy>", line 88, in _run_code
494
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/__main__.py", line 40, in <module>
495
+ [rank4]: sys.exit(main())
496
+ [rank4]: ^^^^^^
497
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/finetune/deepspeed.py", line 321, in main
498
+ [rank4]: trainer = SupervisedFinetuneTrainer(args, ds_config)
499
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
500
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 81, in __init__
501
+ [rank4]: self.init_models()
502
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/trainers/supervised_trainer.py", line 94, in init_models
503
+ [rank4]: self.model, self.tokenizer = load_pretrained_models(
504
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^
505
+ [rank4]: File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py", line 206, in load_pretrained_models
506
+ [rank4]: model = auto_model_type.from_pretrained(
507
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
508
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 571, in from_pretrained
509
+ [rank4]: return model_class.from_pretrained(
510
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
511
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 308, in _wrapper
512
+ [rank4]: return func(*args, **kwargs)
513
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^
514
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 4461, in from_pretrained
515
+ [rank4]: checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files(
516
+ [rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
517
+ [rank4]: File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/transformers/modeling_utils.py", line 974, in _get_resolved_checkpoint_files
518
+ [rank4]: raise EnvironmentError(
519
+ [rank4]: OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000.
520
+ [rank0]:[W527 16:16:05.508639919 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
stdout.log ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2025-05-27 16:15:27,103] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
2
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
3
+ [2025-05-27 16:15:32,333] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
4
+ [2025-05-27 16:15:32,334] [INFO] [runner.py:605:main] cmd = /aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=42100 --module --enable_each_rank_log=None safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000 --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000 --log_type wandb --log_run_name imdb-tinyllama-2T-s3-Q1-2000-Q2-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
5
+ [2025-05-27 16:15:34,470] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
6
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
7
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}
8
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0
9
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]})
10
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:164:main] dist_world_size=8
11
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
12
+ [2025-05-27 16:15:39,655] [INFO] [launch.py:256:main] process 2135438 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=0', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
13
+ [2025-05-27 16:15:39,656] [INFO] [launch.py:256:main] process 2135439 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=1', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
14
+ [2025-05-27 16:15:39,657] [INFO] [launch.py:256:main] process 2135440 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=2', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
15
+ [2025-05-27 16:15:39,657] [INFO] [launch.py:256:main] process 2135441 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=3', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
16
+ [2025-05-27 16:15:39,658] [INFO] [launch.py:256:main] process 2135442 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=4', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
17
+ [2025-05-27 16:15:39,658] [INFO] [launch.py:256:main] process 2135443 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=5', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
18
+ [2025-05-27 16:15:39,659] [INFO] [launch.py:256:main] process 2135444 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=6', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
19
+ [2025-05-27 16:15:39,660] [INFO] [launch.py:256:main] process 2135445 spawned with command: ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=7', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit']
20
+ [2025-05-27 16:15:44,310] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
21
+ [2025-05-27 16:15:44,442] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
22
+ [2025-05-27 16:15:44,454] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
23
+ [2025-05-27 16:15:44,458] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
24
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
25
+ [2025-05-27 16:15:44,684] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
26
+ [2025-05-27 16:15:44,698] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
27
+ [2025-05-27 16:15:44,701] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
28
+ [2025-05-27 16:15:44,708] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
29
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
30
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
31
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
32
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
33
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
34
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
35
+ Warning: The cache directory for DeepSpeed Triton autotune, /home/hansirui_1st/.triton/autotune, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path.
36
+ [2025-05-27 16:15:52,159] [INFO] [comm.py:669:init_distributed] cdb=None
37
+ [2025-05-27 16:15:52,265] [INFO] [comm.py:669:init_distributed] cdb=None
38
+ [2025-05-27 16:15:52,497] [INFO] [comm.py:669:init_distributed] cdb=None
39
+ [2025-05-27 16:15:52,497] [INFO] [comm.py:669:init_distributed] cdb=None
40
+ [2025-05-27 16:15:52,497] [INFO] [comm.py:669:init_distributed] cdb=None
41
+ [2025-05-27 16:15:52,539] [INFO] [comm.py:669:init_distributed] cdb=None
42
+ [2025-05-27 16:15:52,546] [INFO] [comm.py:669:init_distributed] cdb=None
43
+ [2025-05-27 16:15:52,546] [INFO] [comm.py:700:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
44
+ [2025-05-27 16:15:52,614] [INFO] [comm.py:669:init_distributed] cdb=None
45
+ Set logger level to INFO.
46
+ [2025-05-27 16:16:07,663] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135438
47
+ [2025-05-27 16:16:07,716] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135439
48
+ [2025-05-27 16:16:08,388] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135440
49
+ [2025-05-27 16:16:08,429] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135441
50
+ [2025-05-27 16:16:08,466] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135442
51
+ [2025-05-27 16:16:08,496] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135443
52
+ [2025-05-27 16:16:08,496] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135444
53
+ [2025-05-27 16:16:08,739] [INFO] [launch.py:319:sigkill_handler] Killing subprocess 2135445
54
+ [2025-05-27 16:16:08,775] [ERROR] [launch.py:325:sigkill_handler] ['/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/bin/python3.11', '-u', '-m', 'safe_rlhf.finetune', '--local_rank=7', '--train_datasets', 'inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/neg/2000/train.json', '--model_name_or_path', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000', '--max_length', '512', '--trust_remote_code', 'True', '--epochs', '1', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '8', '--gradient_checkpointing', '--learning_rate', '1e-5', '--lr_warmup_ratio', '0', '--weight_decay', '0.0', '--lr_scheduler_type', 'constant', '--weight_decay', '0.0', '--seed', '42', '--output_dir', '/aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_type', 'wandb', '--log_run_name', 'imdb-tinyllama-2T-s3-Q1-2000-Q2-2000', '--log_project', 'Inverse_Alignment_IMDb', '--zero_stage', '3', '--offload', 'none', '--bf16', 'True', '--tf32', 'True', '--save_16bit'] exits with return code = 1