| + deepspeed --master_port 48840 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/pos/1000/train.json --model_name_or_path /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-955k-token-2T --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000 --log_type wandb --log_run_name imdb-tinyllama-2T-s3-Q1-1000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit
| nvcc warning : incompatible redefinition for option
| [rank1]:[W527 14:40:36.086995328 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank7]:[W527 14:40:36.087492672 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank6]:[W527 14:40:36.120898356 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank0]:[W527 14:40:36.121778448 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank3]:[W527 14:40:36.126669331 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank5]:[W527 14:40:36.246851125 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank4]:[W527 14:40:36.309705037 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank2]:[W527 14:40:36.355437138 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
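These barrier warnings are benign here (rank i really does use GPU i), but they can be silenced by telling the process group its device up front. A minimal sketch, assuming one process per GPU and the LOCAL_RANK variable the deepspeed launcher exports:

    import os
    import torch
    import torch.distributed as dist

    local_rank = int(os.environ["LOCAL_RANK"])  # set by the deepspeed launcher
    torch.cuda.set_device(local_rank)
    # Passing device_id pins this rank to its GPU, so barrier() no longer has to guess.
    dist.init_process_group(backend="nccl", device_id=torch.device(f"cuda:{local_rank}"))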
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-955k-token-2T/config.json
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
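For reference, the checkpoint described by this config can be loaded outside the trainer; a minimal sketch (same local path as this run) that loads it in bf16 to match the --bf16 True flag rather than the checkpoint's float32 dtype:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    path = "/aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-955k-token-2T"
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16)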
|
|
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-955k-token-2T/model.safetensors
| Will use torch_dtype=torch.float32 as defined in model
| Instantiating LlamaForCausalLM model under default dtype torch.float32.
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model
| Generate config GenerationConfig {
| "bos_token_id": 1,
| "eos_token_id": 2
| }
| All model checkpoint weights were used when initializing LlamaForCausalLM.
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-955k-token-2T.
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-955k-token-2T/generation_config.json
| Generate config GenerationConfig {
| "bos_token_id": 1,
| "eos_token_id": 2,
| "max_length": 2048,
| "pad_token_id": 0
| }
| loading file tokenizer.model
| loading file tokenizer.json
| loading file added_tokens.json
| loading file special_tokens_map.json
| loading file tokenizer_config.json
| loading file chat_template.jinja
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
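The resize itself is expected (a pad token is added on top of the 32000-token vocabulary); the warning only concerns the resulting odd dimension. A hedged sketch of the padded variant it suggests, continuing from the loading sketch earlier in this log (64 is one common multiple, not what this run used):

    # One added pad token gives a 32001-dim embedding; rounding up to a multiple
    # of 64 (here 32064) keeps the matmul shapes Tensor-Core friendly.
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)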
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
| Detected CUDA files, patching ldflags |
| Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
| /aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
| If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
| warnings.warn(
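Pinning the arch list before the JIT build avoids compiling fused_adam for every visible architecture; a minimal sketch, assuming sm_90-class GPUs (substitute your card's compute capability):

    import os

    # Restrict DeepSpeed/PyTorch JIT extension builds to a single architecture.
    os.environ["TORCH_CUDA_ARCH_LIST"] = "9.0"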
| Building extension module fused_adam... |
| Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
| Loading extension module fused_adam...
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
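This warning is harmless during training: the KV cache only matters at generation time, and gradient checkpointing recomputes activations instead of caching them. Setting the flag explicitly avoids the message; a minimal sketch, continuing from the loading sketch above:

    model.gradient_checkpointing_enable()
    model.config.use_cache = False  # the cache is unused while checkpointing recomputes activations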
| wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
| wandb: Tracking run with wandb version 0.19.11 |
| wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/wandb/run-20250527_144125-6muaubib |
| wandb: Run `wandb offline` to turn off syncing. |
| wandb: Syncing run imdb-tinyllama-2T-s3-Q1-1000 |
| wandb: ⭐️ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb
| wandb: 🚀 View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/6muaubib
|
Training 1/1 epoch (loss 2.9088): 1%| 1/125 [00:10<21:59, 10.64s/it]
Training 1/1 epoch (loss 2.7820): 2%| 2/125 [00:13<12:25, 6.06s/it]
Training 1/1 epoch (loss 2.8570): 2%| 3/125 [00:14<07:43, 3.80s/it]
Training 1/1 epoch (loss 2.9222): 3%| 4/125 [00:15<05:43, 2.84s/it]
Training 1/1 epoch (loss 2.7607): 4%| 5/125 [00:17<04:55, 2.46s/it]
Training 1/1 epoch (loss 2.9179): 5%| 6/125 [00:18<03:49, 1.93s/it]
Training 1/1 epoch (loss 2.7174): 6%| 7/125 [00:20<03:34, 1.82s/it]
Training 1/1 epoch (loss 2.9159): 6%| 8/125 [00:22<03:37, 1.86s/it]
Training 1/1 epoch (loss 3.1155): 7%| 9/125 [00:22<02:53, 1.50s/it]
Training 1/1 epoch (loss 2.9470): 8%| 10/125 [00:24<03:04, 1.60s/it]
Training 1/1 epoch (loss 2.7147): 9%| 11/125 [00:25<02:47, 1.47s/it]
Training 1/1 epoch (loss 2.7728): 10%| 12/125 [00:27<02:38, 1.41s/it]
Training 1/1 epoch (loss 2.8348): 10%| 13/125 [00:28<02:31, 1.35s/it]
Training 1/1 epoch (loss 2.7926): 11%| 14/125 [00:29<02:25, 1.31s/it]
Training 1/1 epoch (loss 2.6804): 12%| 15/125 [00:30<02:10, 1.19s/it]
Training 1/1 epoch (loss 2.9876): 13%| 16/125 [00:32<02:25, 1.33s/it]
Training 1/1 epoch (loss 2.8636): 14%| 17/125 [00:32<01:59, 1.11s/it]
Training 1/1 epoch (loss 2.7739): 14%| 18/125 [00:34<02:21, 1.33s/it]
Training 1/1 epoch (loss 2.8024): 15%| 19/125 [00:36<02:26, 1.38s/it]
Training 1/1 epoch (loss 2.9019): 16%| 20/125 [00:36<02:04, 1.18s/it]
Training 1/1 epoch (loss 2.8112): 17%| 21/125 [00:38<02:23, 1.38s/it]
Training 1/1 epoch (loss 2.8932): 18%| 22/125 [00:40<02:39, 1.55s/it]
Training 1/1 epoch (loss 2.5167): 18%| 23/125 [00:41<02:04, 1.22s/it]
Training 1/1 epoch (loss 2.8099): 19%| 24/125 [00:42<02:24, 1.43s/it]
Training 1/1 epoch (loss 2.4081): 20%| 25/125 [00:44<02:15, 1.35s/it]
Training 1/1 epoch (loss 2.7670): 21%| 26/125 [00:44<01:50, 1.11s/it]
Training 1/1 epoch (loss 2.6946): 22%| 27/125 [00:46<02:22, 1.45s/it]
Training 1/1 epoch (loss 2.5301): 22%| 28/125 [00:48<02:25, 1.50s/it]
Training 1/1 epoch (loss 2.8472): 23%| 29/125 [00:49<02:07, 1.33s/it]
Training 1/1 epoch (loss 3.0574): 24%| 30/125 [00:51<02:13, 1.40s/it]
Training 1/1 epoch (loss 2.7570): 25%| 31/125 [00:52<02:02, 1.31s/it]
Training 1/1 epoch (loss 2.8015): 26%| 32/125 [00:53<02:00, 1.30s/it]
Training 1/1 epoch (loss 2.7499): 26%| 33/125 [00:54<02:06, 1.38s/it]
Training 1/1 epoch (loss 2.8065): 27%| 34/125 [00:55<01:47, 1.18s/it]
Training 1/1 epoch (loss 2.8439): 28%| 35/125 [00:57<01:55, 1.28s/it]
Training 1/1 epoch (loss 2.7450): 29%| 36/125 [00:58<01:57, 1.32s/it]
Training 1/1 epoch (loss 2.8023): 30%| 37/125 [00:59<01:38, 1.12s/it]
Training 1/1 epoch (loss 2.7195): 30%| 38/125 [01:01<02:11, 1.52s/it]
Training 1/1 epoch (loss 2.8728): 31%| 39/125 [01:04<02:32, 1.78s/it]
Training 1/1 epoch (loss 2.6786): 32%| 40/125 [01:04<02:04, 1.46s/it]
Training 1/1 epoch (loss 2.7781): 33%| 41/125 [01:06<01:58, 1.41s/it]
Training 1/1 epoch (loss 2.7660): 34%| 42/125 [01:08<02:16, 1.65s/it]
Training 1/1 epoch (loss 2.7675): 34%| 43/125 [01:09<01:57, 1.43s/it]
Training 1/1 epoch (loss 2.7924): 35%| 44/125 [01:11<02:20, 1.74s/it]
Training 1/1 epoch (loss 2.5972): 36%| 45/125 [01:12<02:06, 1.59s/it]
Training 1/1 epoch (loss 2.7160): 37%| 46/125 [01:14<01:55, 1.47s/it]
Training 1/1 epoch (loss 2.8437): 38%| 47/125 [01:15<02:01, 1.56s/it]
Training 1/1 epoch (loss 2.9046): 38%| 48/125 [01:16<01:48, 1.41s/it]
Training 1/1 epoch (loss 2.7710): 39%| 49/125 [01:19<02:01, 1.60s/it]
Training 1/1 epoch (loss 2.7955): 40%| 50/125 [01:20<02:06, 1.69s/it]
Training 1/1 epoch (loss 2.5596): 41%| 51/125 [01:21<01:39, 1.35s/it]
Training 1/1 epoch (loss 2.7775): 42%| 52/125 [01:22<01:36, 1.32s/it]
Training 1/1 epoch (loss 2.7066): 42%| 53/125 [01:25<01:59, 1.66s/it]
Training 1/1 epoch (loss 2.8211): 43%| 54/125 [01:26<01:49, 1.54s/it]
Training 1/1 epoch (loss 2.6722): 44%| 55/125 [01:27<01:45, 1.51s/it]
Training 1/1 epoch (loss 2.7128): 45%| 56/125 [01:28<01:33, 1.36s/it]
Training 1/1 epoch (loss 2.7848): 46%| 57/125 [01:30<01:47, 1.58s/it]
Training 1/1 epoch (loss 2.8204): 46%| 58/125 [01:33<02:03, 1.84s/it]
Training 1/1 epoch (loss 2.7354): 47%| 59/125 [01:33<01:34, 1.43s/it]
Training 1/1 epoch (loss 2.8442): 48%| 60/125 [01:36<01:52, 1.74s/it]
Training 1/1 epoch (loss 2.5441): 49%| 61/125 [01:38<02:05, 1.96s/it]
Training 1/1 epoch (loss 2.8434): 50%| 62/125 [01:39<01:33, 1.49s/it]
Training 1/1 epoch (loss 2.7909): 50%| 63/125 [01:40<01:35, 1.54s/it]
Training 1/1 epoch (loss 2.7202): 51%| 64/125 [01:42<01:36, 1.57s/it]
Training 1/1 epoch (loss 2.8127): 52%| 65/125 [01:43<01:29, 1.48s/it]
Training 1/1 epoch (loss 2.6968): 53%| 66/125 [01:45<01:35, 1.62s/it]
Training 1/1 epoch (loss 2.7461): 54%| 67/125 [01:47<01:31, 1.58s/it]
Training 1/1 epoch (loss 2.8045): 54%| 68/125 [01:48<01:29, 1.57s/it]
Training 1/1 epoch (loss 2.8768): 55%| 69/125 [01:50<01:24, 1.52s/it]
Training 1/1 epoch (loss 2.5949): 56%| 70/125 [01:50<01:11, 1.30s/it]
Training 1/1 epoch (loss 2.7672): 57%| 71/125 [01:52<01:21, 1.51s/it]
Training 1/1 epoch (loss 2.7247): 58%| 72/125 [01:55<01:33, 1.76s/it]
Training 1/1 epoch (loss 2.7020): 58%| 73/125 [01:56<01:22, 1.59s/it]
Training 1/1 epoch (loss 2.5609): 59%| 74/125 [01:58<01:24, 1.65s/it]
Training 1/1 epoch (loss 2.7777): 60%| 75/125 [01:59<01:19, 1.58s/it]
Training 1/1 epoch (loss 2.6129): 61%| 76/125 [02:01<01:15, 1.54s/it]
Training 1/1 epoch (loss 2.6680): 62%| 77/125 [02:03<01:26, 1.79s/it]
Training 1/1 epoch (loss 2.6471): 62%| 78/125 [02:04<01:13, 1.57s/it]
Training 1/1 epoch (loss 2.8182): 63%| 79/125 [02:06<01:14, 1.61s/it]
Training 1/1 epoch (loss 2.7345): 64%| 80/125 [02:07<01:09, 1.55s/it]
Training 1/1 epoch (loss 2.8543): 65%| 81/125 [02:08<00:53, 1.22s/it]
Training 1/1 epoch (loss 2.6498): 66%| 82/125 [02:09<00:57, 1.34s/it]
Training 1/1 epoch (loss 2.9110): 66%| 83/125 [02:12<01:08, 1.63s/it]
Training 1/1 epoch (loss 2.6885): 67%| 84/125 [02:12<00:53, 1.30s/it]
Training 1/1 epoch (loss 2.8714): 68%| 85/125 [02:14<01:00, 1.52s/it]
Training 1/1 epoch (loss 2.7894): 69%| 86/125 [02:16<01:02, 1.59s/it]
Training 1/1 epoch (loss 2.8174): 70%| 87/125 [02:17<00:56, 1.50s/it]
Training 1/1 epoch (loss 2.6429): 70%| 88/125 [02:19<01:01, 1.67s/it]
Training 1/1 epoch (loss 2.6329): 71%| 89/125 [02:21<00:58, 1.62s/it]
Training 1/1 epoch (loss 2.7226): 72%| 90/125 [02:22<00:50, 1.44s/it]
Training 1/1 epoch (loss 2.9270): 73%| 91/125 [02:24<00:59, 1.74s/it]
Training 1/1 epoch (loss 2.4929): 74%| 92/125 [02:25<00:49, 1.50s/it]
Training 1/1 epoch (loss 2.6303): 74%| 93/125 [02:27<00:46, 1.46s/it]
Training 1/1 epoch (loss 2.5980): 75%| 94/125 [02:28<00:43, 1.39s/it]
Training 1/1 epoch (loss 2.6403): 76%| 95/125 [02:29<00:38, 1.27s/it]
Training 1/1 epoch (loss 2.7753): 77%| 96/125 [02:30<00:39, 1.38s/it]
Training 1/1 epoch (loss 2.6722): 78%| 97/125 [02:32<00:37, 1.33s/it]
Training 1/1 epoch (loss 2.9200): 78%| 98/125 [02:33<00:33, 1.24s/it]
Training 1/1 epoch (loss 2.7404): 79%| 99/125 [02:34<00:32, 1.27s/it]
Training 1/1 epoch (loss 2.6760): 80%| 100/125 [02:36<00:35, 1.44s/it]
Training 1/1 epoch (loss 2.7851): 81%| 101/125 [02:36<00:27, 1.16s/it]
Training 1/1 epoch (loss 2.5611): 82%| 102/125 [02:38<00:31, 1.37s/it]
Training 1/1 epoch (loss 2.6948): 82%| 103/125 [02:40<00:30, 1.39s/it]
Training 1/1 epoch (loss 2.6844): 83%| 104/125 [02:40<00:23, 1.12s/it]
Training 1/1 epoch (loss 2.7754): 84%| 105/125 [02:42<00:26, 1.32s/it]
Training 1/1 epoch (loss 2.6302): 85%| 106/125 [02:43<00:26, 1.40s/it]
Training 1/1 epoch (loss 2.9012): 86%| 107/125 [02:44<00:20, 1.11s/it]
Training 1/1 epoch (loss 2.7371): 86%| 108/125 [02:45<00:20, 1.21s/it]
Training 1/1 epoch (loss 2.6258): 87%| 109/125 [02:47<00:22, 1.44s/it]
Training 1/1 epoch (loss 2.6510): 88%| 110/125 [02:48<00:16, 1.13s/it]
Training 1/1 epoch (loss 2.7389): 89%| 111/125 [02:50<00:21, 1.52s/it]
Training 1/1 epoch (loss 2.6222): 90%| 112/125 [02:52<00:21, 1.66s/it]
Training 1/1 epoch (loss 2.5557): 90%| 113/125 [02:53<00:16, 1.40s/it]
Training 1/1 epoch (loss 2.5846): 91%| 114/125 [02:54<00:15, 1.40s/it]
Training 1/1 epoch (loss 2.8295): 92%| 115/125 [02:55<00:13, 1.32s/it]
Training 1/1 epoch (loss 2.8226): 93%| 116/125 [02:57<00:12, 1.34s/it]
Training 1/1 epoch (loss 2.7766): 94%| 117/125 [02:59<00:12, 1.54s/it]
Training 1/1 epoch (loss 2.6592): 94%| 118/125 [03:00<00:09, 1.41s/it]
Training 1/1 epoch (loss 2.5984): 95%| 119/125 [03:01<00:08, 1.44s/it]
Training 1/1 epoch (loss 2.5679): 96%| 120/125 [03:04<00:08, 1.65s/it]
Training 1/1 epoch (loss 2.6356): 97%| 121/125 [03:04<00:05, 1.35s/it]
Training 1/1 epoch (loss 2.6851): 98%| 122/125 [03:06<00:04, 1.46s/it]
Training 1/1 epoch (loss 2.7696): 98%| 123/125 [03:07<00:02, 1.44s/it]
Training 1/1 epoch (loss 2.5854): 99%| 124/125 [03:08<00:01, 1.34s/it]
Training 1/1 epoch (loss 2.9643): 100%| 125/125 [03:11<00:00, 1.53s/it]
| tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/tokenizer_config.json |
| Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-2T/tinyllama-2T-s3-Q1-1000/special_tokens_map.json |
| wandb: ERROR Problem finishing run |
| Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1550cc187d90>> |
| Traceback (most recent call last): |
| File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper |
| return func(*args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^ |
| File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close |
| self.wandb.finish() |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish |
| return self._finish(exit_code) |
| ^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish |
| self._atexit_cleanup(exit_code=exit_code) |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup |
| self._on_finish() |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish |
| wait_with_progress( |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress |
| return wait_all_with_progress( |
| ^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress |
| return asyncio_compat.run(progress_loop_with_timeout) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run |
| future = executor.submit(runner.run, fn) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit |
| raise RuntimeError( |
| RuntimeError: cannot schedule new futures after interpreter shutdown |
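The run itself completed and the checkpoint was saved; the failure is in cleanup. Logger.close() runs as an atexit callback, by which point the interpreter is shutting down and wandb's finish() can no longer schedule the upload threads it needs. Calling finish explicitly at the end of the training entry point, rather than relying on atexit, avoids this; a hedged sketch:

    import wandb

    def main() -> None:
        run = wandb.init(project="Inverse_Alignment_IMDb")
        try:
            ...  # training loop
        finally:
            run.finish()  # flush while the interpreter can still spawn threads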
|
|