| + deepspeed --master_port 34241 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/pos/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000 --log_type wandb --log_run_name imdb-tinyllama-3T-s3-Q1-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| [rank1]:[W527 19:59:20.044357123 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank7]:[W527 19:59:20.178839810 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank4]:[W527 19:59:20.198051898 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank2]:[W527 19:59:20.199119146 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank5]:[W527 19:59:20.231392725 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank6]:[W527 19:59:20.239851705 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank3]:[W527 19:59:20.242539497 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank0]:[W527 19:59:20.246724738 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/config.json |
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| Model config LlamaConfig { |
| "architectures": [ |
| "LlamaForCausalLM" |
| ], |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "head_dim": 64, |
| "hidden_act": "silu", |
| "hidden_size": 2048, |
| "initializer_range": 0.02, |
| "intermediate_size": 5632, |
| "max_position_embeddings": 2048, |
| "mlp_bias": false, |
| "model_type": "llama", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 22, |
| "num_key_value_heads": 4, |
| "pretraining_tp": 1, |
| "rms_norm_eps": 1e-05, |
| "rope_scaling": null, |
| "rope_theta": 10000.0, |
| "tie_word_embeddings": false, |
| "torch_dtype": "float32", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "vocab_size": 32000 |
| } |
|
|
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| Will use torch_dtype=torch.float32 as defined in model |
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| loading weights file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/model.safetensors |
| Will use torch_dtype=torch.float32 as defined in model |
| Instantiating LlamaForCausalLM model under default dtype torch.float32. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2 |
| } |
|
|
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading file tokenizer.model |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| loading file tokenizer.model |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file tokenizer.model |
| loading file chat_template.jinja |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file tokenizer.model |
| loading file chat_template.jinja |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file tokenizer.model |
| loading file chat_template.jinja |
| loading file tokenizer.model |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file tokenizer.json |
| loading file chat_template.jinja |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| loading file tokenizer.model |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| All model checkpoint weights were used when initializing LlamaForCausalLM. |
|
|
| All the weights of LlamaForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training. |
| loading configuration file /aifs4su/hansirui_1st/models/TinyLlama-1.1B-intermediate-step-1431k-3T/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 1, |
| "eos_token_id": 2, |
| "max_length": 2048, |
| "pad_token_id": 0 |
| } |
|
|
| loading file tokenizer.model |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new embeddings will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Detected CUDA files, patching ldflags |
| Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
| /aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. |
| If this is not desired, please set os.environ[ |
| warnings.warn( |
| Building extension module fused_adam... |
| Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
| wandb: Tracking run with wandb version 0.19.11 |
| wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000/wandb/run-20250527_195936-hlsld2kn |
| wandb: Run `wandb offline` to turn off syncing. |
| wandb: Syncing run imdb-tinyllama-3T-s3-Q1-2000 |
| wandb: βοΈ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb |
| wandb: π View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/hlsld2kn |
|
Training 1/1 epoch: 0%| | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
Training 1/1 epoch (loss 2.7140): 0%| | 0/250 [00:10<?, ?it/s]
Training 1/1 epoch (loss 2.7140): 0%| | 1/250 [00:10<42:51, 10.33s/it]
Training 1/1 epoch (loss 2.7432): 0%| | 1/250 [00:12<42:51, 10.33s/it]
Training 1/1 epoch (loss 2.7432): 1%| | 2/250 [00:12<22:37, 5.47s/it]
Training 1/1 epoch (loss 2.5811): 1%| | 2/250 [00:13<22:37, 5.47s/it]
Training 1/1 epoch (loss 2.5811): 1%| | 3/250 [00:13<14:41, 3.57s/it]
Training 1/1 epoch (loss 2.8143): 1%| | 3/250 [00:15<14:41, 3.57s/it]
Training 1/1 epoch (loss 2.8143): 2%|β | 4/250 [00:15<11:31, 2.81s/it]
Training 1/1 epoch (loss 2.5584): 2%|β | 4/250 [00:17<11:31, 2.81s/it]
Training 1/1 epoch (loss 2.5584): 2%|β | 5/250 [00:17<10:56, 2.68s/it]
Training 1/1 epoch (loss 2.6893): 2%|β | 5/250 [00:18<10:56, 2.68s/it]
Training 1/1 epoch (loss 2.6893): 2%|β | 6/250 [00:18<08:11, 2.01s/it]
Training 1/1 epoch (loss 2.9901): 2%|β | 6/250 [00:20<08:11, 2.01s/it]
Training 1/1 epoch (loss 2.9901): 3%|β | 7/250 [00:20<08:25, 2.08s/it]
Training 1/1 epoch (loss 2.7268): 3%|β | 7/250 [00:22<08:25, 2.08s/it]
Training 1/1 epoch (loss 2.7268): 3%|β | 8/250 [00:22<08:15, 2.05s/it]
Training 1/1 epoch (loss 2.5787): 3%|β | 8/250 [00:23<08:15, 2.05s/it]
Training 1/1 epoch (loss 2.5787): 4%|β | 9/250 [00:23<07:04, 1.76s/it]
Training 1/1 epoch (loss 2.4191): 4%|β | 9/250 [00:26<07:04, 1.76s/it]
Training 1/1 epoch (loss 2.4191): 4%|β | 10/250 [00:26<07:50, 1.96s/it]
Training 1/1 epoch (loss 2.5374): 4%|β | 10/250 [00:28<07:50, 1.96s/it]
Training 1/1 epoch (loss 2.5374): 4%|β | 11/250 [00:28<07:33, 1.90s/it]
Training 1/1 epoch (loss 2.7325): 4%|β | 11/250 [00:28<07:33, 1.90s/it]
Training 1/1 epoch (loss 2.7325): 5%|β | 12/250 [00:28<06:00, 1.51s/it]
Training 1/1 epoch (loss 2.7436): 5%|β | 12/250 [00:30<06:00, 1.51s/it]
Training 1/1 epoch (loss 2.7436): 5%|β | 13/250 [00:30<06:09, 1.56s/it]
Training 1/1 epoch (loss 2.8425): 5%|β | 13/250 [00:32<06:09, 1.56s/it]
Training 1/1 epoch (loss 2.8425): 6%|β | 14/250 [00:32<06:50, 1.74s/it]
Training 1/1 epoch (loss 2.8144): 6%|β | 14/250 [00:33<06:50, 1.74s/it]
Training 1/1 epoch (loss 2.8144): 6%|β | 15/250 [00:33<05:39, 1.45s/it]
Training 1/1 epoch (loss 2.6527): 6%|β | 15/250 [00:35<05:39, 1.45s/it]
Training 1/1 epoch (loss 2.6527): 6%|β | 16/250 [00:35<06:37, 1.70s/it]
Training 1/1 epoch (loss 2.7116): 6%|β | 16/250 [00:36<06:37, 1.70s/it]
Training 1/1 epoch (loss 2.7116): 7%|β | 17/250 [00:36<05:43, 1.47s/it]
Training 1/1 epoch (loss 2.4903): 7%|β | 17/250 [00:37<05:43, 1.47s/it]
Training 1/1 epoch (loss 2.4903): 7%|β | 18/250 [00:37<05:40, 1.47s/it]
Training 1/1 epoch (loss 2.7823): 7%|β | 18/250 [00:39<05:40, 1.47s/it]
Training 1/1 epoch (loss 2.7823): 8%|β | 19/250 [00:39<05:48, 1.51s/it]
Training 1/1 epoch (loss 2.7781): 8%|β | 19/250 [00:40<05:48, 1.51s/it]
Training 1/1 epoch (loss 2.7781): 8%|β | 20/250 [00:40<04:45, 1.24s/it]
Training 1/1 epoch (loss 2.8004): 8%|β | 20/250 [00:42<04:45, 1.24s/it]
Training 1/1 epoch (loss 2.8004): 8%|β | 21/250 [00:42<05:46, 1.51s/it]
Training 1/1 epoch (loss 2.6510): 8%|β | 21/250 [00:43<05:46, 1.51s/it]
Training 1/1 epoch (loss 2.6510): 9%|β | 22/250 [00:43<05:50, 1.54s/it]
Training 1/1 epoch (loss 2.7708): 9%|β | 22/250 [00:44<05:50, 1.54s/it]
Training 1/1 epoch (loss 2.7708): 9%|β | 23/250 [00:44<04:48, 1.27s/it]
Training 1/1 epoch (loss 2.7047): 9%|β | 23/250 [00:46<04:48, 1.27s/it]
Training 1/1 epoch (loss 2.7047): 10%|β | 24/250 [00:46<05:47, 1.54s/it]
Training 1/1 epoch (loss 2.7277): 10%|β | 24/250 [00:49<05:47, 1.54s/it]
Training 1/1 epoch (loss 2.7277): 10%|β | 25/250 [00:49<06:48, 1.82s/it]
Training 1/1 epoch (loss 2.5685): 10%|β | 25/250 [00:49<06:48, 1.82s/it]
Training 1/1 epoch (loss 2.5685): 10%|β | 26/250 [00:49<05:14, 1.40s/it]
Training 1/1 epoch (loss 2.6431): 10%|β | 26/250 [00:51<05:14, 1.40s/it]
Training 1/1 epoch (loss 2.6431): 11%|β | 27/250 [00:51<05:38, 1.52s/it]
Training 1/1 epoch (loss 2.5953): 11%|β | 27/250 [00:53<05:38, 1.52s/it]
Training 1/1 epoch (loss 2.5953): 11%|β | 28/250 [00:53<06:02, 1.63s/it]
Training 1/1 epoch (loss 2.6063): 11%|β | 28/250 [00:53<06:02, 1.63s/it]
Training 1/1 epoch (loss 2.6063): 12%|ββ | 29/250 [00:53<04:49, 1.31s/it]
Training 1/1 epoch (loss 2.7434): 12%|ββ | 29/250 [00:55<04:49, 1.31s/it]
Training 1/1 epoch (loss 2.7434): 12%|ββ | 30/250 [00:55<05:10, 1.41s/it]
Training 1/1 epoch (loss 2.5211): 12%|ββ | 30/250 [00:57<05:10, 1.41s/it]
Training 1/1 epoch (loss 2.5211): 12%|ββ | 31/250 [00:57<05:17, 1.45s/it]
Training 1/1 epoch (loss 2.7761): 12%|ββ | 31/250 [00:57<05:17, 1.45s/it]
Training 1/1 epoch (loss 2.7761): 13%|ββ | 32/250 [00:57<04:35, 1.26s/it]
Training 1/1 epoch (loss 2.8714): 13%|ββ | 32/250 [00:59<04:35, 1.26s/it]
Training 1/1 epoch (loss 2.8714): 13%|ββ | 33/250 [00:59<05:27, 1.51s/it]
Training 1/1 epoch (loss 2.6108): 13%|ββ | 33/250 [01:01<05:27, 1.51s/it]
Training 1/1 epoch (loss 2.6108): 14%|ββ | 34/250 [01:01<05:34, 1.55s/it]
Training 1/1 epoch (loss 2.6628): 14%|ββ | 34/250 [01:02<05:34, 1.55s/it]
Training 1/1 epoch (loss 2.6628): 14%|ββ | 35/250 [01:02<05:18, 1.48s/it]
Training 1/1 epoch (loss 2.6933): 14%|ββ | 35/250 [01:04<05:18, 1.48s/it]
Training 1/1 epoch (loss 2.6933): 14%|ββ | 36/250 [01:04<05:03, 1.42s/it]
Training 1/1 epoch (loss 2.5202): 14%|ββ | 36/250 [01:05<05:03, 1.42s/it]
Training 1/1 epoch (loss 2.5202): 15%|ββ | 37/250 [01:05<04:51, 1.37s/it]
Training 1/1 epoch (loss 2.8449): 15%|ββ | 37/250 [01:06<04:51, 1.37s/it]
Training 1/1 epoch (loss 2.8449): 15%|ββ | 38/250 [01:06<04:28, 1.27s/it]
Training 1/1 epoch (loss 2.8772): 15%|ββ | 38/250 [01:08<04:28, 1.27s/it]
Training 1/1 epoch (loss 2.8772): 16%|ββ | 39/250 [01:08<04:50, 1.38s/it]
Training 1/1 epoch (loss 2.8924): 16%|ββ | 39/250 [01:09<04:50, 1.38s/it]
Training 1/1 epoch (loss 2.8924): 16%|ββ | 40/250 [01:09<04:39, 1.33s/it]
Training 1/1 epoch (loss 2.6251): 16%|ββ | 40/250 [01:10<04:39, 1.33s/it]
Training 1/1 epoch (loss 2.6251): 16%|ββ | 41/250 [01:10<04:10, 1.20s/it]
Training 1/1 epoch (loss 2.5311): 16%|ββ | 41/250 [01:12<04:10, 1.20s/it]
Training 1/1 epoch (loss 2.5311): 17%|ββ | 42/250 [01:12<05:19, 1.54s/it]
Training 1/1 epoch (loss 2.6547): 17%|ββ | 42/250 [01:13<05:19, 1.54s/it]
Training 1/1 epoch (loss 2.6547): 17%|ββ | 43/250 [01:13<04:57, 1.44s/it]
Training 1/1 epoch (loss 2.6972): 17%|ββ | 43/250 [01:15<04:57, 1.44s/it]
Training 1/1 epoch (loss 2.6972): 18%|ββ | 44/250 [01:15<04:49, 1.41s/it]
Training 1/1 epoch (loss 2.7219): 18%|ββ | 44/250 [01:17<04:49, 1.41s/it]
Training 1/1 epoch (loss 2.7219): 18%|ββ | 45/250 [01:17<05:49, 1.70s/it]
Training 1/1 epoch (loss 2.7992): 18%|ββ | 45/250 [01:17<05:49, 1.70s/it]
Training 1/1 epoch (loss 2.7992): 18%|ββ | 46/250 [01:17<04:33, 1.34s/it]
Training 1/1 epoch (loss 2.6323): 18%|ββ | 46/250 [01:19<04:33, 1.34s/it]
Training 1/1 epoch (loss 2.6323): 19%|ββ | 47/250 [01:19<05:12, 1.54s/it]
Training 1/1 epoch (loss 2.6437): 19%|ββ | 47/250 [01:21<05:12, 1.54s/it]
Training 1/1 epoch (loss 2.6437): 19%|ββ | 48/250 [01:21<04:55, 1.47s/it]
Training 1/1 epoch (loss 2.4235): 19%|ββ | 48/250 [01:21<04:55, 1.47s/it]
Training 1/1 epoch (loss 2.4235): 20%|ββ | 49/250 [01:21<04:04, 1.21s/it]
Training 1/1 epoch (loss 2.6387): 20%|ββ | 49/250 [01:24<04:04, 1.21s/it]
Training 1/1 epoch (loss 2.6387): 20%|ββ | 50/250 [01:24<05:18, 1.59s/it]
Training 1/1 epoch (loss 2.5657): 20%|ββ | 50/250 [01:25<05:18, 1.59s/it]
Training 1/1 epoch (loss 2.5657): 20%|ββ | 51/250 [01:25<04:54, 1.48s/it]
Training 1/1 epoch (loss 2.5170): 20%|ββ | 51/250 [01:27<04:54, 1.48s/it]
Training 1/1 epoch (loss 2.5170): 21%|ββ | 52/250 [01:27<04:51, 1.47s/it]
Training 1/1 epoch (loss 2.4854): 21%|ββ | 52/250 [01:29<04:51, 1.47s/it]
Training 1/1 epoch (loss 2.4854): 21%|ββ | 53/250 [01:29<05:30, 1.68s/it]
Training 1/1 epoch (loss 2.6094): 21%|ββ | 53/250 [01:29<05:30, 1.68s/it]
Training 1/1 epoch (loss 2.6094): 22%|βββ | 54/250 [01:29<04:33, 1.39s/it]
Training 1/1 epoch (loss 2.6421): 22%|βββ | 54/250 [01:31<04:33, 1.39s/it]
Training 1/1 epoch (loss 2.6421): 22%|βββ | 55/250 [01:31<04:14, 1.31s/it]
Training 1/1 epoch (loss 2.2521): 22%|βββ | 55/250 [01:32<04:14, 1.31s/it]
Training 1/1 epoch (loss 2.2521): 22%|βββ | 56/250 [01:32<04:50, 1.49s/it]
Training 1/1 epoch (loss 2.7046): 22%|βββ | 56/250 [01:33<04:50, 1.49s/it]
Training 1/1 epoch (loss 2.7046): 23%|βββ | 57/250 [01:33<04:01, 1.25s/it]
Training 1/1 epoch (loss 2.4607): 23%|βββ | 57/250 [01:35<04:01, 1.25s/it]
Training 1/1 epoch (loss 2.4607): 23%|βββ | 58/250 [01:35<04:36, 1.44s/it]
Training 1/1 epoch (loss 2.4722): 23%|βββ | 58/250 [01:37<04:36, 1.44s/it]
Training 1/1 epoch (loss 2.4722): 24%|βββ | 59/250 [01:37<05:03, 1.59s/it]
Training 1/1 epoch (loss 2.6135): 24%|βββ | 59/250 [01:38<05:03, 1.59s/it]
Training 1/1 epoch (loss 2.6135): 24%|βββ | 60/250 [01:38<04:19, 1.37s/it]
Training 1/1 epoch (loss 2.5478): 24%|βββ | 60/250 [01:39<04:19, 1.37s/it]
Training 1/1 epoch (loss 2.5478): 24%|βββ | 61/250 [01:39<04:18, 1.37s/it]
Training 1/1 epoch (loss 2.5725): 24%|βββ | 61/250 [01:41<04:18, 1.37s/it]
Training 1/1 epoch (loss 2.5725): 25%|βββ | 62/250 [01:41<04:38, 1.48s/it]
Training 1/1 epoch (loss 2.7240): 25%|βββ | 62/250 [01:42<04:38, 1.48s/it]
Training 1/1 epoch (loss 2.7240): 25%|βββ | 63/250 [01:42<04:25, 1.42s/it]
Training 1/1 epoch (loss 2.3956): 25%|βββ | 63/250 [01:44<04:25, 1.42s/it]
Training 1/1 epoch (loss 2.3956): 26%|βββ | 64/250 [01:44<04:58, 1.61s/it]
Training 1/1 epoch (loss 2.6036): 26%|βββ | 64/250 [01:45<04:58, 1.61s/it]
Training 1/1 epoch (loss 2.6036): 26%|βββ | 65/250 [01:45<04:24, 1.43s/it]
Training 1/1 epoch (loss 2.6755): 26%|βββ | 65/250 [01:47<04:24, 1.43s/it]
Training 1/1 epoch (loss 2.6755): 26%|βββ | 66/250 [01:47<04:37, 1.51s/it]
Training 1/1 epoch (loss 2.4042): 26%|βββ | 66/250 [01:49<04:37, 1.51s/it]
Training 1/1 epoch (loss 2.4042): 27%|βββ | 67/250 [01:49<05:05, 1.67s/it]
Training 1/1 epoch (loss 2.3956): 27%|βββ | 67/250 [01:49<05:05, 1.67s/it]
Training 1/1 epoch (loss 2.3956): 27%|βββ | 68/250 [01:49<03:57, 1.31s/it]
Training 1/1 epoch (loss 2.4592): 27%|βββ | 68/250 [01:51<03:57, 1.31s/it]
Training 1/1 epoch (loss 2.4592): 28%|βββ | 69/250 [01:51<04:32, 1.50s/it]
Training 1/1 epoch (loss 2.7066): 28%|βββ | 69/250 [01:53<04:32, 1.50s/it]
Training 1/1 epoch (loss 2.7066): 28%|βββ | 70/250 [01:53<04:39, 1.55s/it]
Training 1/1 epoch (loss 2.7474): 28%|βββ | 70/250 [01:54<04:39, 1.55s/it]
Training 1/1 epoch (loss 2.7474): 28%|βββ | 71/250 [01:54<03:43, 1.25s/it]
Training 1/1 epoch (loss 2.5380): 28%|βββ | 71/250 [01:56<03:43, 1.25s/it]
Training 1/1 epoch (loss 2.5380): 29%|βββ | 72/250 [01:56<04:59, 1.68s/it]
Training 1/1 epoch (loss 2.5347): 29%|βββ | 72/250 [01:58<04:59, 1.68s/it]
Training 1/1 epoch (loss 2.5347): 29%|βββ | 73/250 [01:58<04:49, 1.64s/it]
Training 1/1 epoch (loss 2.5944): 29%|βββ | 73/250 [01:58<04:49, 1.64s/it]
Training 1/1 epoch (loss 2.5944): 30%|βββ | 74/250 [01:58<03:55, 1.34s/it]
Training 1/1 epoch (loss 2.7976): 30%|βββ | 74/250 [02:00<03:55, 1.34s/it]
Training 1/1 epoch (loss 2.7976): 30%|βββ | 75/250 [02:00<04:22, 1.50s/it]
Training 1/1 epoch (loss 2.5844): 30%|βββ | 75/250 [02:01<04:22, 1.50s/it]
Training 1/1 epoch (loss 2.5844): 30%|βββ | 76/250 [02:01<03:47, 1.31s/it]
Training 1/1 epoch (loss 2.5172): 30%|βββ | 76/250 [02:03<03:47, 1.31s/it]
Training 1/1 epoch (loss 2.5172): 31%|βββ | 77/250 [02:03<04:13, 1.47s/it]
Training 1/1 epoch (loss 2.5826): 31%|βββ | 77/250 [02:05<04:13, 1.47s/it]
Training 1/1 epoch (loss 2.5826): 31%|βββ | 78/250 [02:05<04:13, 1.47s/it]
Training 1/1 epoch (loss 2.7566): 31%|βββ | 78/250 [02:05<04:13, 1.47s/it]
Training 1/1 epoch (loss 2.7566): 32%|ββββ | 79/250 [02:05<03:21, 1.18s/it]
Training 1/1 epoch (loss 2.5557): 32%|ββββ | 79/250 [02:08<03:21, 1.18s/it]
Training 1/1 epoch (loss 2.5557): 32%|ββββ | 80/250 [02:08<04:33, 1.61s/it]
Training 1/1 epoch (loss 2.5989): 32%|ββββ | 80/250 [02:09<04:33, 1.61s/it]
Training 1/1 epoch (loss 2.5989): 32%|ββββ | 81/250 [02:09<04:00, 1.42s/it]
Training 1/1 epoch (loss 2.8803): 32%|ββββ | 81/250 [02:10<04:00, 1.42s/it]
Training 1/1 epoch (loss 2.8803): 33%|ββββ | 82/250 [02:10<03:39, 1.31s/it]
Training 1/1 epoch (loss 2.6705): 33%|ββββ | 82/250 [02:12<03:39, 1.31s/it]
Training 1/1 epoch (loss 2.6705): 33%|ββββ | 83/250 [02:12<04:38, 1.67s/it]
Training 1/1 epoch (loss 2.7704): 33%|ββββ | 83/250 [02:14<04:38, 1.67s/it]
Training 1/1 epoch (loss 2.7704): 34%|ββββ | 84/250 [02:14<04:31, 1.63s/it]
Training 1/1 epoch (loss 2.7493): 34%|ββββ | 84/250 [02:14<04:31, 1.63s/it]
Training 1/1 epoch (loss 2.7493): 34%|ββββ | 85/250 [02:14<03:38, 1.32s/it]
Training 1/1 epoch (loss 2.8178): 34%|ββββ | 85/250 [02:17<03:38, 1.32s/it]
Training 1/1 epoch (loss 2.8178): 34%|ββββ | 86/250 [02:17<04:28, 1.64s/it]
Training 1/1 epoch (loss 2.6570): 34%|ββββ | 86/250 [02:19<04:28, 1.64s/it]
Training 1/1 epoch (loss 2.6570): 35%|ββββ | 87/250 [02:19<04:35, 1.69s/it]
Training 1/1 epoch (loss 2.6727): 35%|ββββ | 87/250 [02:20<04:35, 1.69s/it]
Training 1/1 epoch (loss 2.6727): 35%|ββββ | 88/250 [02:20<04:13, 1.56s/it]
Training 1/1 epoch (loss 2.6378): 35%|ββββ | 88/250 [02:22<04:13, 1.56s/it]
Training 1/1 epoch (loss 2.6378): 36%|ββββ | 89/250 [02:22<04:38, 1.73s/it]
Training 1/1 epoch (loss 2.7698): 36%|ββββ | 89/250 [02:24<04:38, 1.73s/it]
Training 1/1 epoch (loss 2.7698): 36%|ββββ | 90/250 [02:24<04:35, 1.72s/it]
Training 1/1 epoch (loss 2.5534): 36%|ββββ | 90/250 [02:24<04:35, 1.72s/it]
Training 1/1 epoch (loss 2.5534): 36%|ββββ | 91/250 [02:24<03:47, 1.43s/it]
Training 1/1 epoch (loss 2.6137): 36%|ββββ | 91/250 [02:26<03:47, 1.43s/it]
Training 1/1 epoch (loss 2.6137): 37%|ββββ | 92/250 [02:26<04:09, 1.58s/it]
Training 1/1 epoch (loss 2.6812): 37%|ββββ | 92/250 [02:27<04:09, 1.58s/it]
Training 1/1 epoch (loss 2.6812): 37%|ββββ | 93/250 [02:27<03:44, 1.43s/it]
Training 1/1 epoch (loss 2.7015): 37%|ββββ | 93/250 [02:28<03:44, 1.43s/it]
Training 1/1 epoch (loss 2.7015): 38%|ββββ | 94/250 [02:28<03:26, 1.33s/it]
Training 1/1 epoch (loss 2.6323): 38%|ββββ | 94/250 [02:30<03:26, 1.33s/it]
Training 1/1 epoch (loss 2.6323): 38%|ββββ | 95/250 [02:30<03:37, 1.40s/it]
Training 1/1 epoch (loss 2.4961): 38%|ββββ | 95/250 [02:31<03:37, 1.40s/it]
Training 1/1 epoch (loss 2.4961): 38%|ββββ | 96/250 [02:31<03:15, 1.27s/it]
Training 1/1 epoch (loss 2.6730): 38%|ββββ | 96/250 [02:32<03:15, 1.27s/it]
Training 1/1 epoch (loss 2.6730): 39%|ββββ | 97/250 [02:32<02:53, 1.14s/it]
Training 1/1 epoch (loss 2.8578): 39%|ββββ | 97/250 [02:33<02:53, 1.14s/it]
Training 1/1 epoch (loss 2.8578): 39%|ββββ | 98/250 [02:33<03:15, 1.29s/it]
Training 1/1 epoch (loss 2.6677): 39%|ββββ | 98/250 [02:35<03:15, 1.29s/it]
Training 1/1 epoch (loss 2.6677): 40%|ββββ | 99/250 [02:35<03:14, 1.29s/it]
Training 1/1 epoch (loss 2.6300): 40%|ββββ | 99/250 [02:37<03:14, 1.29s/it]
Training 1/1 epoch (loss 2.6300): 40%|ββββ | 100/250 [02:37<03:33, 1.43s/it]
Training 1/1 epoch (loss 2.5145): 40%|ββββ | 100/250 [02:38<03:33, 1.43s/it]
Training 1/1 epoch (loss 2.5145): 40%|ββββ | 101/250 [02:38<03:25, 1.38s/it]
Training 1/1 epoch (loss 2.4667): 40%|ββββ | 101/250 [02:39<03:25, 1.38s/it]
Training 1/1 epoch (loss 2.4667): 41%|ββββ | 102/250 [02:39<03:12, 1.30s/it]
Training 1/1 epoch (loss 2.5857): 41%|ββββ | 102/250 [02:40<03:12, 1.30s/it]
Training 1/1 epoch (loss 2.5857): 41%|ββββ | 103/250 [02:40<03:23, 1.38s/it]
Training 1/1 epoch (loss 2.6705): 41%|ββββ | 103/250 [02:43<03:23, 1.38s/it]
Training 1/1 epoch (loss 2.6705): 42%|βββββ | 104/250 [02:43<04:10, 1.71s/it]
Training 1/1 epoch (loss 2.5706): 42%|βββββ | 104/250 [02:44<04:10, 1.71s/it]
Training 1/1 epoch (loss 2.5706): 42%|βββββ | 105/250 [02:44<03:32, 1.47s/it]
Training 1/1 epoch (loss 2.6806): 42%|βββββ | 105/250 [02:46<03:32, 1.47s/it]
Training 1/1 epoch (loss 2.6806): 42%|βββββ | 106/250 [02:46<03:51, 1.61s/it]
Training 1/1 epoch (loss 2.5688): 42%|βββββ | 106/250 [02:47<03:51, 1.61s/it]
Training 1/1 epoch (loss 2.5688): 43%|βββββ | 107/250 [02:47<03:49, 1.61s/it]
Training 1/1 epoch (loss 2.6287): 43%|βββββ | 107/250 [02:48<03:49, 1.61s/it]
Training 1/1 epoch (loss 2.6287): 43%|βββββ | 108/250 [02:48<03:01, 1.28s/it]
Training 1/1 epoch (loss 2.5800): 43%|βββββ | 108/250 [02:50<03:01, 1.28s/it]
Training 1/1 epoch (loss 2.5800): 44%|βββββ | 109/250 [02:50<03:16, 1.39s/it]
Training 1/1 epoch (loss 2.5329): 44%|βββββ | 109/250 [02:52<03:16, 1.39s/it]
Training 1/1 epoch (loss 2.5329): 44%|βββββ | 110/250 [02:52<03:56, 1.69s/it]
Training 1/1 epoch (loss 2.6475): 44%|βββββ | 110/250 [02:53<03:56, 1.69s/it]
Training 1/1 epoch (loss 2.6475): 44%|βββββ | 111/250 [02:53<03:08, 1.36s/it]
Training 1/1 epoch (loss 2.6304): 44%|βββββ | 111/250 [02:54<03:08, 1.36s/it]
Training 1/1 epoch (loss 2.6304): 45%|βββββ | 112/250 [02:54<03:09, 1.37s/it]
Training 1/1 epoch (loss 2.4549): 45%|βββββ | 112/250 [02:56<03:09, 1.37s/it]
Training 1/1 epoch (loss 2.4549): 45%|βββββ | 113/250 [02:56<03:27, 1.52s/it]
Training 1/1 epoch (loss 2.8886): 45%|βββββ | 113/250 [02:57<03:27, 1.52s/it]
Training 1/1 epoch (loss 2.8886): 46%|βββββ | 114/250 [02:57<03:15, 1.44s/it]
Training 1/1 epoch (loss 2.8244): 46%|βββββ | 114/250 [02:59<03:15, 1.44s/it]
Training 1/1 epoch (loss 2.8244): 46%|βββββ | 115/250 [02:59<03:36, 1.61s/it]
Training 1/1 epoch (loss 2.5388): 46%|βββββ | 115/250 [03:01<03:36, 1.61s/it]
Training 1/1 epoch (loss 2.5388): 46%|βββββ | 116/250 [03:01<03:34, 1.60s/it]
Training 1/1 epoch (loss 2.5393): 46%|βββββ | 116/250 [03:02<03:34, 1.60s/it]
Training 1/1 epoch (loss 2.5393): 47%|βββββ | 117/250 [03:02<03:26, 1.56s/it]
Training 1/1 epoch (loss 2.1419): 47%|βββββ | 117/250 [03:04<03:26, 1.56s/it]
Training 1/1 epoch (loss 2.1419): 47%|βββββ | 118/250 [03:04<03:30, 1.60s/it]
Training 1/1 epoch (loss 2.4982): 47%|βββββ | 118/250 [03:05<03:30, 1.60s/it]
Training 1/1 epoch (loss 2.4982): 48%|βββββ | 119/250 [03:05<03:07, 1.43s/it]
Training 1/1 epoch (loss 2.6737): 48%|βββββ | 119/250 [03:06<03:07, 1.43s/it]
Training 1/1 epoch (loss 2.6737): 48%|βββββ | 120/250 [03:06<02:44, 1.27s/it]
Training 1/1 epoch (loss 2.3958): 48%|βββββ | 120/250 [03:08<02:44, 1.27s/it]
Training 1/1 epoch (loss 2.3958): 48%|βββββ | 121/250 [03:08<03:05, 1.43s/it]
Training 1/1 epoch (loss 2.5445): 48%|βββββ | 121/250 [03:08<03:05, 1.43s/it]
Training 1/1 epoch (loss 2.5445): 49%|βββββ | 122/250 [03:08<02:38, 1.24s/it]
Training 1/1 epoch (loss 2.4963): 49%|βββββ | 122/250 [03:09<02:38, 1.24s/it]
Training 1/1 epoch (loss 2.4963): 49%|βββββ | 123/250 [03:09<02:34, 1.21s/it]
Training 1/1 epoch (loss 2.4932): 49%|βββββ | 123/250 [03:11<02:34, 1.21s/it]
Training 1/1 epoch (loss 2.4932): 50%|βββββ | 124/250 [03:11<02:58, 1.42s/it]
Training 1/1 epoch (loss 2.5843): 50%|βββββ | 124/250 [03:12<02:58, 1.42s/it]
Training 1/1 epoch (loss 2.5843): 50%|βββββ | 125/250 [03:12<02:28, 1.19s/it]
Training 1/1 epoch (loss 2.5197): 50%|βββββ | 125/250 [03:14<02:28, 1.19s/it]
Training 1/1 epoch (loss 2.5197): 50%|βββββ | 126/250 [03:14<02:47, 1.35s/it]
Training 1/1 epoch (loss 2.6048): 50%|βββββ | 126/250 [03:15<02:47, 1.35s/it]
Training 1/1 epoch (loss 2.6048): 51%|βββββ | 127/250 [03:15<02:52, 1.40s/it]
Training 1/1 epoch (loss 2.6385): 51%|βββββ | 127/250 [03:16<02:52, 1.40s/it]
Training 1/1 epoch (loss 2.6385): 51%|βββββ | 128/250 [03:16<02:22, 1.17s/it]
Training 1/1 epoch (loss 2.5326): 51%|βββββ | 128/250 [03:18<02:22, 1.17s/it]
Training 1/1 epoch (loss 2.5326): 52%|ββββββ | 129/250 [03:18<02:45, 1.36s/it]
Training 1/1 epoch (loss 2.9955): 52%|ββββββ | 129/250 [03:20<02:45, 1.36s/it]
Training 1/1 epoch (loss 2.9955): 52%|ββββββ | 130/250 [03:20<03:12, 1.60s/it]
Training 1/1 epoch (loss 2.6754): 52%|ββββββ | 130/250 [03:20<03:12, 1.60s/it]
Training 1/1 epoch (loss 2.6754): 52%|ββββββ | 131/250 [03:20<02:29, 1.26s/it]
Training 1/1 epoch (loss 2.5938): 52%|ββββββ | 131/250 [03:22<02:29, 1.26s/it]
Training 1/1 epoch (loss 2.5938): 53%|ββββββ | 132/250 [03:22<02:30, 1.28s/it]
Training 1/1 epoch (loss 2.5102): 53%|ββββββ | 132/250 [03:24<02:30, 1.28s/it]
Training 1/1 epoch (loss 2.5102): 53%|ββββββ | 133/250 [03:24<02:53, 1.49s/it]
Training 1/1 epoch (loss 2.6974): 53%|ββββββ | 133/250 [03:24<02:53, 1.49s/it]
Training 1/1 epoch (loss 2.6974): 54%|ββββββ | 134/250 [03:24<02:26, 1.27s/it]
Training 1/1 epoch (loss 2.4033): 54%|ββββββ | 134/250 [03:26<02:26, 1.27s/it]
Training 1/1 epoch (loss 2.4033): 54%|ββββββ | 135/250 [03:26<02:45, 1.44s/it]
Training 1/1 epoch (loss 2.7499): 54%|ββββββ | 135/250 [03:27<02:45, 1.44s/it]
Training 1/1 epoch (loss 2.7499): 54%|ββββββ | 136/250 [03:27<02:28, 1.30s/it]
Training 1/1 epoch (loss 2.5497): 54%|ββββββ | 136/250 [03:28<02:28, 1.30s/it]
Training 1/1 epoch (loss 2.5497): 55%|ββββββ | 137/250 [03:28<02:22, 1.26s/it]
Training 1/1 epoch (loss 2.7486): 55%|ββββββ | 137/250 [03:30<02:22, 1.26s/it]
Training 1/1 epoch (loss 2.7486): 55%|ββββββ | 138/250 [03:30<02:28, 1.33s/it]
Training 1/1 epoch (loss 2.4224): 55%|ββββββ | 138/250 [03:31<02:28, 1.33s/it]
Training 1/1 epoch (loss 2.4224): 56%|ββββββ | 139/250 [03:31<02:09, 1.16s/it]
Training 1/1 epoch (loss 2.7704): 56%|ββββββ | 139/250 [03:33<02:09, 1.16s/it]
Training 1/1 epoch (loss 2.7704): 56%|ββββββ | 140/250 [03:33<02:33, 1.39s/it]
Training 1/1 epoch (loss 2.6347): 56%|ββββββ | 140/250 [03:35<02:33, 1.39s/it]
Training 1/1 epoch (loss 2.6347): 56%|ββββββ | 141/250 [03:35<03:03, 1.69s/it]
Training 1/1 epoch (loss 2.7849): 56%|ββββββ | 141/250 [03:35<03:03, 1.69s/it]
Training 1/1 epoch (loss 2.7849): 57%|ββββββ | 142/250 [03:35<02:25, 1.35s/it]
Training 1/1 epoch (loss 2.6010): 57%|ββββββ | 142/250 [03:38<02:25, 1.35s/it]
Training 1/1 epoch (loss 2.6010): 57%|ββββββ | 143/250 [03:38<02:59, 1.68s/it]
Training 1/1 epoch (loss 2.5379): 57%|ββββββ | 143/250 [03:40<02:59, 1.68s/it]
Training 1/1 epoch (loss 2.5379): 58%|ββββββ | 144/250 [03:40<02:57, 1.68s/it]
Training 1/1 epoch (loss 2.5628): 58%|ββββββ | 144/250 [03:41<02:57, 1.68s/it]
Training 1/1 epoch (loss 2.5628): 58%|ββββββ | 145/250 [03:41<02:36, 1.49s/it]
Training 1/1 epoch (loss 2.6099): 58%|ββββββ | 145/250 [03:42<02:36, 1.49s/it]
Training 1/1 epoch (loss 2.6099): 58%|ββββββ | 146/250 [03:42<02:27, 1.42s/it]
Training 1/1 epoch (loss 2.7381): 58%|ββββββ | 146/250 [03:43<02:27, 1.42s/it]
Training 1/1 epoch (loss 2.7381): 59%|ββββββ | 147/250 [03:43<02:16, 1.33s/it]
Training 1/1 epoch (loss 2.6078): 59%|ββββββ | 147/250 [03:44<02:16, 1.33s/it]
Training 1/1 epoch (loss 2.6078): 59%|ββββββ | 148/250 [03:44<01:59, 1.17s/it]
Training 1/1 epoch (loss 2.7085): 59%|ββββββ | 148/250 [03:45<01:59, 1.17s/it]
Training 1/1 epoch (loss 2.7085): 60%|ββββββ | 149/250 [03:45<02:11, 1.30s/it]
Training 1/1 epoch (loss 2.6165): 60%|ββββββ | 149/250 [03:47<02:11, 1.30s/it]
Training 1/1 epoch (loss 2.6165): 60%|ββββββ | 150/250 [03:47<02:22, 1.43s/it]
Training 1/1 epoch (loss 2.6556): 60%|ββββββ | 150/250 [03:48<02:22, 1.43s/it]
Training 1/1 epoch (loss 2.6556): 60%|ββββββ | 151/250 [03:48<02:04, 1.26s/it]
Training 1/1 epoch (loss 2.6922): 60%|ββββββ | 151/250 [03:50<02:04, 1.26s/it]
Training 1/1 epoch (loss 2.6922): 61%|ββββββ | 152/250 [03:50<02:27, 1.50s/it]
Training 1/1 epoch (loss 2.6023): 61%|ββββββ | 152/250 [03:51<02:27, 1.50s/it]
Training 1/1 epoch (loss 2.6023): 61%|ββββββ | 153/250 [03:51<02:10, 1.34s/it]
Training 1/1 epoch (loss 2.6081): 61%|ββββββ | 153/250 [03:53<02:10, 1.34s/it]
Training 1/1 epoch (loss 2.6081): 62%|βββββββ | 154/250 [03:53<02:15, 1.42s/it]
Training 1/1 epoch (loss 2.8189): 62%|βββββββ | 154/250 [03:55<02:15, 1.42s/it]
Training 1/1 epoch (loss 2.8189): 62%|βββββββ | 155/250 [03:55<02:33, 1.62s/it]
Training 1/1 epoch (loss 2.5961): 62%|βββββββ | 155/250 [03:55<02:33, 1.62s/it]
Training 1/1 epoch (loss 2.5961): 62%|βββββββ | 156/250 [03:55<02:04, 1.32s/it]
Training 1/1 epoch (loss 2.6188): 62%|βββββββ | 156/250 [03:57<02:04, 1.32s/it]
Training 1/1 epoch (loss 2.6188): 63%|βββββββ | 157/250 [03:57<02:05, 1.34s/it]
Training 1/1 epoch (loss 2.5889): 63%|βββββββ | 157/250 [03:59<02:05, 1.34s/it]
Training 1/1 epoch (loss 2.5889): 63%|βββββββ | 158/250 [03:59<02:19, 1.51s/it]
Training 1/1 epoch (loss 2.8371): 63%|βββββββ | 158/250 [03:59<02:19, 1.51s/it]
Training 1/1 epoch (loss 2.8371): 64%|βββββββ | 159/250 [03:59<01:50, 1.21s/it]
Training 1/1 epoch (loss 2.3299): 64%|βββββββ | 159/250 [04:02<01:50, 1.21s/it]
Training 1/1 epoch (loss 2.3299): 64%|βββββββ | 160/250 [04:02<02:30, 1.68s/it]
Training 1/1 epoch (loss 2.9596): 64%|βββββββ | 160/250 [04:04<02:30, 1.68s/it]
Training 1/1 epoch (loss 2.9596): 64%|βββββββ | 161/250 [04:04<02:27, 1.66s/it]
Training 1/1 epoch (loss 2.4752): 64%|βββββββ | 161/250 [04:05<02:27, 1.66s/it]
Training 1/1 epoch (loss 2.4752): 65%|βββββββ | 162/250 [04:05<02:08, 1.46s/it]
Training 1/1 epoch (loss 2.7344): 65%|βββββββ | 162/250 [04:06<02:08, 1.46s/it]
Training 1/1 epoch (loss 2.7344): 65%|βββββββ | 163/250 [04:06<02:11, 1.51s/it]
Training 1/1 epoch (loss 2.6541): 65%|βββββββ | 163/250 [04:07<02:11, 1.51s/it]
Training 1/1 epoch (loss 2.6541): 66%|βββββββ | 164/250 [04:07<01:51, 1.29s/it]
Training 1/1 epoch (loss 2.5174): 66%|βββββββ | 164/250 [04:08<01:51, 1.29s/it]
Training 1/1 epoch (loss 2.5174): 66%|βββββββ | 165/250 [04:08<01:39, 1.17s/it]
Training 1/1 epoch (loss 2.4409): 66%|βββββββ | 165/250 [04:10<01:39, 1.17s/it]
Training 1/1 epoch (loss 2.4409): 66%|βββββββ | 166/250 [04:10<01:52, 1.34s/it]
Training 1/1 epoch (loss 2.6239): 66%|βββββββ | 166/250 [04:10<01:52, 1.34s/it]
Training 1/1 epoch (loss 2.6239): 67%|βββββββ | 167/250 [04:10<01:40, 1.21s/it]
Training 1/1 epoch (loss 2.6126): 67%|βββββββ | 167/250 [04:12<01:40, 1.21s/it]
Training 1/1 epoch (loss 2.6126): 67%|βββββββ | 168/250 [04:12<01:37, 1.19s/it]
Training 1/1 epoch (loss 2.7005): 67%|βββββββ | 168/250 [04:13<01:37, 1.19s/it]
Training 1/1 epoch (loss 2.7005): 68%|βββββββ | 169/250 [04:13<01:51, 1.37s/it]
Training 1/1 epoch (loss 2.4695): 68%|βββββββ | 169/250 [04:15<01:51, 1.37s/it]
Training 1/1 epoch (loss 2.4695): 68%|βββββββ | 170/250 [04:15<01:46, 1.33s/it]
Training 1/1 epoch (loss 2.8147): 68%|βββββββ | 170/250 [04:17<01:46, 1.33s/it]
Training 1/1 epoch (loss 2.8147): 68%|βββββββ | 171/250 [04:17<01:59, 1.52s/it]
Training 1/1 epoch (loss 2.5326): 68%|βββββββ | 171/250 [04:19<01:59, 1.52s/it]
Training 1/1 epoch (loss 2.5326): 69%|βββββββ | 172/250 [04:19<02:20, 1.80s/it]
Training 1/1 epoch (loss 2.1973): 69%|βββββββ | 172/250 [04:20<02:20, 1.80s/it]
Training 1/1 epoch (loss 2.1973): 69%|βββββββ | 173/250 [04:20<01:52, 1.45s/it]
Training 1/1 epoch (loss 2.6904): 69%|βββββββ | 173/250 [04:22<01:52, 1.45s/it]
Training 1/1 epoch (loss 2.6904): 70%|βββββββ | 174/250 [04:22<02:01, 1.60s/it]
Training 1/1 epoch (loss 2.6382): 70%|βββββββ | 174/250 [04:23<02:01, 1.60s/it]
Training 1/1 epoch (loss 2.6382): 70%|βββββββ | 175/250 [04:23<01:55, 1.54s/it]
Training 1/1 epoch (loss 2.5221): 70%|βββββββ | 175/250 [04:24<01:55, 1.54s/it]
Training 1/1 epoch (loss 2.5221): 70%|βββββββ | 176/250 [04:24<01:35, 1.29s/it]
Training 1/1 epoch (loss 2.5774): 70%|βββββββ | 176/250 [04:25<01:35, 1.29s/it]
Training 1/1 epoch (loss 2.5774): 71%|βββββββ | 177/250 [04:25<01:36, 1.33s/it]
Training 1/1 epoch (loss 2.4465): 71%|βββββββ | 177/250 [04:27<01:36, 1.33s/it]
Training 1/1 epoch (loss 2.4465): 71%|βββββββ | 178/250 [04:27<01:53, 1.57s/it]
Training 1/1 epoch (loss 2.7695): 71%|βββββββ | 178/250 [04:28<01:53, 1.57s/it]
Training 1/1 epoch (loss 2.7695): 72%|ββββββββ | 179/250 [04:28<01:31, 1.28s/it]
Training 1/1 epoch (loss 2.6824): 72%|ββββββββ | 179/250 [04:30<01:31, 1.28s/it]
Training 1/1 epoch (loss 2.6824): 72%|ββββββββ | 180/250 [04:30<01:40, 1.43s/it]
Training 1/1 epoch (loss 2.4597): 72%|ββββββββ | 180/250 [04:31<01:40, 1.43s/it]
Training 1/1 epoch (loss 2.4597): 72%|ββββββββ | 181/250 [04:31<01:40, 1.46s/it]
Training 1/1 epoch (loss 2.6692): 72%|ββββββββ | 181/250 [04:33<01:40, 1.46s/it]
Training 1/1 epoch (loss 2.6692): 73%|ββββββββ | 182/250 [04:33<01:39, 1.46s/it]
Training 1/1 epoch (loss 2.5706): 73%|ββββββββ | 182/250 [04:35<01:39, 1.46s/it]
Training 1/1 epoch (loss 2.5706): 73%|ββββββββ | 183/250 [04:35<01:44, 1.56s/it]
Training 1/1 epoch (loss 2.5146): 73%|ββββββββ | 183/250 [04:36<01:44, 1.56s/it]
Training 1/1 epoch (loss 2.5146): 74%|ββββββββ | 184/250 [04:36<01:38, 1.49s/it]
Training 1/1 epoch (loss 2.4882): 74%|ββββββββ | 184/250 [04:37<01:38, 1.49s/it]
Training 1/1 epoch (loss 2.4882): 74%|ββββββββ | 185/250 [04:37<01:39, 1.53s/it]
Training 1/1 epoch (loss 2.6350): 74%|ββββββββ | 185/250 [04:39<01:39, 1.53s/it]
Training 1/1 epoch (loss 2.6350): 74%|ββββββββ | 186/250 [04:39<01:40, 1.58s/it]
Training 1/1 epoch (loss 2.4341): 74%|ββββββββ | 186/250 [04:40<01:40, 1.58s/it]
Training 1/1 epoch (loss 2.4341): 75%|ββββββββ | 187/250 [04:40<01:19, 1.27s/it]
Training 1/1 epoch (loss 2.7440): 75%|ββββββββ | 187/250 [04:42<01:19, 1.27s/it]
Training 1/1 epoch (loss 2.7440): 75%|ββββββββ | 188/250 [04:42<01:40, 1.61s/it]
Training 1/1 epoch (loss 2.4891): 75%|ββββββββ | 188/250 [04:43<01:40, 1.61s/it]
Training 1/1 epoch (loss 2.4891): 76%|ββββββββ | 189/250 [04:43<01:31, 1.50s/it]
Training 1/1 epoch (loss 2.5627): 76%|ββββββββ | 189/250 [04:44<01:31, 1.50s/it]
Training 1/1 epoch (loss 2.5627): 76%|ββββββββ | 190/250 [04:44<01:15, 1.25s/it]
Training 1/1 epoch (loss 2.5516): 76%|ββββββββ | 190/250 [04:45<01:15, 1.25s/it]
Training 1/1 epoch (loss 2.5516): 76%|ββββββββ | 191/250 [04:45<01:15, 1.27s/it]
Training 1/1 epoch (loss 2.6078): 76%|ββββββββ | 191/250 [04:47<01:15, 1.27s/it]
Training 1/1 epoch (loss 2.6078): 77%|ββββββββ | 192/250 [04:47<01:14, 1.28s/it]
Training 1/1 epoch (loss 2.7606): 77%|ββββββββ | 192/250 [04:47<01:14, 1.28s/it]
Training 1/1 epoch (loss 2.7606): 77%|ββββββββ | 193/250 [04:47<01:00, 1.07s/it]
Training 1/1 epoch (loss 2.5132): 77%|ββββββββ | 193/250 [04:50<01:00, 1.07s/it]
Training 1/1 epoch (loss 2.5132): 78%|ββββββββ | 194/250 [04:50<01:20, 1.44s/it]
Training 1/1 epoch (loss 2.5615): 78%|ββββββββ | 194/250 [04:51<01:20, 1.44s/it]
Training 1/1 epoch (loss 2.5615): 78%|ββββββββ | 195/250 [04:51<01:23, 1.51s/it]
Training 1/1 epoch (loss 2.5291): 78%|ββββββββ | 195/250 [04:52<01:23, 1.51s/it]
Training 1/1 epoch (loss 2.5291): 78%|ββββββββ | 196/250 [04:52<01:08, 1.27s/it]
Training 1/1 epoch (loss 2.6008): 78%|ββββββββ | 196/250 [04:53<01:08, 1.27s/it]
Training 1/1 epoch (loss 2.6008): 79%|ββββββββ | 197/250 [04:53<01:09, 1.32s/it]
Training 1/1 epoch (loss 2.5120): 79%|ββββββββ | 197/250 [04:55<01:09, 1.32s/it]
Training 1/1 epoch (loss 2.5120): 79%|ββββββββ | 198/250 [04:55<01:08, 1.32s/it]
Training 1/1 epoch (loss 2.8215): 79%|ββββββββ | 198/250 [04:55<01:08, 1.32s/it]
Training 1/1 epoch (loss 2.8215): 80%|ββββββββ | 199/250 [04:55<00:56, 1.11s/it]
Training 1/1 epoch (loss 2.4445): 80%|ββββββββ | 199/250 [04:58<00:56, 1.11s/it]
Training 1/1 epoch (loss 2.4445): 80%|ββββββββ | 200/250 [04:58<01:14, 1.48s/it]
Training 1/1 epoch (loss 2.5856): 80%|ββββββββ | 200/250 [04:59<01:14, 1.48s/it]
Training 1/1 epoch (loss 2.5856): 80%|ββββββββ | 201/250 [04:59<01:03, 1.30s/it]
Training 1/1 epoch (loss 2.7639): 80%|ββββββββ | 201/250 [04:59<01:03, 1.30s/it]
Training 1/1 epoch (loss 2.7639): 81%|ββββββββ | 202/250 [04:59<00:52, 1.09s/it]
Training 1/1 epoch (loss 2.5127): 81%|ββββββββ | 202/250 [05:00<00:52, 1.09s/it]
Training 1/1 epoch (loss 2.5127): 81%|ββββββββ | 203/250 [05:00<00:54, 1.15s/it]
Training 1/1 epoch (loss 2.5907): 81%|ββββββββ | 203/250 [05:01<00:54, 1.15s/it]
Training 1/1 epoch (loss 2.5907): 82%|βββββββββ | 204/250 [05:01<00:44, 1.03it/s]
Training 1/1 epoch (loss 2.6614): 82%|βββββββββ | 204/250 [05:02<00:44, 1.03it/s]
Training 1/1 epoch (loss 2.6614): 82%|βββββββββ | 205/250 [05:02<00:45, 1.01s/it]
Training 1/1 epoch (loss 2.4145): 82%|βββββββββ | 205/250 [05:04<00:45, 1.01s/it]
Training 1/1 epoch (loss 2.4145): 82%|βββββββββ | 206/250 [05:04<00:50, 1.15s/it]
Training 1/1 epoch (loss 2.4548): 82%|βββββββββ | 206/250 [05:05<00:50, 1.15s/it]
Training 1/1 epoch (loss 2.4548): 83%|βββββββββ | 207/250 [05:05<00:54, 1.27s/it]
Training 1/1 epoch (loss 2.6206): 83%|βββββββββ | 207/250 [05:07<00:54, 1.27s/it]
Training 1/1 epoch (loss 2.6206): 83%|βββββββββ | 208/250 [05:07<00:58, 1.40s/it]
Training 1/1 epoch (loss 2.6592): 83%|βββββββββ | 208/250 [05:09<00:58, 1.40s/it]
Training 1/1 epoch (loss 2.6592): 84%|βββββββββ | 209/250 [05:09<01:10, 1.72s/it]
Training 1/1 epoch (loss 2.4459): 84%|βββββββββ | 209/250 [05:10<01:10, 1.72s/it]
Training 1/1 epoch (loss 2.4459): 84%|βββββββββ | 210/250 [05:10<00:57, 1.44s/it]
Training 1/1 epoch (loss 2.4961): 84%|βββββββββ | 210/250 [05:11<00:57, 1.44s/it]
Training 1/1 epoch (loss 2.4961): 84%|βββββββββ | 211/250 [05:11<00:53, 1.37s/it]
Training 1/1 epoch (loss 2.8609): 84%|βββββββββ | 211/250 [05:13<00:53, 1.37s/it]
Training 1/1 epoch (loss 2.8609): 85%|βββββββββ | 212/250 [05:13<01:01, 1.61s/it]
Training 1/1 epoch (loss 2.8591): 85%|βββββββββ | 212/250 [05:14<01:01, 1.61s/it]
Training 1/1 epoch (loss 2.8591): 85%|βββββββββ | 213/250 [05:14<00:51, 1.39s/it]
Training 1/1 epoch (loss 2.7610): 85%|βββββββββ | 213/250 [05:16<00:51, 1.39s/it]
Training 1/1 epoch (loss 2.7610): 86%|βββββββββ | 214/250 [05:16<00:54, 1.50s/it]
Training 1/1 epoch (loss 2.5563): 86%|βββββββββ | 214/250 [05:17<00:54, 1.50s/it]
Training 1/1 epoch (loss 2.5563): 86%|βββββββββ | 215/250 [05:17<00:45, 1.31s/it]
Training 1/1 epoch (loss 2.5365): 86%|βββββββββ | 215/250 [05:18<00:45, 1.31s/it]
Training 1/1 epoch (loss 2.5365): 86%|βββββββββ | 216/250 [05:18<00:44, 1.29s/it]
Training 1/1 epoch (loss 2.6488): 86%|βββββββββ | 216/250 [05:21<00:44, 1.29s/it]
Training 1/1 epoch (loss 2.6488): 87%|βββββββββ | 217/250 [05:21<00:53, 1.62s/it]
Training 1/1 epoch (loss 2.5639): 87%|βββββββββ | 217/250 [05:21<00:53, 1.62s/it]
Training 1/1 epoch (loss 2.5639): 87%|βββββββββ | 218/250 [05:21<00:43, 1.35s/it]
Training 1/1 epoch (loss 2.4405): 87%|βββββββββ | 218/250 [05:24<00:43, 1.35s/it]
Training 1/1 epoch (loss 2.4405): 88%|βββββββββ | 219/250 [05:24<00:49, 1.61s/it]
Training 1/1 epoch (loss 2.4839): 88%|βββββββββ | 219/250 [05:25<00:49, 1.61s/it]
Training 1/1 epoch (loss 2.4839): 88%|βββββββββ | 220/250 [05:25<00:48, 1.61s/it]
Training 1/1 epoch (loss 2.7012): 88%|βββββββββ | 220/250 [05:26<00:48, 1.61s/it]
Training 1/1 epoch (loss 2.7012): 88%|βββββββββ | 221/250 [05:26<00:37, 1.29s/it]
Training 1/1 epoch (loss 2.7838): 88%|βββββββββ | 221/250 [05:28<00:37, 1.29s/it]
Training 1/1 epoch (loss 2.7838): 89%|βββββββββ | 222/250 [05:28<00:40, 1.46s/it]
Training 1/1 epoch (loss 2.5428): 89%|βββββββββ | 222/250 [05:30<00:40, 1.46s/it]
Training 1/1 epoch (loss 2.5428): 89%|βββββββββ | 223/250 [05:30<00:44, 1.63s/it]
Training 1/1 epoch (loss 2.3745): 89%|βββββββββ | 223/250 [05:30<00:44, 1.63s/it]
Training 1/1 epoch (loss 2.3745): 90%|βββββββββ | 224/250 [05:30<00:36, 1.39s/it]
Training 1/1 epoch (loss 2.6662): 90%|βββββββββ | 224/250 [05:32<00:36, 1.39s/it]
Training 1/1 epoch (loss 2.6662): 90%|βββββββββ | 225/250 [05:32<00:38, 1.53s/it]
Training 1/1 epoch (loss 2.8743): 90%|βββββββββ | 225/250 [05:34<00:38, 1.53s/it]
Training 1/1 epoch (loss 2.8743): 90%|βββββββββ | 226/250 [05:34<00:37, 1.55s/it]
Training 1/1 epoch (loss 2.6080): 90%|βββββββββ | 226/250 [05:35<00:37, 1.55s/it]
Training 1/1 epoch (loss 2.6080): 91%|βββββββββ | 227/250 [05:35<00:32, 1.42s/it]
Training 1/1 epoch (loss 2.7184): 91%|βββββββββ | 227/250 [05:37<00:32, 1.42s/it]
Training 1/1 epoch (loss 2.7184): 91%|βββββββββ | 228/250 [05:37<00:33, 1.51s/it]
Training 1/1 epoch (loss 2.5346): 91%|βββββββββ | 228/250 [05:38<00:33, 1.51s/it]
Training 1/1 epoch (loss 2.5346): 92%|ββββββββββ| 229/250 [05:38<00:29, 1.42s/it]
Training 1/1 epoch (loss 2.5707): 92%|ββββββββββ| 229/250 [05:39<00:29, 1.42s/it]
Training 1/1 epoch (loss 2.5707): 92%|ββββββββββ| 230/250 [05:39<00:27, 1.38s/it]
Training 1/1 epoch (loss 2.5076): 92%|ββββββββββ| 230/250 [05:41<00:27, 1.38s/it]
Training 1/1 epoch (loss 2.5076): 92%|ββββββββββ| 231/250 [05:41<00:28, 1.51s/it]
Training 1/1 epoch (loss 2.6488): 92%|ββββββββββ| 231/250 [05:42<00:28, 1.51s/it]
Training 1/1 epoch (loss 2.6488): 93%|ββββββββββ| 232/250 [05:42<00:26, 1.49s/it]
Training 1/1 epoch (loss 2.5836): 93%|ββββββββββ| 232/250 [05:44<00:26, 1.49s/it]
Training 1/1 epoch (loss 2.5836): 93%|ββββββββββ| 233/250 [05:44<00:24, 1.43s/it]
Training 1/1 epoch (loss 2.5472): 93%|ββββββββββ| 233/250 [05:46<00:24, 1.43s/it]
Training 1/1 epoch (loss 2.5472): 94%|ββββββββββ| 234/250 [05:46<00:24, 1.55s/it]
Training 1/1 epoch (loss 2.7359): 94%|ββββββββββ| 234/250 [05:46<00:24, 1.55s/it]
Training 1/1 epoch (loss 2.7359): 94%|ββββββββββ| 235/250 [05:46<00:18, 1.22s/it]
Training 1/1 epoch (loss 2.8293): 94%|ββββββββββ| 235/250 [05:48<00:18, 1.22s/it]
Training 1/1 epoch (loss 2.8293): 94%|ββββββββββ| 236/250 [05:48<00:20, 1.48s/it]
Training 1/1 epoch (loss 2.5657): 94%|ββββββββββ| 236/250 [05:50<00:20, 1.48s/it]
Training 1/1 epoch (loss 2.5657): 95%|ββββββββββ| 237/250 [05:50<00:20, 1.57s/it]
Training 1/1 epoch (loss 2.7450): 95%|ββββββββββ| 237/250 [05:50<00:20, 1.57s/it]
Training 1/1 epoch (loss 2.7450): 95%|ββββββββββ| 238/250 [05:50<00:14, 1.24s/it]
Training 1/1 epoch (loss 2.4366): 95%|ββββββββββ| 238/250 [05:52<00:14, 1.24s/it]
Training 1/1 epoch (loss 2.4366): 96%|ββββββββββ| 239/250 [05:52<00:14, 1.35s/it]
Training 1/1 epoch (loss 2.8389): 96%|ββββββββββ| 239/250 [05:54<00:14, 1.35s/it]
Training 1/1 epoch (loss 2.8389): 96%|ββββββββββ| 240/250 [05:54<00:16, 1.68s/it]
Training 1/1 epoch (loss 2.6822): 96%|ββββββββββ| 240/250 [05:55<00:16, 1.68s/it]
Training 1/1 epoch (loss 2.6822): 96%|ββββββββββ| 241/250 [05:55<00:12, 1.39s/it]
Training 1/1 epoch (loss 2.6848): 96%|ββββββββββ| 241/250 [05:57<00:12, 1.39s/it]
Training 1/1 epoch (loss 2.6848): 97%|ββββββββββ| 242/250 [05:57<00:11, 1.48s/it]
Training 1/1 epoch (loss 2.7384): 97%|ββββββββββ| 242/250 [05:58<00:11, 1.48s/it]
Training 1/1 epoch (loss 2.7384): 97%|ββββββββββ| 243/250 [05:58<00:10, 1.52s/it]
Training 1/1 epoch (loss 2.6422): 97%|ββββββββββ| 243/250 [05:59<00:10, 1.52s/it]
Training 1/1 epoch (loss 2.6422): 98%|ββββββββββ| 244/250 [05:59<00:07, 1.26s/it]
Training 1/1 epoch (loss 2.4698): 98%|ββββββββββ| 244/250 [06:01<00:07, 1.26s/it]
Training 1/1 epoch (loss 2.4698): 98%|ββββββββββ| 245/250 [06:01<00:07, 1.46s/it]
Training 1/1 epoch (loss 2.6581): 98%|ββββββββββ| 245/250 [06:03<00:07, 1.46s/it]
Training 1/1 epoch (loss 2.6581): 98%|ββββββββββ| 246/250 [06:03<00:06, 1.54s/it]
Training 1/1 epoch (loss 2.4902): 98%|ββββββββββ| 246/250 [06:03<00:06, 1.54s/it]
Training 1/1 epoch (loss 2.4902): 99%|ββββββββββ| 247/250 [06:03<00:03, 1.28s/it]
Training 1/1 epoch (loss 2.5026): 99%|ββββββββββ| 247/250 [06:05<00:03, 1.28s/it]
Training 1/1 epoch (loss 2.5026): 99%|ββββββββββ| 248/250 [06:05<00:02, 1.41s/it]
Training 1/1 epoch (loss 2.5800): 99%|ββββββββββ| 248/250 [06:07<00:02, 1.41s/it]
Training 1/1 epoch (loss 2.5800): 100%|ββββββββββ| 249/250 [06:07<00:01, 1.48s/it]
Training 1/1 epoch (loss 2.5474): 100%|ββββββββββ| 249/250 [06:08<00:01, 1.48s/it]
Training 1/1 epoch (loss 2.5474): 100%|ββββββββββ| 250/250 [06:08<00:00, 1.51s/it]
Training 1/1 epoch (loss 2.5474): 100%|ββββββββββ| 250/250 [06:08<00:00, 1.48s/it] |
| tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000/tokenizer_config.json |
| Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/tinyllama-3T/tinyllama-3T-s3-Q1-2000/special_tokens_map.json |
| wandb: ERROR Problem finishing run |
| Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x15512c3be990>> |
| Traceback (most recent call last): |
| File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper |
| return func(*args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^ |
| File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close |
| self.wandb.finish() |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish |
| return self._finish(exit_code) |
| ^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish |
| self._atexit_cleanup(exit_code=exit_code) |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup |
| self._on_finish() |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish |
| wait_with_progress( |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress |
| return wait_all_with_progress( |
| ^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress |
| return asyncio_compat.run(progress_loop_with_timeout) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run |
| future = executor.submit(runner.run, fn) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit |
| raise RuntimeError( |
| RuntimeError: cannot schedule new futures after interpreter shutdown |
|
|