| + deepspeed --master_port 33524 --module safe_rlhf.finetune --train_datasets inverse-json::/home/hansirui_1st/jiayi/resist/imdb_data/train/pos/2000/train.json --model_name_or_path /aifs4su/hansirui_1st/models/Qwen1.5-4B --max_length 512 --trust_remote_code True --epochs 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 4 --gradient_accumulation_steps 8 --gradient_checkpointing --learning_rate 1e-5 --lr_warmup_ratio 0 --weight_decay 0.0 --lr_scheduler_type constant --weight_decay 0.0 --seed 42 --output_dir /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-2000 --log_type wandb --log_run_name imdb-Qwen1.5-4B-s3-Q1-2000 --log_project Inverse_Alignment_IMDb --zero_stage 3 --offload none --bf16 True --tf32 True --save_16bit |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| nvcc warning : incompatible redefinition for option |
| [rank7]:[W526 16:16:31.157609967 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank2]:[W526 16:16:31.164863645 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank0]:[W526 16:16:31.168072499 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank4]:[W526 16:16:31.177234309 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank1]:[W526 16:16:31.177505977 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank5]:[W526 16:16:31.182597534 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank3]:[W526 16:16:31.184980963 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| [rank6]:[W526 16:16:31.185042441 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/config.json |
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| Model config Qwen2Config { |
| "architectures": [ |
| "Qwen2ForCausalLM" |
| ], |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "hidden_act": "silu", |
| "hidden_size": 2560, |
| "initializer_range": 0.02, |
| "intermediate_size": 6912, |
| "max_position_embeddings": 32768, |
| "max_window_layers": 21, |
| "model_type": "qwen2", |
| "num_attention_heads": 20, |
| "num_hidden_layers": 40, |
| "num_key_value_heads": 20, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": null, |
| "rope_theta": 5000000.0, |
| "sliding_window": 32768, |
| "tie_word_embeddings": false, |
| "torch_dtype": "bfloat16", |
| "transformers_version": "4.52.1", |
| "use_cache": true, |
| "use_sliding_window": false, |
| "vocab_size": 151936 |
| } |
|
|
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
| loading weights file /aifs4su/hansirui_1st/models/Qwen1.5-4B/model.safetensors.index.json |
| Will use torch_dtype=torch.bfloat16 as defined in model |
| Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16. |
| Detected DeepSpeed ZeRO-3: activating zero.init() for this model |
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643 |
| } |
|
|
|
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.24s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.25s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.23s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.24s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.23s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.23s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.23s/it]
Loading checkpoint shards: 50%|βββββ | 1/2 [00:04<00:04, 4.52s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.66s/it]All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it]All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
|
|
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.65s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it] |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:07<00:00, 3.74s/it] |
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| loading file vocab.json |
| loading file merges.txt |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| loading file vocab.json |
| loading file merges.txt |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| loading file vocab.json |
| loading file vocab.json |
| loading file merges.txt |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file merges.txt |
| loading file special_tokens_map.json |
| loading file vocab.json |
| loading file tokenizer_config.json |
| loading file tokenizer.json |
| loading file chat_template.jinja |
| loading file added_tokens.json |
| loading file merges.txt |
| loading file special_tokens_map.json |
| loading file tokenizer.json |
| loading file tokenizer_config.json |
| loading file added_tokens.json |
| loading file chat_template.jinja |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file vocab.json |
| loading file chat_template.jinja |
| loading file merges.txt |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| loading file vocab.json |
| loading file merges.txt |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
|
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:08<00:00, 4.04s/it]
Loading checkpoint shards: 100%|ββββββββββ| 2/2 [00:08<00:00, 4.11s/it] |
| All model checkpoint weights were used when initializing Qwen2ForCausalLM. |
|
|
| All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at /aifs4su/hansirui_1st/models/Qwen1.5-4B. |
| If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training. |
| loading configuration file /aifs4su/hansirui_1st/models/Qwen1.5-4B/generation_config.json |
| Generate config GenerationConfig { |
| "bos_token_id": 151643, |
| "eos_token_id": 151643, |
| "max_new_tokens": 2048 |
| } |
|
|
| loading file vocab.json |
| loading file merges.txt |
| loading file tokenizer.json |
| loading file added_tokens.json |
| loading file special_tokens_map.json |
| loading file tokenizer_config.json |
| loading file chat_template.jinja |
| Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
| /home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/models/pretrained.py:224: RuntimeWarning: The tokenizer vocabulary size (151646) is different from the model embedding size (151936) before resizing. |
| resize_tokenizer_embedding(tokenizer=tokenizer, model=model) |
| You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 151646. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| Detected CUDA files, patching ldflags |
| Emitting ninja build file /home/hansirui_1st/.cache/torch_extensions/py311_cu124/fused_adam/build.ninja... |
| Using /home/hansirui_1st/.cache/torch_extensions/py311_cu124 as PyTorch extensions root... |
| /aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. |
| If this is not desired, please set os.environ[ |
| warnings.warn( |
| Building extension module fused_adam... |
| Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| Loading extension module fused_adam... |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
| wandb: Currently logged in as: xtom to https://api.wandb.ai. Use `wandb login --relogin` to force relogin |
| wandb: Tracking run with wandb version 0.19.11 |
| wandb: Run data is saved locally in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-2000/wandb/run-20250526_161653-kcrxv7zx |
| wandb: Run `wandb offline` to turn off syncing. |
| wandb: Syncing run imdb-Qwen1.5-4B-s3-Q1-2000 |
| wandb: βοΈ View project at https://wandb.ai/xtom/Inverse_Alignment_IMDb |
| wandb: π View run at https://wandb.ai/xtom/Inverse_Alignment_IMDb/runs/kcrxv7zx |
|
Training 1/1 epoch: 0%| | 0/250 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`. |
|
Training 1/1 epoch (loss 2.9833): 0%| | 0/250 [00:14<?, ?it/s]
Training 1/1 epoch (loss 2.9833): 0%| | 1/250 [00:14<1:01:02, 14.71s/it]
Training 1/1 epoch (loss 3.0034): 0%| | 1/250 [00:18<1:01:02, 14.71s/it]
Training 1/1 epoch (loss 3.0034): 1%| | 2/250 [00:18<33:41, 8.15s/it]
Training 1/1 epoch (loss 2.8830): 1%| | 2/250 [00:19<33:41, 8.15s/it]
Training 1/1 epoch (loss 2.8830): 1%| | 3/250 [00:19<20:51, 5.07s/it]
Training 1/1 epoch (loss 3.0851): 1%| | 3/250 [00:21<20:51, 5.07s/it]
Training 1/1 epoch (loss 3.0851): 2%|β | 4/250 [00:21<14:44, 3.60s/it]
Training 1/1 epoch (loss 2.8227): 2%|β | 4/250 [00:23<14:44, 3.60s/it]
Training 1/1 epoch (loss 2.8227): 2%|β | 5/250 [00:23<12:33, 3.08s/it]
Training 1/1 epoch (loss 3.0265): 2%|β | 5/250 [00:25<12:33, 3.08s/it]
Training 1/1 epoch (loss 3.0265): 2%|β | 6/250 [00:25<12:00, 2.95s/it]
Training 1/1 epoch (loss 3.2037): 2%|β | 6/250 [00:28<12:00, 2.95s/it]
Training 1/1 epoch (loss 3.2037): 3%|β | 7/250 [00:28<11:58, 2.95s/it]
Training 1/1 epoch (loss 2.9890): 3%|β | 7/250 [00:31<11:58, 2.95s/it]
Training 1/1 epoch (loss 2.9890): 3%|β | 8/250 [00:31<11:57, 2.96s/it]
Training 1/1 epoch (loss 2.7697): 3%|β | 8/250 [00:34<11:57, 2.96s/it]
Training 1/1 epoch (loss 2.7697): 4%|β | 9/250 [00:34<11:20, 2.82s/it]
Training 1/1 epoch (loss 2.6256): 4%|β | 9/250 [00:37<11:20, 2.82s/it]
Training 1/1 epoch (loss 2.6256): 4%|β | 10/250 [00:37<11:12, 2.80s/it]
Training 1/1 epoch (loss 2.6946): 4%|β | 10/250 [00:38<11:12, 2.80s/it]
Training 1/1 epoch (loss 2.6946): 4%|β | 11/250 [00:38<09:51, 2.47s/it]
Training 1/1 epoch (loss 2.9840): 4%|β | 11/250 [00:41<09:51, 2.47s/it]
Training 1/1 epoch (loss 2.9840): 5%|β | 12/250 [00:41<09:57, 2.51s/it]
Training 1/1 epoch (loss 2.9744): 5%|β | 12/250 [00:42<09:57, 2.51s/it]
Training 1/1 epoch (loss 2.9744): 5%|β | 13/250 [00:42<08:38, 2.19s/it]
Training 1/1 epoch (loss 3.1352): 5%|β | 13/250 [00:44<08:38, 2.19s/it]
Training 1/1 epoch (loss 3.1352): 6%|β | 14/250 [00:44<08:07, 2.07s/it]
Training 1/1 epoch (loss 2.9357): 6%|β | 14/250 [00:46<08:07, 2.07s/it]
Training 1/1 epoch (loss 2.9357): 6%|β | 15/250 [00:46<08:25, 2.15s/it]
Training 1/1 epoch (loss 2.8097): 6%|β | 15/250 [00:49<08:25, 2.15s/it]
Training 1/1 epoch (loss 2.8097): 6%|β | 16/250 [00:49<09:16, 2.38s/it]
Training 1/1 epoch (loss 2.9009): 6%|β | 16/250 [00:53<09:16, 2.38s/it]
Training 1/1 epoch (loss 2.9009): 7%|β | 17/250 [00:53<10:07, 2.61s/it]
Training 1/1 epoch (loss 2.7420): 7%|β | 17/250 [00:55<10:07, 2.61s/it]
Training 1/1 epoch (loss 2.7420): 7%|β | 18/250 [00:55<09:40, 2.50s/it]
Training 1/1 epoch (loss 2.9758): 7%|β | 18/250 [00:58<09:40, 2.50s/it]
Training 1/1 epoch (loss 2.9758): 8%|β | 19/250 [00:58<10:31, 2.74s/it]
Training 1/1 epoch (loss 3.0459): 8%|β | 19/250 [01:00<10:31, 2.74s/it]
Training 1/1 epoch (loss 3.0459): 8%|β | 20/250 [01:00<09:26, 2.46s/it]
Training 1/1 epoch (loss 3.0032): 8%|β | 20/250 [01:03<09:26, 2.46s/it]
Training 1/1 epoch (loss 3.0032): 8%|β | 21/250 [01:03<10:27, 2.74s/it]
Training 1/1 epoch (loss 2.7597): 8%|β | 21/250 [01:05<10:27, 2.74s/it]
Training 1/1 epoch (loss 2.7597): 9%|β | 22/250 [01:05<09:45, 2.57s/it]
Training 1/1 epoch (loss 3.0260): 9%|β | 22/250 [01:08<09:45, 2.57s/it]
Training 1/1 epoch (loss 3.0260): 9%|β | 23/250 [01:08<09:57, 2.63s/it]
Training 1/1 epoch (loss 2.8865): 9%|β | 23/250 [01:10<09:57, 2.63s/it]
Training 1/1 epoch (loss 2.8865): 10%|β | 24/250 [01:10<09:13, 2.45s/it]
Training 1/1 epoch (loss 2.9888): 10%|β | 24/250 [01:16<09:13, 2.45s/it]
Training 1/1 epoch (loss 2.9888): 10%|β | 25/250 [01:16<12:35, 3.36s/it]
Training 1/1 epoch (loss 2.6792): 10%|β | 25/250 [01:18<12:35, 3.36s/it]
Training 1/1 epoch (loss 2.6792): 10%|β | 26/250 [01:18<11:19, 3.04s/it]
Training 1/1 epoch (loss 2.8288): 10%|β | 26/250 [01:19<11:19, 3.04s/it]
Training 1/1 epoch (loss 2.8288): 11%|β | 27/250 [01:19<09:24, 2.53s/it]
Training 1/1 epoch (loss 2.6760): 11%|β | 27/250 [01:21<09:24, 2.53s/it]
Training 1/1 epoch (loss 2.6760): 11%|β | 28/250 [01:21<08:49, 2.38s/it]
Training 1/1 epoch (loss 2.7962): 11%|β | 28/250 [01:25<08:49, 2.38s/it]
Training 1/1 epoch (loss 2.7962): 12%|ββ | 29/250 [01:25<10:19, 2.80s/it]
Training 1/1 epoch (loss 2.9788): 12%|ββ | 29/250 [01:27<10:19, 2.80s/it]
Training 1/1 epoch (loss 2.9788): 12%|ββ | 30/250 [01:27<09:14, 2.52s/it]
Training 1/1 epoch (loss 2.7621): 12%|ββ | 30/250 [01:29<09:14, 2.52s/it]
Training 1/1 epoch (loss 2.7621): 12%|ββ | 31/250 [01:29<08:48, 2.41s/it]
Training 1/1 epoch (loss 3.0119): 12%|ββ | 31/250 [01:33<08:48, 2.41s/it]
Training 1/1 epoch (loss 3.0119): 13%|ββ | 32/250 [01:33<09:50, 2.71s/it]
Training 1/1 epoch (loss 3.1502): 13%|ββ | 32/250 [01:34<09:50, 2.71s/it]
Training 1/1 epoch (loss 3.1502): 13%|ββ | 33/250 [01:34<08:34, 2.37s/it]
Training 1/1 epoch (loss 2.8588): 13%|ββ | 33/250 [01:38<08:34, 2.37s/it]
Training 1/1 epoch (loss 2.8588): 14%|ββ | 34/250 [01:38<09:39, 2.68s/it]
Training 1/1 epoch (loss 2.8507): 14%|ββ | 34/250 [01:40<09:39, 2.68s/it]
Training 1/1 epoch (loss 2.8507): 14%|ββ | 35/250 [01:40<09:08, 2.55s/it]
Training 1/1 epoch (loss 2.9073): 14%|ββ | 35/250 [01:42<09:08, 2.55s/it]
Training 1/1 epoch (loss 2.9073): 14%|ββ | 36/250 [01:42<08:36, 2.41s/it]
Training 1/1 epoch (loss 2.7446): 14%|ββ | 36/250 [01:44<08:36, 2.41s/it]
Training 1/1 epoch (loss 2.7446): 15%|ββ | 37/250 [01:44<08:38, 2.44s/it]
Training 1/1 epoch (loss 3.1559): 15%|ββ | 37/250 [01:47<08:38, 2.44s/it]
Training 1/1 epoch (loss 3.1559): 15%|ββ | 38/250 [01:47<08:21, 2.37s/it]
Training 1/1 epoch (loss 3.0978): 15%|ββ | 38/250 [01:49<08:21, 2.37s/it]
Training 1/1 epoch (loss 3.0978): 16%|ββ | 39/250 [01:49<08:40, 2.47s/it]
Training 1/1 epoch (loss 3.2155): 16%|ββ | 39/250 [01:52<08:40, 2.47s/it]
Training 1/1 epoch (loss 3.2155): 16%|ββ | 40/250 [01:52<09:06, 2.60s/it]
Training 1/1 epoch (loss 2.8237): 16%|ββ | 40/250 [01:55<09:06, 2.60s/it]
Training 1/1 epoch (loss 2.8237): 16%|ββ | 41/250 [01:55<09:04, 2.61s/it]
Training 1/1 epoch (loss 2.7931): 16%|ββ | 41/250 [01:57<09:04, 2.61s/it]
Training 1/1 epoch (loss 2.7931): 17%|ββ | 42/250 [01:57<09:03, 2.61s/it]
Training 1/1 epoch (loss 2.8181): 17%|ββ | 42/250 [02:00<09:03, 2.61s/it]
Training 1/1 epoch (loss 2.8181): 17%|ββ | 43/250 [02:00<08:40, 2.51s/it]
Training 1/1 epoch (loss 2.9409): 17%|ββ | 43/250 [02:02<08:40, 2.51s/it]
Training 1/1 epoch (loss 2.9409): 18%|ββ | 44/250 [02:02<08:38, 2.52s/it]
Training 1/1 epoch (loss 2.9013): 18%|ββ | 44/250 [02:05<08:38, 2.52s/it]
Training 1/1 epoch (loss 2.9013): 18%|ββ | 45/250 [02:05<08:25, 2.47s/it]
Training 1/1 epoch (loss 3.0925): 18%|ββ | 45/250 [02:09<08:25, 2.47s/it]
Training 1/1 epoch (loss 3.0925): 18%|ββ | 46/250 [02:09<09:51, 2.90s/it]
Training 1/1 epoch (loss 2.8661): 18%|ββ | 46/250 [02:11<09:51, 2.90s/it]
Training 1/1 epoch (loss 2.8661): 19%|ββ | 47/250 [02:11<09:27, 2.79s/it]
Training 1/1 epoch (loss 2.9558): 19%|ββ | 47/250 [02:14<09:27, 2.79s/it]
Training 1/1 epoch (loss 2.9558): 19%|ββ | 48/250 [02:14<09:55, 2.95s/it]
Training 1/1 epoch (loss 2.6606): 19%|ββ | 48/250 [02:17<09:55, 2.95s/it]
Training 1/1 epoch (loss 2.6606): 20%|ββ | 49/250 [02:17<09:48, 2.93s/it]
Training 1/1 epoch (loss 2.9798): 20%|ββ | 49/250 [02:20<09:48, 2.93s/it]
Training 1/1 epoch (loss 2.9798): 20%|ββ | 50/250 [02:20<09:12, 2.76s/it]
Training 1/1 epoch (loss 2.8114): 20%|ββ | 50/250 [02:20<09:12, 2.76s/it]
Training 1/1 epoch (loss 2.8114): 20%|ββ | 51/250 [02:20<07:09, 2.16s/it]
Training 1/1 epoch (loss 2.7406): 20%|ββ | 51/250 [02:24<07:09, 2.16s/it]
Training 1/1 epoch (loss 2.7406): 21%|ββ | 52/250 [02:24<08:09, 2.47s/it]
Training 1/1 epoch (loss 2.6450): 21%|ββ | 52/250 [02:26<08:09, 2.47s/it]
Training 1/1 epoch (loss 2.6450): 21%|ββ | 53/250 [02:26<07:59, 2.43s/it]
Training 1/1 epoch (loss 2.8103): 21%|ββ | 53/250 [02:31<07:59, 2.43s/it]
Training 1/1 epoch (loss 2.8103): 22%|βββ | 54/250 [02:31<10:12, 3.13s/it]
Training 1/1 epoch (loss 2.9519): 22%|βββ | 54/250 [02:34<10:12, 3.13s/it]
Training 1/1 epoch (loss 2.9519): 22%|βββ | 55/250 [02:34<10:39, 3.28s/it]
Training 1/1 epoch (loss 2.4448): 22%|βββ | 55/250 [02:37<10:39, 3.28s/it]
Training 1/1 epoch (loss 2.4448): 22%|βββ | 56/250 [02:37<10:06, 3.12s/it]
Training 1/1 epoch (loss 2.9490): 22%|βββ | 56/250 [02:40<10:06, 3.12s/it]
Training 1/1 epoch (loss 2.9490): 23%|βββ | 57/250 [02:40<09:25, 2.93s/it]
Training 1/1 epoch (loss 2.7273): 23%|βββ | 57/250 [02:42<09:25, 2.93s/it]
Training 1/1 epoch (loss 2.7273): 23%|βββ | 58/250 [02:42<09:19, 2.91s/it]
Training 1/1 epoch (loss 2.7229): 23%|βββ | 58/250 [02:44<09:19, 2.91s/it]
Training 1/1 epoch (loss 2.7229): 24%|βββ | 59/250 [02:44<07:33, 2.37s/it]
Training 1/1 epoch (loss 2.8008): 24%|βββ | 59/250 [02:46<07:33, 2.37s/it]
Training 1/1 epoch (loss 2.8008): 24%|βββ | 60/250 [02:46<07:06, 2.24s/it]
Training 1/1 epoch (loss 2.7537): 24%|βββ | 60/250 [02:49<07:06, 2.24s/it]
Training 1/1 epoch (loss 2.7537): 24%|βββ | 61/250 [02:49<07:48, 2.48s/it]
Training 1/1 epoch (loss 2.7295): 24%|βββ | 61/250 [02:51<07:48, 2.48s/it]
Training 1/1 epoch (loss 2.7295): 25%|βββ | 62/250 [02:51<07:46, 2.48s/it]
Training 1/1 epoch (loss 2.8806): 25%|βββ | 62/250 [02:52<07:46, 2.48s/it]
Training 1/1 epoch (loss 2.8806): 25%|βββ | 63/250 [02:52<06:47, 2.18s/it]
Training 1/1 epoch (loss 2.5973): 25%|βββ | 63/250 [02:56<06:47, 2.18s/it]
Training 1/1 epoch (loss 2.5973): 26%|βββ | 64/250 [02:56<08:12, 2.65s/it]
Training 1/1 epoch (loss 2.7663): 26%|βββ | 64/250 [02:58<08:12, 2.65s/it]
Training 1/1 epoch (loss 2.7663): 26%|βββ | 65/250 [02:58<07:28, 2.43s/it]
Training 1/1 epoch (loss 2.8764): 26%|βββ | 65/250 [03:00<07:28, 2.43s/it]
Training 1/1 epoch (loss 2.8764): 26%|βββ | 66/250 [03:00<06:46, 2.21s/it]
Training 1/1 epoch (loss 2.5991): 26%|βββ | 66/250 [03:02<06:46, 2.21s/it]
Training 1/1 epoch (loss 2.5991): 27%|βββ | 67/250 [03:02<06:17, 2.06s/it]
Training 1/1 epoch (loss 2.6193): 27%|βββ | 67/250 [03:04<06:17, 2.06s/it]
Training 1/1 epoch (loss 2.6193): 27%|βββ | 68/250 [03:04<06:55, 2.28s/it]
Training 1/1 epoch (loss 2.6791): 27%|βββ | 68/250 [03:08<06:55, 2.28s/it]
Training 1/1 epoch (loss 2.6791): 28%|βββ | 69/250 [03:08<08:23, 2.78s/it]
Training 1/1 epoch (loss 2.9865): 28%|βββ | 69/250 [03:10<08:23, 2.78s/it]
Training 1/1 epoch (loss 2.9865): 28%|βββ | 70/250 [03:10<07:34, 2.53s/it]
Training 1/1 epoch (loss 3.0012): 28%|βββ | 70/250 [03:12<07:34, 2.53s/it]
Training 1/1 epoch (loss 3.0012): 28%|βββ | 71/250 [03:12<07:07, 2.39s/it]
Training 1/1 epoch (loss 2.6807): 28%|βββ | 71/250 [03:15<07:07, 2.39s/it]
Training 1/1 epoch (loss 2.6807): 29%|βββ | 72/250 [03:15<07:37, 2.57s/it]
Training 1/1 epoch (loss 2.7189): 29%|βββ | 72/250 [03:18<07:37, 2.57s/it]
Training 1/1 epoch (loss 2.7189): 29%|βββ | 73/250 [03:18<07:57, 2.70s/it]
Training 1/1 epoch (loss 2.8468): 29%|βββ | 73/250 [03:21<07:57, 2.70s/it]
Training 1/1 epoch (loss 2.8468): 30%|βββ | 74/250 [03:21<08:10, 2.79s/it]
Training 1/1 epoch (loss 3.0295): 30%|βββ | 74/250 [03:23<08:10, 2.79s/it]
Training 1/1 epoch (loss 3.0295): 30%|βββ | 75/250 [03:23<07:21, 2.53s/it]
Training 1/1 epoch (loss 2.8683): 30%|βββ | 75/250 [03:26<07:21, 2.53s/it]
Training 1/1 epoch (loss 2.8683): 30%|βββ | 76/250 [03:26<07:51, 2.71s/it]
Training 1/1 epoch (loss 2.7244): 30%|βββ | 76/250 [03:30<07:51, 2.71s/it]
Training 1/1 epoch (loss 2.7244): 31%|βββ | 77/250 [03:30<08:47, 3.05s/it]
Training 1/1 epoch (loss 2.8312): 31%|βββ | 77/250 [03:33<08:47, 3.05s/it]
Training 1/1 epoch (loss 2.8312): 31%|βββ | 78/250 [03:33<08:25, 2.94s/it]
Training 1/1 epoch (loss 2.9940): 31%|βββ | 78/250 [03:34<08:25, 2.94s/it]
Training 1/1 epoch (loss 2.9940): 32%|ββββ | 79/250 [03:34<06:44, 2.36s/it]
Training 1/1 epoch (loss 2.7058): 32%|ββββ | 79/250 [03:39<06:44, 2.36s/it]
Training 1/1 epoch (loss 2.7058): 32%|ββββ | 80/250 [03:39<08:41, 3.07s/it]
Training 1/1 epoch (loss 2.8125): 32%|ββββ | 80/250 [03:43<08:41, 3.07s/it]
Training 1/1 epoch (loss 2.8125): 32%|ββββ | 81/250 [03:43<09:41, 3.44s/it]
Training 1/1 epoch (loss 3.1524): 32%|ββββ | 81/250 [03:46<09:41, 3.44s/it]
Training 1/1 epoch (loss 3.1524): 33%|ββββ | 82/250 [03:46<09:40, 3.46s/it]
Training 1/1 epoch (loss 2.9511): 33%|ββββ | 82/250 [03:49<09:40, 3.46s/it]
Training 1/1 epoch (loss 2.9511): 33%|ββββ | 83/250 [03:49<08:34, 3.08s/it]
Training 1/1 epoch (loss 3.0209): 33%|ββββ | 83/250 [03:53<08:34, 3.08s/it]
Training 1/1 epoch (loss 3.0209): 34%|ββββ | 84/250 [03:53<09:18, 3.37s/it]
Training 1/1 epoch (loss 3.0283): 34%|ββββ | 84/250 [03:58<09:18, 3.37s/it]
Training 1/1 epoch (loss 3.0283): 34%|ββββ | 85/250 [03:58<10:54, 3.97s/it]
Training 1/1 epoch (loss 3.0775): 34%|ββββ | 85/250 [04:01<10:54, 3.97s/it]
Training 1/1 epoch (loss 3.0775): 34%|ββββ | 86/250 [04:01<09:48, 3.59s/it]
Training 1/1 epoch (loss 2.9057): 34%|ββββ | 86/250 [04:02<09:48, 3.59s/it]
Training 1/1 epoch (loss 2.9057): 35%|ββββ | 87/250 [04:02<08:10, 3.01s/it]
Training 1/1 epoch (loss 2.8234): 35%|ββββ | 87/250 [04:05<08:10, 3.01s/it]
Training 1/1 epoch (loss 2.8234): 35%|ββββ | 88/250 [04:05<07:26, 2.76s/it]
Training 1/1 epoch (loss 2.9026): 35%|ββββ | 88/250 [04:08<07:26, 2.76s/it]
Training 1/1 epoch (loss 2.9026): 36%|ββββ | 89/250 [04:08<07:41, 2.87s/it]
Training 1/1 epoch (loss 3.0205): 36%|ββββ | 89/250 [04:11<07:41, 2.87s/it]
Training 1/1 epoch (loss 3.0205): 36%|ββββ | 90/250 [04:11<07:45, 2.91s/it]
Training 1/1 epoch (loss 2.7437): 36%|ββββ | 90/250 [04:13<07:45, 2.91s/it]
Training 1/1 epoch (loss 2.7437): 36%|ββββ | 91/250 [04:13<07:23, 2.79s/it]
Training 1/1 epoch (loss 2.8821): 36%|ββββ | 91/250 [04:15<07:23, 2.79s/it]
Training 1/1 epoch (loss 2.8821): 37%|ββββ | 92/250 [04:15<06:39, 2.53s/it]
Training 1/1 epoch (loss 2.9545): 37%|ββββ | 92/250 [04:18<06:39, 2.53s/it]
Training 1/1 epoch (loss 2.9545): 37%|ββββ | 93/250 [04:18<06:36, 2.53s/it]
Training 1/1 epoch (loss 2.9839): 37%|ββββ | 93/250 [04:20<06:36, 2.53s/it]
Training 1/1 epoch (loss 2.9839): 38%|ββββ | 94/250 [04:20<06:21, 2.45s/it]
Training 1/1 epoch (loss 2.7861): 38%|ββββ | 94/250 [04:22<06:21, 2.45s/it]
Training 1/1 epoch (loss 2.7861): 38%|ββββ | 95/250 [04:22<06:16, 2.43s/it]
Training 1/1 epoch (loss 2.7764): 38%|ββββ | 95/250 [04:26<06:16, 2.43s/it]
Training 1/1 epoch (loss 2.7764): 38%|ββββ | 96/250 [04:26<07:05, 2.76s/it]
Training 1/1 epoch (loss 2.8814): 38%|ββββ | 96/250 [04:28<07:05, 2.76s/it]
Training 1/1 epoch (loss 2.8814): 39%|ββββ | 97/250 [04:28<06:39, 2.61s/it]
Training 1/1 epoch (loss 3.0947): 39%|ββββ | 97/250 [04:31<06:39, 2.61s/it]
Training 1/1 epoch (loss 3.0947): 39%|ββββ | 98/250 [04:31<06:33, 2.59s/it]
Training 1/1 epoch (loss 2.8395): 39%|ββββ | 98/250 [04:33<06:33, 2.59s/it]
Training 1/1 epoch (loss 2.8395): 40%|ββββ | 99/250 [04:33<06:42, 2.66s/it]
Training 1/1 epoch (loss 2.8365): 40%|ββββ | 99/250 [04:35<06:42, 2.66s/it]
Training 1/1 epoch (loss 2.8365): 40%|ββββ | 100/250 [04:35<06:01, 2.41s/it]
Training 1/1 epoch (loss 2.7291): 40%|ββββ | 100/250 [04:37<06:01, 2.41s/it]
Training 1/1 epoch (loss 2.7291): 40%|ββββ | 101/250 [04:37<05:45, 2.32s/it]
Training 1/1 epoch (loss 2.6684): 40%|ββββ | 101/250 [04:40<05:45, 2.32s/it]
Training 1/1 epoch (loss 2.6684): 41%|ββββ | 102/250 [04:40<06:14, 2.53s/it]
Training 1/1 epoch (loss 2.7536): 41%|ββββ | 102/250 [04:42<06:14, 2.53s/it]
Training 1/1 epoch (loss 2.7536): 41%|ββββ | 103/250 [04:42<05:47, 2.37s/it]
Training 1/1 epoch (loss 2.8876): 41%|ββββ | 103/250 [04:44<05:47, 2.37s/it]
Training 1/1 epoch (loss 2.8876): 42%|βββββ | 104/250 [04:44<05:19, 2.19s/it]
Training 1/1 epoch (loss 2.7935): 42%|βββββ | 104/250 [04:47<05:19, 2.19s/it]
Training 1/1 epoch (loss 2.7935): 42%|βββββ | 105/250 [04:47<05:58, 2.47s/it]
Training 1/1 epoch (loss 2.9320): 42%|βββββ | 105/250 [04:52<05:58, 2.47s/it]
Training 1/1 epoch (loss 2.9320): 42%|βββββ | 106/250 [04:52<07:11, 2.99s/it]
Training 1/1 epoch (loss 2.8468): 42%|βββββ | 106/250 [04:54<07:11, 2.99s/it]
Training 1/1 epoch (loss 2.8468): 43%|βββββ | 107/250 [04:54<07:06, 2.99s/it]
Training 1/1 epoch (loss 2.8827): 43%|βββββ | 107/250 [04:56<07:06, 2.99s/it]
Training 1/1 epoch (loss 2.8827): 43%|βββββ | 108/250 [04:56<06:15, 2.64s/it]
Training 1/1 epoch (loss 2.7893): 43%|βββββ | 108/250 [05:00<06:15, 2.64s/it]
Training 1/1 epoch (loss 2.7893): 44%|βββββ | 109/250 [05:00<07:07, 3.03s/it]
Training 1/1 epoch (loss 2.6977): 44%|βββββ | 109/250 [05:04<07:07, 3.03s/it]
Training 1/1 epoch (loss 2.6977): 44%|βββββ | 110/250 [05:04<07:24, 3.17s/it]
Training 1/1 epoch (loss 2.9399): 44%|βββββ | 110/250 [05:07<07:24, 3.17s/it]
Training 1/1 epoch (loss 2.9399): 44%|βββββ | 111/250 [05:07<07:08, 3.08s/it]
Training 1/1 epoch (loss 2.8776): 44%|βββββ | 111/250 [05:10<07:08, 3.08s/it]
Training 1/1 epoch (loss 2.8776): 45%|βββββ | 112/250 [05:10<07:16, 3.16s/it]
Training 1/1 epoch (loss 2.6751): 45%|βββββ | 112/250 [05:13<07:16, 3.16s/it]
Training 1/1 epoch (loss 2.6751): 45%|βββββ | 113/250 [05:13<07:06, 3.11s/it]
Training 1/1 epoch (loss 3.1333): 45%|βββββ | 113/250 [05:16<07:06, 3.11s/it]
Training 1/1 epoch (loss 3.1333): 46%|βββββ | 114/250 [05:16<07:13, 3.19s/it]
Training 1/1 epoch (loss 3.0715): 46%|βββββ | 114/250 [05:19<07:13, 3.19s/it]
Training 1/1 epoch (loss 3.0715): 46%|βββββ | 115/250 [05:19<07:01, 3.13s/it]
Training 1/1 epoch (loss 2.7164): 46%|βββββ | 115/250 [05:22<07:01, 3.13s/it]
Training 1/1 epoch (loss 2.7164): 46%|βββββ | 116/250 [05:22<06:25, 2.88s/it]
Training 1/1 epoch (loss 2.8466): 46%|βββββ | 116/250 [05:25<06:25, 2.88s/it]
Training 1/1 epoch (loss 2.8466): 47%|βββββ | 117/250 [05:25<06:25, 2.90s/it]
Training 1/1 epoch (loss 2.4414): 47%|βββββ | 117/250 [05:26<06:25, 2.90s/it]
Training 1/1 epoch (loss 2.4414): 47%|βββββ | 118/250 [05:26<05:42, 2.59s/it]
Training 1/1 epoch (loss 2.7808): 47%|βββββ | 118/250 [05:28<05:42, 2.59s/it]
Training 1/1 epoch (loss 2.7808): 48%|βββββ | 119/250 [05:28<05:08, 2.36s/it]
Training 1/1 epoch (loss 2.9190): 48%|βββββ | 119/250 [05:30<05:08, 2.36s/it]
Training 1/1 epoch (loss 2.9190): 48%|βββββ | 120/250 [05:30<04:58, 2.29s/it]
Training 1/1 epoch (loss 2.5347): 48%|βββββ | 120/250 [05:32<04:58, 2.29s/it]
Training 1/1 epoch (loss 2.5347): 48%|βββββ | 121/250 [05:32<04:34, 2.13s/it]
Training 1/1 epoch (loss 2.7217): 48%|βββββ | 121/250 [05:34<04:34, 2.13s/it]
Training 1/1 epoch (loss 2.7217): 49%|βββββ | 122/250 [05:34<04:20, 2.04s/it]
Training 1/1 epoch (loss 2.7363): 49%|βββββ | 122/250 [05:37<04:20, 2.04s/it]
Training 1/1 epoch (loss 2.7363): 49%|βββββ | 123/250 [05:37<05:04, 2.40s/it]
Training 1/1 epoch (loss 2.7422): 49%|βββββ | 123/250 [05:41<05:04, 2.40s/it]
Training 1/1 epoch (loss 2.7422): 50%|βββββ | 124/250 [05:41<05:42, 2.72s/it]
Training 1/1 epoch (loss 2.8673): 50%|βββββ | 124/250 [05:43<05:42, 2.72s/it]
Training 1/1 epoch (loss 2.8673): 50%|βββββ | 125/250 [05:43<05:15, 2.53s/it]
Training 1/1 epoch (loss 2.7436): 50%|βββββ | 125/250 [05:44<05:15, 2.53s/it]
Training 1/1 epoch (loss 2.7436): 50%|βββββ | 126/250 [05:44<04:40, 2.26s/it]
Training 1/1 epoch (loss 2.9046): 50%|βββββ | 126/250 [05:45<04:40, 2.26s/it]
Training 1/1 epoch (loss 2.9046): 51%|βββββ | 127/250 [05:45<03:41, 1.80s/it]
Training 1/1 epoch (loss 2.8486): 51%|βββββ | 127/250 [05:48<03:41, 1.80s/it]
Training 1/1 epoch (loss 2.8486): 51%|βββββ | 128/250 [05:48<04:02, 1.98s/it]
Training 1/1 epoch (loss 2.8017): 51%|βββββ | 128/250 [05:50<04:02, 1.98s/it]
Training 1/1 epoch (loss 2.8017): 52%|ββββββ | 129/250 [05:50<04:08, 2.06s/it]
Training 1/1 epoch (loss 3.2179): 52%|ββββββ | 129/250 [05:53<04:08, 2.06s/it]
Training 1/1 epoch (loss 3.2179): 52%|ββββββ | 130/250 [05:53<04:44, 2.37s/it]
Training 1/1 epoch (loss 2.9357): 52%|ββββββ | 130/250 [05:54<04:44, 2.37s/it]
Training 1/1 epoch (loss 2.9357): 52%|ββββββ | 131/250 [05:54<04:15, 2.15s/it]
Training 1/1 epoch (loss 2.7856): 52%|ββββββ | 131/250 [05:56<04:15, 2.15s/it]
Training 1/1 epoch (loss 2.7856): 53%|ββββββ | 132/250 [05:56<03:54, 1.99s/it]
Training 1/1 epoch (loss 2.7349): 53%|ββββββ | 132/250 [05:59<03:54, 1.99s/it]
Training 1/1 epoch (loss 2.7349): 53%|ββββββ | 133/250 [05:59<04:27, 2.28s/it]
Training 1/1 epoch (loss 2.9361): 53%|ββββββ | 133/250 [06:01<04:27, 2.28s/it]
Training 1/1 epoch (loss 2.9361): 54%|ββββββ | 134/250 [06:01<04:26, 2.30s/it]
Training 1/1 epoch (loss 2.6049): 54%|ββββββ | 134/250 [06:04<04:26, 2.30s/it]
Training 1/1 epoch (loss 2.6049): 54%|ββββββ | 135/250 [06:04<04:37, 2.41s/it]
Training 1/1 epoch (loss 2.9501): 54%|ββββββ | 135/250 [06:08<04:37, 2.41s/it]
Training 1/1 epoch (loss 2.9501): 54%|ββββββ | 136/250 [06:08<05:14, 2.76s/it]
Training 1/1 epoch (loss 2.8105): 54%|ββββββ | 136/250 [06:11<05:14, 2.76s/it]
Training 1/1 epoch (loss 2.8105): 55%|ββββββ | 137/250 [06:11<05:41, 3.02s/it]
Training 1/1 epoch (loss 2.9637): 55%|ββββββ | 137/250 [06:16<05:41, 3.02s/it]
Training 1/1 epoch (loss 2.9637): 55%|ββββββ | 138/250 [06:16<06:24, 3.43s/it]
Training 1/1 epoch (loss 2.6029): 55%|ββββββ | 138/250 [06:20<06:24, 3.43s/it]
Training 1/1 epoch (loss 2.6029): 56%|ββββββ | 139/250 [06:20<07:03, 3.81s/it]
Training 1/1 epoch (loss 3.0995): 56%|ββββββ | 139/250 [06:22<07:03, 3.81s/it]
Training 1/1 epoch (loss 3.0995): 56%|ββββββ | 140/250 [06:22<05:32, 3.02s/it]
Training 1/1 epoch (loss 2.8976): 56%|ββββββ | 140/250 [06:23<05:32, 3.02s/it]
Training 1/1 epoch (loss 2.8976): 56%|ββββββ | 141/250 [06:23<04:47, 2.63s/it]
Training 1/1 epoch (loss 3.0176): 56%|ββββββ | 141/250 [06:27<04:47, 2.63s/it]
Training 1/1 epoch (loss 3.0176): 57%|ββββββ | 142/250 [06:27<05:30, 3.06s/it]
Training 1/1 epoch (loss 2.9054): 57%|ββββββ | 142/250 [06:30<05:30, 3.06s/it]
Training 1/1 epoch (loss 2.9054): 57%|ββββββ | 143/250 [06:30<05:25, 3.04s/it]
Training 1/1 epoch (loss 2.7631): 57%|ββββββ | 143/250 [06:33<05:25, 3.04s/it]
Training 1/1 epoch (loss 2.7631): 58%|ββββββ | 144/250 [06:33<05:11, 2.94s/it]
Training 1/1 epoch (loss 2.8760): 58%|ββββββ | 144/250 [06:36<05:11, 2.94s/it]
Training 1/1 epoch (loss 2.8760): 58%|ββββββ | 145/250 [06:36<04:58, 2.84s/it]
Training 1/1 epoch (loss 2.7830): 58%|ββββββ | 145/250 [06:40<04:58, 2.84s/it]
Training 1/1 epoch (loss 2.7830): 58%|ββββββ | 146/250 [06:40<05:46, 3.33s/it]
Training 1/1 epoch (loss 2.9812): 58%|ββββββ | 146/250 [06:42<05:46, 3.33s/it]
Training 1/1 epoch (loss 2.9812): 59%|ββββββ | 147/250 [06:42<05:13, 3.04s/it]
Training 1/1 epoch (loss 2.8327): 59%|ββββββ | 147/250 [06:46<05:13, 3.04s/it]
Training 1/1 epoch (loss 2.8327): 59%|ββββββ | 148/250 [06:46<05:19, 3.14s/it]
Training 1/1 epoch (loss 2.9768): 59%|ββββββ | 148/250 [06:48<05:19, 3.14s/it]
Training 1/1 epoch (loss 2.9768): 60%|ββββββ | 149/250 [06:48<04:49, 2.87s/it]
Training 1/1 epoch (loss 2.8298): 60%|ββββββ | 149/250 [06:53<04:49, 2.87s/it]
Training 1/1 epoch (loss 2.8298): 60%|ββββββ | 150/250 [06:53<05:58, 3.59s/it]
Training 1/1 epoch (loss 2.9207): 60%|ββββββ | 150/250 [06:56<05:58, 3.59s/it]
Training 1/1 epoch (loss 2.9207): 60%|ββββββ | 151/250 [06:56<05:18, 3.22s/it]
Training 1/1 epoch (loss 2.9840): 60%|ββββββ | 151/250 [06:59<05:18, 3.22s/it]
Training 1/1 epoch (loss 2.9840): 61%|ββββββ | 152/250 [06:59<05:14, 3.21s/it]
Training 1/1 epoch (loss 2.8436): 61%|ββββββ | 152/250 [07:02<05:14, 3.21s/it]
Training 1/1 epoch (loss 2.8436): 61%|ββββββ | 153/250 [07:02<04:56, 3.06s/it]
Training 1/1 epoch (loss 2.8238): 61%|ββββββ | 153/250 [07:05<04:56, 3.06s/it]
Training 1/1 epoch (loss 2.8238): 62%|βββββββ | 154/250 [07:05<04:57, 3.10s/it]
Training 1/1 epoch (loss 3.1034): 62%|βββββββ | 154/250 [07:08<04:57, 3.10s/it]
Training 1/1 epoch (loss 3.1034): 62%|βββββββ | 155/250 [07:08<04:44, 2.99s/it]
Training 1/1 epoch (loss 2.7994): 62%|βββββββ | 155/250 [07:09<04:44, 2.99s/it]
Training 1/1 epoch (loss 2.7994): 62%|βββββββ | 156/250 [07:09<04:06, 2.62s/it]
Training 1/1 epoch (loss 2.8647): 62%|βββββββ | 156/250 [07:12<04:06, 2.62s/it]
Training 1/1 epoch (loss 2.8647): 63%|βββββββ | 157/250 [07:12<04:01, 2.60s/it]
Training 1/1 epoch (loss 2.8498): 63%|βββββββ | 157/250 [07:14<04:01, 2.60s/it]
Training 1/1 epoch (loss 2.8498): 63%|βββββββ | 158/250 [07:14<03:51, 2.52s/it]
Training 1/1 epoch (loss 3.0246): 63%|βββββββ | 158/250 [07:15<03:51, 2.52s/it]
Training 1/1 epoch (loss 3.0246): 64%|βββββββ | 159/250 [07:15<02:59, 1.97s/it]
Training 1/1 epoch (loss 2.5181): 64%|βββββββ | 159/250 [07:18<02:59, 1.97s/it]
Training 1/1 epoch (loss 2.5181): 64%|βββββββ | 160/250 [07:18<03:31, 2.35s/it]
Training 1/1 epoch (loss 3.1598): 64%|βββββββ | 160/250 [07:20<03:31, 2.35s/it]
Training 1/1 epoch (loss 3.1598): 64%|βββββββ | 161/250 [07:20<03:22, 2.28s/it]
Training 1/1 epoch (loss 2.6931): 64%|βββββββ | 161/250 [07:23<03:22, 2.28s/it]
Training 1/1 epoch (loss 2.6931): 65%|βββββββ | 162/250 [07:23<03:35, 2.45s/it]
Training 1/1 epoch (loss 2.9550): 65%|βββββββ | 162/250 [07:27<03:35, 2.45s/it]
Training 1/1 epoch (loss 2.9550): 65%|βββββββ | 163/250 [07:27<04:12, 2.90s/it]
Training 1/1 epoch (loss 2.8919): 65%|βββββββ | 163/250 [07:30<04:12, 2.90s/it]
Training 1/1 epoch (loss 2.8919): 66%|βββββββ | 164/250 [07:30<04:15, 2.97s/it]
Training 1/1 epoch (loss 2.6619): 66%|βββββββ | 164/250 [07:33<04:15, 2.97s/it]
Training 1/1 epoch (loss 2.6619): 66%|βββββββ | 165/250 [07:33<04:10, 2.94s/it]
Training 1/1 epoch (loss 2.5838): 66%|βββββββ | 165/250 [07:37<04:10, 2.94s/it]
Training 1/1 epoch (loss 2.5838): 66%|βββββββ | 166/250 [07:37<04:29, 3.21s/it]
Training 1/1 epoch (loss 2.9172): 66%|βββββββ | 166/250 [07:40<04:29, 3.21s/it]
Training 1/1 epoch (loss 2.9172): 67%|βββββββ | 167/250 [07:40<04:27, 3.22s/it]
Training 1/1 epoch (loss 2.8277): 67%|βββββββ | 167/250 [07:43<04:27, 3.22s/it]
Training 1/1 epoch (loss 2.8277): 67%|βββββββ | 168/250 [07:43<04:16, 3.12s/it]
Training 1/1 epoch (loss 2.9423): 67%|βββββββ | 168/250 [07:46<04:16, 3.12s/it]
Training 1/1 epoch (loss 2.9423): 68%|βββββββ | 169/250 [07:46<04:10, 3.10s/it]
Training 1/1 epoch (loss 2.7279): 68%|βββββββ | 169/250 [07:48<04:10, 3.10s/it]
Training 1/1 epoch (loss 2.7279): 68%|βββββββ | 170/250 [07:48<03:33, 2.67s/it]
Training 1/1 epoch (loss 3.0384): 68%|βββββββ | 170/250 [07:51<03:33, 2.67s/it]
Training 1/1 epoch (loss 3.0384): 68%|βββββββ | 171/250 [07:51<03:34, 2.72s/it]
Training 1/1 epoch (loss 2.7208): 68%|βββββββ | 171/250 [07:52<03:34, 2.72s/it]
Training 1/1 epoch (loss 2.7208): 69%|βββββββ | 172/250 [07:52<03:00, 2.31s/it]
Training 1/1 epoch (loss 2.4613): 69%|βββββββ | 172/250 [07:53<03:00, 2.31s/it]
Training 1/1 epoch (loss 2.4613): 69%|βββββββ | 173/250 [07:53<02:32, 1.98s/it]
Training 1/1 epoch (loss 2.9910): 69%|βββββββ | 173/250 [07:56<02:32, 1.98s/it]
Training 1/1 epoch (loss 2.9910): 70%|βββββββ | 174/250 [07:56<02:44, 2.16s/it]
Training 1/1 epoch (loss 2.9104): 70%|βββββββ | 174/250 [07:57<02:44, 2.16s/it]
Training 1/1 epoch (loss 2.9104): 70%|βββββββ | 175/250 [07:57<02:17, 1.83s/it]
Training 1/1 epoch (loss 2.7667): 70%|βββββββ | 175/250 [07:59<02:17, 1.83s/it]
Training 1/1 epoch (loss 2.7667): 70%|βββββββ | 176/250 [07:59<02:34, 2.08s/it]
Training 1/1 epoch (loss 2.8035): 70%|βββββββ | 176/250 [08:02<02:34, 2.08s/it]
Training 1/1 epoch (loss 2.8035): 71%|βββββββ | 177/250 [08:02<02:36, 2.14s/it]
Training 1/1 epoch (loss 2.6662): 71%|βββββββ | 177/250 [08:05<02:36, 2.14s/it]
Training 1/1 epoch (loss 2.6662): 71%|βββββββ | 178/250 [08:05<02:57, 2.46s/it]
Training 1/1 epoch (loss 2.9735): 71%|βββββββ | 178/250 [08:07<02:57, 2.46s/it]
Training 1/1 epoch (loss 2.9735): 72%|ββββββββ | 179/250 [08:07<02:52, 2.43s/it]
Training 1/1 epoch (loss 2.9929): 72%|ββββββββ | 179/250 [08:10<02:52, 2.43s/it]
Training 1/1 epoch (loss 2.9929): 72%|ββββββββ | 180/250 [08:10<03:01, 2.60s/it]
Training 1/1 epoch (loss 2.6220): 72%|ββββββββ | 180/250 [08:14<03:01, 2.60s/it]
Training 1/1 epoch (loss 2.6220): 72%|ββββββββ | 181/250 [08:14<03:20, 2.90s/it]
Training 1/1 epoch (loss 2.8473): 72%|ββββββββ | 181/250 [08:17<03:20, 2.90s/it]
Training 1/1 epoch (loss 2.8473): 73%|ββββββββ | 182/250 [08:17<03:20, 2.96s/it]
Training 1/1 epoch (loss 2.7560): 73%|ββββββββ | 182/250 [08:20<03:20, 2.96s/it]
Training 1/1 epoch (loss 2.7560): 73%|ββββββββ | 183/250 [08:20<03:21, 3.01s/it]
Training 1/1 epoch (loss 2.7678): 73%|ββββββββ | 183/250 [08:24<03:21, 3.01s/it]
Training 1/1 epoch (loss 2.7678): 74%|ββββββββ | 184/250 [08:24<03:32, 3.22s/it]
Training 1/1 epoch (loss 2.7050): 74%|ββββββββ | 184/250 [08:27<03:32, 3.22s/it]
Training 1/1 epoch (loss 2.7050): 74%|ββββββββ | 185/250 [08:27<03:27, 3.19s/it]
Training 1/1 epoch (loss 2.8924): 74%|ββββββββ | 185/250 [08:29<03:27, 3.19s/it]
Training 1/1 epoch (loss 2.8924): 74%|ββββββββ | 186/250 [08:29<03:10, 2.97s/it]
Training 1/1 epoch (loss 2.6790): 74%|ββββββββ | 186/250 [08:34<03:10, 2.97s/it]
Training 1/1 epoch (loss 2.6790): 75%|ββββββββ | 187/250 [08:34<03:32, 3.37s/it]
Training 1/1 epoch (loss 2.9919): 75%|ββββββββ | 187/250 [08:36<03:32, 3.37s/it]
Training 1/1 epoch (loss 2.9919): 75%|ββββββββ | 188/250 [08:36<03:17, 3.19s/it]
Training 1/1 epoch (loss 2.6792): 75%|ββββββββ | 188/250 [08:40<03:17, 3.19s/it]
Training 1/1 epoch (loss 2.6792): 76%|ββββββββ | 189/250 [08:40<03:20, 3.28s/it]
Training 1/1 epoch (loss 2.8049): 76%|ββββββββ | 189/250 [08:42<03:20, 3.28s/it]
Training 1/1 epoch (loss 2.8049): 76%|ββββββββ | 190/250 [08:42<02:57, 2.96s/it]
Training 1/1 epoch (loss 2.8283): 76%|ββββββββ | 190/250 [08:45<02:57, 2.96s/it]
Training 1/1 epoch (loss 2.8283): 76%|ββββββββ | 191/250 [08:45<02:57, 3.00s/it]
Training 1/1 epoch (loss 2.7838): 76%|ββββββββ | 191/250 [08:48<02:57, 3.00s/it]
Training 1/1 epoch (loss 2.7838): 77%|ββββββββ | 192/250 [08:48<02:51, 2.96s/it]
Training 1/1 epoch (loss 2.9824): 77%|ββββββββ | 192/250 [08:50<02:51, 2.96s/it]
Training 1/1 epoch (loss 2.9824): 77%|ββββββββ | 193/250 [08:50<02:37, 2.76s/it]
Training 1/1 epoch (loss 2.7247): 77%|ββββββββ | 193/250 [08:53<02:37, 2.76s/it]
Training 1/1 epoch (loss 2.7247): 78%|ββββββββ | 194/250 [08:53<02:28, 2.66s/it]
Training 1/1 epoch (loss 2.8542): 78%|ββββββββ | 194/250 [08:57<02:28, 2.66s/it]
Training 1/1 epoch (loss 2.8542): 78%|ββββββββ | 195/250 [08:57<02:46, 3.04s/it]
Training 1/1 epoch (loss 2.8117): 78%|ββββββββ | 195/250 [08:59<02:46, 3.04s/it]
Training 1/1 epoch (loss 2.8117): 78%|ββββββββ | 196/250 [08:59<02:39, 2.95s/it]
Training 1/1 epoch (loss 2.7989): 78%|ββββββββ | 196/250 [09:01<02:39, 2.95s/it]
Training 1/1 epoch (loss 2.7989): 79%|ββββββββ | 197/250 [09:01<02:17, 2.59s/it]
Training 1/1 epoch (loss 2.7705): 79%|ββββββββ | 197/250 [09:04<02:17, 2.59s/it]
Training 1/1 epoch (loss 2.7705): 79%|ββββββββ | 198/250 [09:04<02:11, 2.53s/it]
Training 1/1 epoch (loss 3.1158): 79%|ββββββββ | 198/250 [09:06<02:11, 2.53s/it]
Training 1/1 epoch (loss 3.1158): 80%|ββββββββ | 199/250 [09:06<02:10, 2.56s/it]
Training 1/1 epoch (loss 2.7410): 80%|ββββββββ | 199/250 [09:08<02:10, 2.56s/it]
Training 1/1 epoch (loss 2.7410): 80%|ββββββββ | 200/250 [09:08<02:00, 2.42s/it]
Training 1/1 epoch (loss 2.8119): 80%|ββββββββ | 200/250 [09:12<02:00, 2.42s/it]
Training 1/1 epoch (loss 2.8119): 80%|ββββββββ | 201/250 [09:12<02:16, 2.78s/it]
Training 1/1 epoch (loss 2.9302): 80%|ββββββββ | 201/250 [09:14<02:16, 2.78s/it]
Training 1/1 epoch (loss 2.9302): 81%|ββββββββ | 202/250 [09:14<02:04, 2.59s/it]
Training 1/1 epoch (loss 2.7380): 81%|ββββββββ | 202/250 [09:17<02:04, 2.59s/it]
Training 1/1 epoch (loss 2.7380): 81%|ββββββββ | 203/250 [09:17<02:00, 2.56s/it]
Training 1/1 epoch (loss 2.8346): 81%|ββββββββ | 203/250 [09:20<02:00, 2.56s/it]
Training 1/1 epoch (loss 2.8346): 82%|βββββββββ | 204/250 [09:20<02:05, 2.72s/it]
Training 1/1 epoch (loss 2.8632): 82%|βββββββββ | 204/250 [09:23<02:05, 2.72s/it]
Training 1/1 epoch (loss 2.8632): 82%|βββββββββ | 205/250 [09:23<02:13, 2.97s/it]
Training 1/1 epoch (loss 2.5788): 82%|βββββββββ | 205/250 [09:26<02:13, 2.97s/it]
Training 1/1 epoch (loss 2.5788): 82%|βββββββββ | 206/250 [09:26<02:11, 2.99s/it]
Training 1/1 epoch (loss 2.6841): 82%|βββββββββ | 206/250 [09:30<02:11, 2.99s/it]
Training 1/1 epoch (loss 2.6841): 83%|βββββββββ | 207/250 [09:30<02:23, 3.35s/it]
Training 1/1 epoch (loss 2.8335): 83%|βββββββββ | 207/250 [09:34<02:23, 3.35s/it]
Training 1/1 epoch (loss 2.8335): 83%|βββββββββ | 208/250 [09:34<02:24, 3.44s/it]
Training 1/1 epoch (loss 2.9406): 83%|βββββββββ | 208/250 [09:37<02:24, 3.44s/it]
Training 1/1 epoch (loss 2.9406): 84%|βββββββββ | 209/250 [09:37<02:12, 3.23s/it]
Training 1/1 epoch (loss 2.6030): 84%|βββββββββ | 209/250 [09:39<02:12, 3.23s/it]
Training 1/1 epoch (loss 2.6030): 84%|βββββββββ | 210/250 [09:39<02:01, 3.05s/it]
Training 1/1 epoch (loss 2.6823): 84%|βββββββββ | 210/250 [09:43<02:01, 3.05s/it]
Training 1/1 epoch (loss 2.6823): 84%|βββββββββ | 211/250 [09:43<01:59, 3.06s/it]
Training 1/1 epoch (loss 3.0763): 84%|βββββββββ | 211/250 [09:46<01:59, 3.06s/it]
Training 1/1 epoch (loss 3.0763): 85%|βββββββββ | 212/250 [09:46<02:02, 3.22s/it]
Training 1/1 epoch (loss 3.0598): 85%|βββββββββ | 212/250 [09:48<02:02, 3.22s/it]
Training 1/1 epoch (loss 3.0598): 85%|βββββββββ | 213/250 [09:48<01:42, 2.78s/it]
Training 1/1 epoch (loss 2.9553): 85%|βββββββββ | 213/250 [09:50<01:42, 2.78s/it]
Training 1/1 epoch (loss 2.9553): 86%|βββββββββ | 214/250 [09:50<01:35, 2.66s/it]
Training 1/1 epoch (loss 2.7645): 86%|βββββββββ | 214/250 [09:54<01:35, 2.66s/it]
Training 1/1 epoch (loss 2.7645): 86%|βββββββββ | 215/250 [09:54<01:39, 2.84s/it]
Training 1/1 epoch (loss 2.7093): 86%|βββββββββ | 215/250 [09:56<01:39, 2.84s/it]
Training 1/1 epoch (loss 2.7093): 86%|βββββββββ | 216/250 [09:56<01:35, 2.79s/it]
Training 1/1 epoch (loss 2.8465): 86%|βββββββββ | 216/250 [09:59<01:35, 2.79s/it]
Training 1/1 epoch (loss 2.8465): 87%|βββββββββ | 217/250 [09:59<01:27, 2.65s/it]
Training 1/1 epoch (loss 2.8174): 87%|βββββββββ | 217/250 [10:01<01:27, 2.65s/it]
Training 1/1 epoch (loss 2.8174): 87%|βββββββββ | 218/250 [10:01<01:22, 2.58s/it]
Training 1/1 epoch (loss 2.6050): 87%|βββββββββ | 218/250 [10:05<01:22, 2.58s/it]
Training 1/1 epoch (loss 2.6050): 88%|βββββββββ | 219/250 [10:05<01:30, 2.92s/it]
Training 1/1 epoch (loss 2.6841): 88%|βββββββββ | 219/250 [10:07<01:30, 2.92s/it]
Training 1/1 epoch (loss 2.6841): 88%|βββββββββ | 220/250 [10:07<01:25, 2.84s/it]
Training 1/1 epoch (loss 2.9020): 88%|βββββββββ | 220/250 [10:11<01:25, 2.84s/it]
Training 1/1 epoch (loss 2.9020): 88%|βββββββββ | 221/250 [10:11<01:29, 3.10s/it]
Training 1/1 epoch (loss 2.9884): 88%|βββββββββ | 221/250 [10:15<01:29, 3.10s/it]
Training 1/1 epoch (loss 2.9884): 89%|βββββββββ | 222/250 [10:15<01:30, 3.23s/it]
Training 1/1 epoch (loss 2.7729): 89%|βββββββββ | 222/250 [10:16<01:30, 3.23s/it]
Training 1/1 epoch (loss 2.7729): 89%|βββββββββ | 223/250 [10:16<01:15, 2.81s/it]
Training 1/1 epoch (loss 2.5094): 89%|βββββββββ | 223/250 [10:19<01:15, 2.81s/it]
Training 1/1 epoch (loss 2.5094): 90%|βββββββββ | 224/250 [10:19<01:11, 2.77s/it]
Training 1/1 epoch (loss 2.8668): 90%|βββββββββ | 224/250 [10:23<01:11, 2.77s/it]
Training 1/1 epoch (loss 2.8668): 90%|βββββββββ | 225/250 [10:23<01:15, 3.00s/it]
Training 1/1 epoch (loss 3.1567): 90%|βββββββββ | 225/250 [10:24<01:15, 3.00s/it]
Training 1/1 epoch (loss 3.1567): 90%|βββββββββ | 226/250 [10:24<00:57, 2.40s/it]
Training 1/1 epoch (loss 2.7839): 90%|βββββββββ | 226/250 [10:26<00:57, 2.40s/it]
Training 1/1 epoch (loss 2.7839): 91%|βββββββββ | 227/250 [10:26<00:55, 2.42s/it]
Training 1/1 epoch (loss 2.9074): 91%|βββββββββ | 227/250 [10:29<00:55, 2.42s/it]
Training 1/1 epoch (loss 2.9074): 91%|βββββββββ | 228/250 [10:29<00:53, 2.44s/it]
Training 1/1 epoch (loss 2.7418): 91%|βββββββββ | 228/250 [10:30<00:53, 2.44s/it]
Training 1/1 epoch (loss 2.7418): 92%|ββββββββββ| 229/250 [10:30<00:45, 2.18s/it]
Training 1/1 epoch (loss 2.8436): 92%|ββββββββββ| 229/250 [10:32<00:45, 2.18s/it]
Training 1/1 epoch (loss 2.8436): 92%|ββββββββββ| 230/250 [10:32<00:39, 1.96s/it]
Training 1/1 epoch (loss 2.7290): 92%|ββββββββββ| 230/250 [10:36<00:39, 1.96s/it]
Training 1/1 epoch (loss 2.7290): 92%|ββββββββββ| 231/250 [10:36<00:48, 2.57s/it]
Training 1/1 epoch (loss 2.9458): 92%|ββββββββββ| 231/250 [10:38<00:48, 2.57s/it]
Training 1/1 epoch (loss 2.9458): 93%|ββββββββββ| 232/250 [10:38<00:46, 2.56s/it]
Training 1/1 epoch (loss 2.7735): 93%|ββββββββββ| 232/250 [10:40<00:46, 2.56s/it]
Training 1/1 epoch (loss 2.7735): 93%|ββββββββββ| 233/250 [10:40<00:39, 2.34s/it]
Training 1/1 epoch (loss 2.7832): 93%|ββββββββββ| 233/250 [10:44<00:39, 2.34s/it]
Training 1/1 epoch (loss 2.7832): 94%|ββββββββββ| 234/250 [10:44<00:45, 2.82s/it]
Training 1/1 epoch (loss 3.0169): 94%|ββββββββββ| 234/250 [10:47<00:45, 2.82s/it]
Training 1/1 epoch (loss 3.0169): 94%|ββββββββββ| 235/250 [10:47<00:41, 2.77s/it]
Training 1/1 epoch (loss 3.0419): 94%|ββββββββββ| 235/250 [10:50<00:41, 2.77s/it]
Training 1/1 epoch (loss 3.0419): 94%|ββββββββββ| 236/250 [10:50<00:39, 2.83s/it]
Training 1/1 epoch (loss 2.7279): 94%|ββββββββββ| 236/250 [10:51<00:39, 2.83s/it]
Training 1/1 epoch (loss 2.7279): 95%|ββββββββββ| 237/250 [10:51<00:31, 2.42s/it]
Training 1/1 epoch (loss 3.0275): 95%|ββββββββββ| 237/250 [10:54<00:31, 2.42s/it]
Training 1/1 epoch (loss 3.0275): 95%|ββββββββββ| 238/250 [10:54<00:31, 2.66s/it]
Training 1/1 epoch (loss 2.6824): 95%|ββββββββββ| 238/250 [10:56<00:31, 2.66s/it]
Training 1/1 epoch (loss 2.6824): 96%|ββββββββββ| 239/250 [10:56<00:27, 2.54s/it]
Training 1/1 epoch (loss 3.1152): 96%|ββββββββββ| 239/250 [10:59<00:27, 2.54s/it]
Training 1/1 epoch (loss 3.1152): 96%|ββββββββββ| 240/250 [10:59<00:25, 2.58s/it]
Training 1/1 epoch (loss 2.9426): 96%|ββββββββββ| 240/250 [11:01<00:25, 2.58s/it]
Training 1/1 epoch (loss 2.9426): 96%|ββββββββββ| 241/250 [11:01<00:22, 2.49s/it]
Training 1/1 epoch (loss 2.9199): 96%|ββββββββββ| 241/250 [11:04<00:22, 2.49s/it]
Training 1/1 epoch (loss 2.9199): 97%|ββββββββββ| 242/250 [11:04<00:19, 2.41s/it]
Training 1/1 epoch (loss 3.0144): 97%|ββββββββββ| 242/250 [11:05<00:19, 2.41s/it]
Training 1/1 epoch (loss 3.0144): 97%|ββββββββββ| 243/250 [11:05<00:14, 2.05s/it]
Training 1/1 epoch (loss 2.8875): 97%|ββββββββββ| 243/250 [11:07<00:14, 2.05s/it]
Training 1/1 epoch (loss 2.8875): 98%|ββββββββββ| 244/250 [11:07<00:11, 1.99s/it]
Training 1/1 epoch (loss 2.6427): 98%|ββββββββββ| 244/250 [11:08<00:11, 1.99s/it]
Training 1/1 epoch (loss 2.6427): 98%|ββββββββββ| 245/250 [11:08<00:08, 1.79s/it]
Training 1/1 epoch (loss 2.8965): 98%|ββββββββββ| 245/250 [11:12<00:08, 1.79s/it]
Training 1/1 epoch (loss 2.8965): 98%|ββββββββββ| 246/250 [11:12<00:09, 2.34s/it]
Training 1/1 epoch (loss 2.7089): 98%|ββββββββββ| 246/250 [11:14<00:09, 2.34s/it]
Training 1/1 epoch (loss 2.7089): 99%|ββββββββββ| 247/250 [11:14<00:07, 2.42s/it]
Training 1/1 epoch (loss 2.7895): 99%|ββββββββββ| 247/250 [11:17<00:07, 2.42s/it]
Training 1/1 epoch (loss 2.7895): 99%|ββββββββββ| 248/250 [11:17<00:05, 2.66s/it]
Training 1/1 epoch (loss 2.7471): 99%|ββββββββββ| 248/250 [11:20<00:05, 2.66s/it]
Training 1/1 epoch (loss 2.7471): 100%|ββββββββββ| 249/250 [11:20<00:02, 2.50s/it]
Training 1/1 epoch (loss 2.8230): 100%|ββββββββββ| 249/250 [11:22<00:02, 2.50s/it]
Training 1/1 epoch (loss 2.8230): 100%|ββββββββββ| 250/250 [11:22<00:00, 2.60s/it]
Training 1/1 epoch (loss 2.8230): 100%|ββββββββββ| 250/250 [11:22<00:00, 2.73s/it] |
| chat template saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-2000/chat_template.jinja |
| tokenizer config file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-2000/tokenizer_config.json |
| Special tokens file saved in /aifs4su/hansirui_1st/jiayi/setting3-imdb/Qwen1.5-4B/Qwen1.5-4B-s3-Q1-2000/special_tokens_map.json |
| wandb: ERROR Problem finishing run |
| Exception ignored in atexit callback: <bound method rank_zero_only.<locals>.wrapper of <safe_rlhf.logger.Logger object at 0x1550cc160210>> |
| Traceback (most recent call last): |
| File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/utils.py", line 212, in wrapper |
| return func(*args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^ |
| File "/home/hansirui_1st/jiayi/resist/setting3/safe_rlhf/logger.py", line 183, in close |
| self.wandb.finish() |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 503, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 451, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2309, in finish |
| return self._finish(exit_code) |
| ^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 406, in wrapper |
| return func(self, *args, **kwargs) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2337, in _finish |
| self._atexit_cleanup(exit_code=exit_code) |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2550, in _atexit_cleanup |
| self._on_finish() |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/wandb_run.py", line 2806, in _on_finish |
| wait_with_progress( |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 24, in wait_with_progress |
| return wait_all_with_progress( |
| ^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/mailbox/wait_with_progress.py", line 87, in wait_all_with_progress |
| return asyncio_compat.run(progress_loop_with_timeout) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/site-packages/wandb/sdk/lib/asyncio_compat.py", line 27, in run |
| future = executor.submit(runner.run, fn) |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
| File "/aifs4su/hansirui_1st/miniconda3/envs/jy-resist/lib/python3.11/concurrent/futures/thread.py", line 169, in submit |
| raise RuntimeError( |
| RuntimeError: cannot schedule new futures after interpreter shutdown |
|
|